root / tmp / org.txm.core / src / java / org / txm / scripts / importer / MileStoneProjection.groovy @ 2675
History | View | Annotate | Download (3.5 kB)
1 |
package org.txm.scripts.importer
|
---|---|
2 |
|
3 |
import java.io.File; |
4 |
import javax.xml.stream.XMLStreamConstants; |
5 |
import javax.xml.stream.XMLOutputFactory; |
6 |
import javax.xml.stream.XMLStreamWriter; |
7 |
import java.net.URL; |
8 |
import javax.xml.stream.*; |
9 |
import org.txm.importer.filters.*; |
10 |
import org.txm.importer.PersonalNamespaceContext; |
11 |
import org.txm.importer.StaxIdentityParser |
12 |
|
13 |
/**
 * Projects milestone information onto the word elements of an XML file.
 *
 * For every word element found after the start element, three attributes are written
 * (their names are prefixed with the milestone tag name):
 * <ul>
 * <li><code>&lt;milestone&gt;start</code>: number of words between the previous milestone
 * (or the start element when no milestone has been met yet) and this word</li>
 * <li><code>&lt;milestone&gt;end</code>: number of words between this word and the next
 * milestone (or the end of the document)</li>
 * <li><code>&lt;milestone&gt;id</code>: identifier of the previous milestone</li>
 * </ul>
 *
 * The file is read twice: a first pass, done by the constructor, counts the words of each
 * milestone-delimited segment; the second pass, driven by the inherited process method,
 * writes the attributes.
 *
 * @author mdecorde
 *
 */
class MileStoneProjection extends StaxIdentityParser {
	/** local name of the word elements */
	String wordTag
	/** local name of the milestone elements */
	String mileStoneTag
	/** local name of the element after which words and milestones are taken into account */
	String startTag
	/** true once the start element has been met in the current pass */
	boolean start = false

	/** number of words met since the last milestone (or since the start element) */
	int mileStoneDistance = 0

	/** identifier written on the words of the current segment */
	String mileStoneID = ""
	/** number of words of each segment; segment 0 precedes the first milestone */
	def milestonesLength = []
	/** index of the current segment in milestonesLength */
	int milestonesCounter = 0
	/** not used by this class; kept so the property stays available to existing callers */
	boolean secondPass = false

	String msIdAttributeName
	String msStartAttributeName
	String msEndAttributeName

	/**
	 * Builds the projection and immediately runs the first pass over the input file.
	 *
	 * @param inputFile the XML file to read
	 * @param startTag local name of the element that enables the projection
	 * @param wordTag local name of the word elements
	 * @param mileStoneTag local name of the milestone elements
	 */
	public MileStoneProjection(File inputFile, String startTag, String wordTag, String mileStoneTag) {
		super(inputFile)

		this.startTag = startTag
		this.wordTag = wordTag
		this.mileStoneTag = mileStoneTag

		// identifier used for the words located before the first milestone
		mileStoneID = mileStoneTag+"_0"

		msIdAttributeName = mileStoneTag+"id"
		msStartAttributeName = mileStoneTag+"start"
		msEndAttributeName = mileStoneTag+"end"

		fetchMilestoneLengths()
	}

	/**
	 * First pass: reads the whole input and stores in milestonesLength the number of
	 * words of each segment delimited by the milestones.
	 *
	 * The counting is done with local variables and the state shared with the second pass
	 * (start, mileStoneDistance, milestonesCounter) is reset at the end, so the second pass
	 * always starts from a clean state and the method can safely be called more than once.
	 */
	public void fetchMilestoneLengths() {
		def lengths = []
		int distance = 0
		boolean started = false

		def inputData = inputurl.openStream()
		def reader = null
		try {
			reader = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

			for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
				if (event != XMLStreamConstants.START_ELEMENT) continue

				String name = reader.getLocalName()
				if (started && name.equals(mileStoneTag)) {
					// a milestone closes the current segment
					lengths << distance
					distance = 0
				} else if (started && name.equals(wordTag)) {
					distance++
				} else if (name.equals(startTag)) {
					started = true
				}
			}
		} finally {
			// release the reader and the stream even if the parsing failed
			try {
				if (reader != null) reader.close()
			} finally {
				inputData.close()
			}
		}

		// the last segment is closed by the end of the document
		lengths << distance
		milestonesLength = lengths

		// reset the state used by the second pass
		milestonesCounter = 0
		mileStoneDistance = 0
		start = false
		//println milestonesLength
	}

	/**
	 * Second pass: lets the parent class write the start tag, then updates the milestone
	 * state or adds the projection attributes to the current word element.
	 */
	public void processStartElement() {
		super.processStartElement()

		if (start && localname.equals(mileStoneTag)) {
			mileStoneDistance = 0

			mileStoneID = parser.getAttributeValue(null, "id")
			// xml:id must be looked up with the XML namespace URI, not with the "xml" prefix
			if (mileStoneID == null) mileStoneID = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id")
			// NOTE(review): all the milestones without identifier share the "0" value
			if (mileStoneID == null) mileStoneID = "0"

			milestonesCounter++
		} else if (start && localname.equals(wordTag)) {
			// println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance
			int remaining = milestonesLength[milestonesCounter] - mileStoneDistance - 1
			writer.writeAttribute(msEndAttributeName, Integer.toString(remaining))
			writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
			writer.writeAttribute(msIdAttributeName, mileStoneID)

			mileStoneDistance++
		} else if (localname.equals(startTag)) {
			start = true
		}
	}

	/**
	 * Manual test: projects the "lb" milestones onto the "w" elements of a test corpus file.
	 */
	public static void main(String[] args) {
		File inputFile = new File(System.getProperty("user.home"), "TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/test.xml")
		File outputFile = new File(System.getProperty("user.home"), "TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/result.xml")

		MileStoneProjection msp = new MileStoneProjection(inputFile, "text", "w", "lb")
		println "Sucess: "+msp.process(outputFile)
	}
}