root / tmp / org.txm.core / src / java / org / txm / scripts / importer / MileStoneProjection.groovy @ 1000
History | View | Annotate | Download (3.3 kB)
1 | 1000 | mdecorde | package org.txm.scripts.importer
|
---|---|---|---|
2 | 881 | mdecorde | |
3 | 881 | mdecorde | import java.io.File; |
4 | 881 | mdecorde | import javax.xml.stream.XMLStreamConstants; |
5 | 881 | mdecorde | import javax.xml.stream.XMLOutputFactory; |
6 | 881 | mdecorde | import javax.xml.stream.XMLStreamWriter; |
7 | 881 | mdecorde | import java.net.URL; |
8 | 881 | mdecorde | import javax.xml.stream.*; |
9 | 881 | mdecorde | import org.txm.importer.filters.*; |
10 | 881 | mdecorde | import org.txm.importer.PersonalNamespaceContext; |
11 | 986 | mdecorde | import org.txm.importer.StaxIdentityParser |
12 | 881 | mdecorde | |
/**
 * Projects milestone information onto the word elements of an XML document.
 *
 * Every word element found after the start element receives three attributes,
 * whose names are built from the milestone tag name (for a milestone tag "lb":
 * "lbid", "lbstart" and "lbend"):
 * <ul>
 * <li>&lt;tag&gt;id: identifier of the last milestone seen before the word</li>
 * <li>&lt;tag&gt;start: number of words between that milestone and the word</li>
 * <li>&lt;tag&gt;end: number of words between the word and the next milestone</li>
 * </ul>
 *
 * The work is done in two passes over the input: a counting pass
 * ({@link #fetchMilestoneLengths()}, run by the constructor) that records how
 * many words each milestone segment contains, then the rewriting pass, in which
 * {@link #processStartElement()} is presumably invoked by the inherited
 * StaxIdentityParser machinery for each start element.
 *
 * @author mdecorde
 */
class MileStoneProjection extends StaxIdentityParser {

    /** local name of the word elements to annotate */
    String wordTag
    /** local name of the milestone elements */
    String mileStoneTag
    /** local name of the element after which words and milestones are taken into account */
    String startTag
    /** true once the start element has been seen in the current pass */
    boolean start = false

    /** number of words seen since the last milestone (or since the start element) */
    int mileStoneDistance = 0

    /** identifier written on words; keeps this initial value until a milestone is met */
    String mileStoneID = ""
    /** milestonesLength[i] = number of words in the i-th segment; segment 0 precedes the first milestone */
    def milestonesLength = []
    /** index of the current segment in milestonesLength */
    int milestonesCounter = 0
    /** not read or written anywhere in this class */
    boolean secondPass = false

    /** name of the attribute receiving the milestone identifier */
    String msIdAttributeName
    /** name of the attribute receiving the distance to the previous milestone */
    String msStartAttributeName
    /** name of the attribute receiving the distance to the next milestone */
    String msEndAttributeName

    /**
     * Builds the projection and immediately runs the counting pass on the input file.
     *
     * @param inputFile the XML file to read
     * @param startTag local name of the element that enables the projection (e.g. "body")
     * @param wordTag local name of the word elements (e.g. "w")
     * @param mileStoneTag local name of the milestone elements (e.g. "lb")
     */
    public MileStoneProjection(File inputFile, String startTag, String wordTag, String mileStoneTag) {
        super(inputFile)

        this.wordTag = wordTag
        this.mileStoneTag = mileStoneTag
        // identifier used for the words located before the first milestone
        mileStoneID = mileStoneTag + "_0"
        this.startTag = startTag
        this.start = false

        msIdAttributeName = mileStoneTag + "id"
        msStartAttributeName = mileStoneTag + "start"
        msEndAttributeName = mileStoneTag + "end"

        fetchMilestoneLengths()
        start = false // reset
    }

    /**
     * Counting pass: reads the whole input once and fills milestonesLength with
     * the number of word elements of each milestone segment.
     *
     * The pass state (start, mileStoneDistance, milestonesCounter) is reset
     * before and after the scan, so the rewriting pass starts counting from zero
     * and the method can safely be called more than once.
     */
    public void fetchMilestoneLengths() {
        // start from a clean state: a previous call must not leak into this one
        milestonesLength = []
        mileStoneDistance = 0
        milestonesCounter = 0
        start = false

        // inputurl is not declared in this class; presumably inherited from StaxIdentityParser
        def inputData = inputurl.openStream()
        try {
            def factory = XMLInputFactory.newInstance()
            // local reader, distinct from the 'parser' used by processStartElement()
            def parser = factory.createXMLStreamReader(inputData)
            try {
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                    if (event != XMLStreamConstants.START_ELEMENT) continue

                    // 'localname' is not declared here either; presumably the inherited field
                    localname = parser.getLocalName()
                    if (start && localname == mileStoneTag) {
                        // a milestone closes the current segment
                        milestonesLength << mileStoneDistance
                        mileStoneDistance = 0
                        milestonesCounter++
                    } else if (start && localname == wordTag) {
                        mileStoneDistance++
                    } else if (localname == startTag) {
                        start = true
                    }
                }
                // the last segment is closed by the end of the document
                milestonesLength[milestonesCounter] = mileStoneDistance
            } finally {
                parser.close()
            }
        } finally {
            // XMLStreamReader.close() does not close the underlying stream
            inputData.close()
        }

        // leave the state ready for the rewriting pass; without resetting the
        // distance, the words before the first milestone would be numbered from
        // the word count of the last segment
        mileStoneDistance = 0
        milestonesCounter = 0
        start = false
    }

    /**
     * Rewriting pass hook: after the parent has processed the start element,
     * updates the milestone state or adds the three milestone attributes to the
     * current word element.
     */
    public void processStartElement() {
        // presumably writes the start tag and its original attributes; the
        // writeAttribute() calls below then add to that same element
        super.processStartElement()

        if (start && localname == mileStoneTag) {
            mileStoneDistance = 0
            mileStoneID = parser.getAttributeValue(null, "id")
            if (mileStoneID == null) {
                // xml:id: the first argument is a namespace URI, not the "xml" prefix
                mileStoneID = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id")
            }
            if (mileStoneID == null) mileStoneID = "0"

            milestonesCounter++
        } else if (start && localname == wordTag) {
            // words remaining in the segment after this one
            int remaining = milestonesLength[milestonesCounter] - mileStoneDistance - 1
            writer.writeAttribute(msEndAttributeName, Integer.toString(remaining))
            writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
            writer.writeAttribute(msIdAttributeName, mileStoneID)

            mileStoneDistance++
        } else if (localname == startTag) {
            start = true
        }
    }

    /**
     * Manual test entry point.
     *
     * @param args optional: args[0] = input XML file, args[1] = output XML file;
     *             the developer's original hard-coded paths are used when absent
     */
    public static void main(String[] args) {
        String inputPath = "/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/tokenized/1538_MarotAdole.xml"
        String outputPath = "/home/mdecorde/TEMP/tmp.xml"
        if (args != null && args.length > 0) inputPath = args[0]
        if (args != null && args.length > 1) outputPath = args[1]

        File inputFile = new File(inputPath)
        File outputFile = new File(outputPath)

        MileStoneProjection msp = new MileStoneProjection(inputFile, "body", "w", "lb")
        println "Sucess: " + msp.process(outputFile)
    }
}