Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / MileStoneProjection.groovy @ 1000

History | View | Annotate | Download (3.3 kB)

1 1000 mdecorde
package org.txm.scripts.importer
2 881 mdecorde
3 881 mdecorde
import java.io.File;
4 881 mdecorde
import javax.xml.stream.XMLStreamConstants;
5 881 mdecorde
import javax.xml.stream.XMLOutputFactory;
6 881 mdecorde
import javax.xml.stream.XMLStreamWriter;
7 881 mdecorde
import java.net.URL;
8 881 mdecorde
import javax.xml.stream.*;
9 881 mdecorde
import org.txm.importer.filters.*;
10 881 mdecorde
import org.txm.importer.PersonalNamespaceContext;
11 986 mdecorde
import org.txm.importer.StaxIdentityParser
12 881 mdecorde
13 881 mdecorde
/**
 * Projects milestone information onto word elements.
 *
 * For every word element found after the start element, three attributes are written
 * (their names are prefixed with the milestone tag name):
 * <ul>
 * <li>{@code <milestone>start}: number of words seen since the previous milestone</li>
 * <li>{@code <milestone>end}: number of words remaining before the next milestone
 * (or before the end of the document for the last segment)</li>
 * <li>{@code <milestone>id}: identifier of the previous milestone</li>
 * </ul>
 *
 * The file is read twice: a first pass (see {@link #fetchMilestoneLengths()}) counts the words
 * of each segment, the second pass (see {@link #processStartElement()}) writes the attributes.
 *
 * @author mdecorde
 */
class MileStoneProjection extends StaxIdentityParser {
        /** local name of the word elements */
        String wordTag
        /** local name of the milestone elements */
        String mileStoneTag
        /** local name of the element after which words and milestones are taken into account */
        String startTag
        /** true once the start element has been seen in the current pass */
        boolean start = false

        /** number of words seen since the last milestone in the current pass */
        int mileStoneDistance = 0

        /** identifier of the last milestone seen during the second pass */
        String mileStoneID = ""
        /** number of words of each segment; index 0 is the segment preceding the first milestone */
        def milestonesLength = []
        /** index in milestonesLength of the segment currently being read */
        int milestonesCounter = 0
        /** not used in this class */
        boolean secondPass = false

        String msIdAttributeName
        String msStartAttributeName
        String msEndAttributeName

        /**
         * Creates the parser and immediately runs the first pass over the input file
         * to compute the length of every milestone segment.
         *
         * @param inputFile the XML file to read
         * @param startTag local name of the element that enables the projection
         * @param wordTag local name of the word elements
         * @param mileStoneTag local name of the milestone elements
         */
        public MileStoneProjection(File inputFile, String startTag, String wordTag, String mileStoneTag) {
                super(inputFile)

                this.wordTag = wordTag
                this.mileStoneTag = mileStoneTag
                // identifier used for the words located before the first milestone
                mileStoneID = mileStoneTag+"_0"
                this.startTag = startTag
                this.start = false;

                msIdAttributeName = mileStoneTag+"id";
                msStartAttributeName = mileStoneTag+"start";
                msEndAttributeName = mileStoneTag+"end";

                fetchMilestoneLengths();
                start = false // reset
        }

        /**
         * First pass: reads the input and stores the number of words of each segment
         * in milestonesLength.
         *
         * The pass state (start, mileStoneDistance, milestonesCounter) is reset before and after
         * the pass so that the second pass starts counting from zero, and so that the method
         * can be called more than once without accumulating stale lengths.
         */
        public void fetchMilestoneLengths() {
                milestonesLength.clear()
                milestonesCounter = 0
                mileStoneDistance = 0
                start = false

                def inputData = inputurl.openStream()
                def reader = null
                try {
                        def factory = XMLInputFactory.newInstance()
                        reader = factory.createXMLStreamReader(inputData)

                        for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue

                                // local variable: do not overwrite the 'localname' field used by the second pass
                                String name = reader.getLocalName()
                                if (start && name == mileStoneTag) {
                                        // a milestone closes the current segment
                                        milestonesLength << mileStoneDistance
                                        mileStoneDistance = 0
                                        milestonesCounter++
                                } else if (start && name == wordTag) {
                                        mileStoneDistance++
                                } else if (name == startTag) {
                                        start = true
                                }
                        }
                        // length of the last segment (after the last milestone)
                        milestonesLength[milestonesCounter] = mileStoneDistance
                } finally {
                        try {
                                // XMLStreamReader.close() does not close the underlying stream
                                if (reader != null) reader.close()
                        } finally {
                                inputData.close()
                                // the second pass must start with fresh counters
                                milestonesCounter = 0
                                mileStoneDistance = 0
                                start = false
                        }
                }
        }

        /**
         * Second pass: writes the element as the parent class does, then, for word elements,
         * adds the end, start and id attributes computed from the first pass.
         */
        public void processStartElement() {
                super.processStartElement();

                if (start && localname == mileStoneTag) {
                        mileStoneDistance = 0
                        mileStoneID = parser.getAttributeValue(null, "id")
                        // the first argument is a namespace URI, not a prefix
                        if (mileStoneID == null) mileStoneID = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id");
                        if (mileStoneID == null) mileStoneID = "0";

                        milestonesCounter++;
                } else if (start && localname == wordTag) {
                        // println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance
                        writer.writeAttribute(msEndAttributeName, Integer.toString((milestonesLength[milestonesCounter] - mileStoneDistance - 1)))
                        writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
                        writer.writeAttribute(msIdAttributeName, mileStoneID)

                        mileStoneDistance++
                } else if (localname == startTag) {
                        start = true
                }
        }

        /**
         * Manual test entry point using hard-coded developer paths.
         */
        public static void main(String[] args) {
                File inputFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/tokenized/1538_MarotAdole.xml")
                File outputFile = new File("/home/mdecorde/TEMP/tmp.xml")

                MileStoneProjection msp = new MileStoneProjection(inputFile, "body", "w", "lb");
                println "Sucess: "+msp.process(outputFile)
        }
}