Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / MileStoneProjection.groovy @ 2473

History | View | Annotate | Download (3.4 kB)

1 1000 mdecorde
package org.txm.scripts.importer
2 881 mdecorde
3 881 mdecorde
import java.io.File;
4 881 mdecorde
import javax.xml.stream.XMLStreamConstants;
5 881 mdecorde
import javax.xml.stream.XMLOutputFactory;
6 881 mdecorde
import javax.xml.stream.XMLStreamWriter;
7 881 mdecorde
import java.net.URL;
8 881 mdecorde
import javax.xml.stream.*;
9 881 mdecorde
import org.txm.importer.filters.*;
10 881 mdecorde
import org.txm.importer.PersonalNamespaceContext;
11 986 mdecorde
import org.txm.importer.StaxIdentityParser
12 881 mdecorde
13 881 mdecorde
/**
 * Projects milestone information onto word elements while copying an XML file.
 *
 * Each word element found after the start element receives three extra attributes,
 * whose names are built from the milestone tag name:
 * <ul>
 * <li><code>&lt;milestone&gt;end</code>: number of words remaining before the next milestone (or the end of the document)</li>
 * <li><code>&lt;milestone&gt;start</code>: number of words since the previous milestone (or since the start element)</li>
 * <li><code>&lt;milestone&gt;id</code>: identifier of the previous milestone</li>
 * </ul>
 *
 * The input is read twice: a first pass (done by the constructor) counts the words of
 * each milestone segment, the second pass (the inherited process method) writes the attributes.
 *
 * @author mdecorde
 *
 */
class MileStoneProjection extends StaxIdentityParser {
        /** local name of the word elements */
        String wordTag
        /** local name of the milestone elements */
        String mileStoneTag
        /** local name of the element after which words and milestones are taken into account */
        String startTag
        /** true once the start element has been seen in the current pass */
        boolean start = false

        /** number of words seen since the last milestone in the current pass */
        int mileStoneDistance = 0

        /** identifier written on words, taken from the last milestone seen */
        String mileStoneID = ""
        /** number of words of each segment; index 0 is the segment before the first milestone */
        def milestonesLength = []
        /** index of the current segment in milestonesLength */
        int milestonesCounter = 0
        boolean secondPass = false

        String msIdAttributeName
        String msStartAttributeName
        String msEndAttributeName

        /**
         * Builds the projection and immediately runs the counting pass on the input file.
         *
         * @param inputFile the XML file to read
         * @param startTag local name of the element that enables the projection
         * @param wordTag local name of the word elements
         * @param mileStoneTag local name of the milestone elements
         */
        public MileStoneProjection(File inputFile, String startTag, String wordTag, String mileStoneTag) {
                super(inputFile)

                this.wordTag = wordTag
                this.mileStoneTag = mileStoneTag
                this.startTag = startTag

                // identifier used for the words located before the first milestone
                mileStoneID = mileStoneTag+"_0"

                msIdAttributeName = mileStoneTag+"id"
                msStartAttributeName = mileStoneTag+"start"
                msEndAttributeName = mileStoneTag+"end"

                fetchMilestoneLengths()
        }

        /**
         * First pass: counts the words of each milestone segment and stores the
         * counts in milestonesLength.
         *
         * The pass state (start flag, word distance, segment counter) is reset before
         * and after the pass, so the writing pass starts from a clean state and the
         * method can be called again without accumulating stale lengths.
         */
        public void fetchMilestoneLengths() {
                milestonesLength = []
                mileStoneDistance = 0
                milestonesCounter = 0
                start = false

                def inputData = inputurl.openStream()
                def reader = null
                try {
                        reader = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

                        for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue

                                // local variable: the field used by the writing pass is left untouched
                                String name = reader.getLocalName()
                                if (start && name.equals(mileStoneTag)) {
                                        // a new milestone closes the current segment
                                        milestonesLength << mileStoneDistance
                                        mileStoneDistance = 0
                                        milestonesCounter++
                                } else if (start && name.equals(wordTag)) {
                                        mileStoneDistance++
                                } else if (name.equals(startTag)) {
                                        start = true
                                }
                        }

                        // the last segment is closed by the end of the document
                        milestonesLength << mileStoneDistance
                } finally {
                        // release the reader and the stream even if the parsing failed
                        try {
                                if (reader != null) reader.close()
                        } finally {
                                if (inputData != null) inputData.close()
                        }
                }

                // without this reset the words located before the first milestone would
                // start counting from the length of the last segment
                mileStoneDistance = 0
                milestonesCounter = 0
                start = false
                //println milestonesLength
        }

        /**
         * Second pass: lets the parent class write the start element, then adds the
         * milestone attributes when the element is a word, or updates the current
         * milestone when the element is a milestone.
         */
        public void processStartElement() {
                super.processStartElement()

                if (start && localname.equals(mileStoneTag)) {
                        mileStoneDistance = 0

                        mileStoneID = parser.getAttributeValue(null, "id")
                        // getAttributeValue expects a namespace URI, not the "xml" prefix
                        if (mileStoneID == null) mileStoneID = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id")
                        if (mileStoneID == null) mileStoneID = "0"

                        milestonesCounter++
                } else if (start && localname.equals(wordTag)) {
                        // println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance
                        int remaining = milestonesLength[milestonesCounter] - mileStoneDistance - 1
                        writer.writeAttribute(msEndAttributeName, Integer.toString(remaining))
                        writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
                        writer.writeAttribute(msIdAttributeName, mileStoneID)

                        mileStoneDistance++
                } else if (localname.equals(startTag)) {
                        start = true
                }
        }

        /**
         * Test entry point.
         *
         * @param args optional: args[0] is the input file path and args[1] the output
         * file path; the developer test files are used when they are not given
         */
        public static void main(String[] args) {
                String inputPath = "/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/test.xml"
                String outputPath = "/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/result.xml"
                if (args != null && args.length >= 2) {
                        inputPath = args[0]
                        outputPath = args[1]
                }

                File inputFile = new File(inputPath)
                File outputFile = new File(outputPath)

                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", "w", "lb")
                println "Sucess: "+msp.process(outputFile)
        }
}