Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / MileStoneProjection.groovy @ 1688

History | View | Annotate | Download (3.4 kB)

1
package org.txm.scripts.importer
2

    
3
import java.io.File;
4
import javax.xml.stream.XMLStreamConstants;
5
import javax.xml.stream.XMLOutputFactory;
6
import javax.xml.stream.XMLStreamWriter;
7
import java.net.URL;
8
import javax.xml.stream.*;
9
import org.txm.importer.filters.*;
10
import org.txm.importer.PersonalNamespaceContext;
11
import org.txm.importer.StaxIdentityParser
12

    
13
/**
 * Projects milestone information onto word elements while copying an XML file.
 *
 * Every word element found after the start element receives three attributes,
 * whose names are built from the milestone tag name:
 * <ul>
 * <li>{@code <milestoneTag>end}: number of words remaining before the next milestone</li>
 * <li>{@code <milestoneTag>start}: number of words since the previous milestone</li>
 * <li>{@code <milestoneTag>id}: identifier of the previous milestone</li>
 * </ul>
 *
 * The input is read twice: a first pass (done by the constructor) counts the words
 * of each milestone segment, the second pass (the inherited process method) writes
 * the attributes.
 *
 * @author mdecorde
 */
class MileStoneProjection extends StaxIdentityParser {
	/** local name of the word elements to annotate */
	String wordTag
	/** local name of the milestone elements */
	String mileStoneTag
	/** local name of the element after which words and milestones are taken into account */
	String startTag
	/** true once the start element has been seen in the current pass */
	boolean start = false

	/** number of words seen since the last milestone (or since the start element) */
	int mileStoneDistance = 0

	/** identifier of the last milestone seen; used as-is for words located before the first milestone */
	String mileStoneID = ""
	/** word count of each segment; index 0 is the segment located before the first milestone */
	def milestonesLength = []
	/** index of the current segment in milestonesLength */
	int milestonesCounter = 0
	boolean secondPass = false // not used by this class

	String msIdAttributeName
	String msStartAttributeName
	String msEndAttributeName

	/**
	 * Creates the parser and immediately runs the first pass over the input file.
	 *
	 * @param inputFile the XML file to read
	 * @param startTag local name of the element that enables the counting
	 * @param wordTag local name of the word elements
	 * @param mileStoneTag local name of the milestone elements
	 */
	public MileStoneProjection(File inputFile, String startTag, String wordTag, String mileStoneTag) {
		super(inputFile)

		this.wordTag = wordTag
		this.mileStoneTag = mileStoneTag
		this.startTag = startTag
		this.start = false

		// identifier given to the words located before the first milestone
		mileStoneID = mileStoneTag + "_0"

		msIdAttributeName = mileStoneTag + "id"
		msStartAttributeName = mileStoneTag + "start"
		msEndAttributeName = mileStoneTag + "end"

		fetchMilestoneLengths()
		start = false // reset
	}

	/**
	 * First pass: reads the whole input and stores in milestonesLength the number of
	 * words of each milestone segment.
	 *
	 * All the counters shared with the writing pass (start, mileStoneDistance,
	 * milestonesCounter) are reset before and after the scan, so the method can be
	 * called several times and the writing pass always starts from a clean state.
	 */
	public void fetchMilestoneLengths() {
		milestonesLength = []
		milestonesCounter = 0
		mileStoneDistance = 0
		start = false

		def inputData = inputurl.openStream()
		try {
			def reader = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
			try {
				for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
					if (event != XMLStreamConstants.START_ELEMENT) continue

					// local variable: the inherited parser state is left untouched by this pass
					String name = reader.getLocalName()
					if (start && name.equals(mileStoneTag)) {
						// a milestone closes the current segment
						milestonesLength << mileStoneDistance
						mileStoneDistance = 0
						milestonesCounter++
					} else if (start && name.equals(wordTag)) {
						mileStoneDistance++
					} else if (name.equals(startTag)) {
						start = true
					}
				}
				// the last segment is closed by the end of the document
				milestonesLength[milestonesCounter] = mileStoneDistance
			} finally {
				reader.close()
			}
		} finally {
			// closing the StAX reader does not close the underlying stream
			inputData.close()

			// without resetting mileStoneDistance, the words located before the first
			// milestone would be numbered from the length of the last segment
			milestonesCounter = 0
			mileStoneDistance = 0
			start = false
		}
		//println milestonesLength
	}

	/**
	 * Second pass: delegates to the parent implementation, then updates the counters
	 * on a milestone, or adds the end/start/id attributes on a word.
	 */
	public void processStartElement() {
		super.processStartElement()

		if (start && localname.equals(mileStoneTag)) {
			mileStoneDistance = 0

			// getAttributeValue expects a namespace URI (not a prefix) as first argument
			mileStoneID = parser.getAttributeValue(null, "id")
			if (mileStoneID == null) mileStoneID = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id")
			if (mileStoneID == null) mileStoneID = "0"

			milestonesCounter++
		} else if (start && localname.equals(wordTag)) {
			// println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance
			int remaining = milestonesLength[milestonesCounter] - mileStoneDistance - 1
			writer.writeAttribute(msEndAttributeName, Integer.toString(remaining))
			writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
			writer.writeAttribute(msIdAttributeName, mileStoneID)

			mileStoneDistance++
		} else if (localname.equals(startTag)) {
			start = true
		}
	}

	/**
	 * Developer test entry point: projects the "lb" milestones onto the "w" elements
	 * of a hard-coded local file.
	 */
	public static void main(String[] args) {
		File inputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/test.xml")
		File outputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/result.xml")

		MileStoneProjection msp = new MileStoneProjection(inputFile, "text", "w", "lb");
		println "Sucess: "+msp.process(outputFile)
	}
}