Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / MileStoneProjection.groovy @ 2675

History | View | Annotate | Download (3.5 kB)

1
package org.txm.scripts.importer
2

    
3
import java.io.File;
4
import javax.xml.stream.XMLStreamConstants;
5
import javax.xml.stream.XMLOutputFactory;
6
import javax.xml.stream.XMLStreamWriter;
7
import java.net.URL;
8
import javax.xml.stream.*;
9
import org.txm.importer.filters.*;
10
import org.txm.importer.PersonalNamespaceContext;
11
import org.txm.importer.StaxIdentityParser
12

    
13
/**
 * Annotates every word element with its position relative to the surrounding
 * milestone elements.
 *
 * Three attributes are written on each word found after the start element
 * (the attribute names are prefixed with the milestone tag name):
 * <ul>
 * <li><code>&lt;milestone&gt;id</code>: the id of the last milestone seen</li>
 * <li><code>&lt;milestone&gt;start</code>: number of words since that milestone</li>
 * <li><code>&lt;milestone&gt;end</code>: number of words remaining before the next milestone</li>
 * </ul>
 *
 * The file is read twice: a first pass ({@link #fetchMilestoneLengths()}, run by
 * the constructor) counts the words of each milestone segment, the second pass
 * (the inherited <code>process</code> call) writes the attributes.
 *
 * @author mdecorde
 *
 */
class MileStoneProjection extends StaxIdentityParser {
        /** local name of the word elements to annotate */
        String wordTag
        /** local name of the milestone elements */
        String mileStoneTag
        /** local name of the element after which words and milestones are taken into account */
        String startTag
        /** true once the start element has been read in the current pass */
        boolean start = false

        /** number of words read since the last milestone (or since the start element) */
        int mileStoneDistance = 0

        /** id written on the words; replaced each time a milestone is read in the second pass */
        String mileStoneID = ""
        /** word count of each segment; index 0 holds the words read before the first milestone */
        def milestonesLength = []
        /** index in milestonesLength of the segment currently being read */
        int milestonesCounter = 0
        // NOTE(review): not read or written anywhere in this class
        boolean secondPass = false

        String msIdAttributeName
        String msStartAttributeName
        String msEndAttributeName

        /**
         * Creates the parser and immediately runs the first pass over the input file.
         *
         * @param inputFile the XML file to process
         * @param startTag local name of the element that enables the counting
         * @param wordTag local name of the word elements
         * @param mileStoneTag local name of the milestone elements, also used as attribute name prefix
         */
        public MileStoneProjection(File inputFile, String startTag, String wordTag, String mileStoneTag) {
                super(inputFile)

                this.wordTag = wordTag
                this.mileStoneTag = mileStoneTag
                this.startTag = startTag
                this.start = false
                // id given to the words read before the first milestone
                mileStoneID = mileStoneTag+"_0"

                msIdAttributeName = mileStoneTag+"id"
                msStartAttributeName = mileStoneTag+"start"
                msEndAttributeName = mileStoneTag+"end"

                fetchMilestoneLengths()
                start = false // reset
        }

        /**
         * First pass: reads the whole input and stores in milestonesLength the
         * number of words of each milestone segment.
         *
         * The pass state (start flag, distance and segment counters) is reset
         * before and after the read, so the second pass starts from zero and the
         * method can safely be called again.
         */
        public void fetchMilestoneLengths() {
                milestonesLength = []
                resetPassState()

                def inputData = inputurl.openStream()
                def reader = null
                try {
                        reader = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

                        for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue

                                // local variable: the inherited 'localname' field is left untouched by this pass
                                String name = reader.getLocalName()
                                if (start && name.equals(mileStoneTag)) {
                                        // a milestone closes the current segment
                                        milestonesLength << mileStoneDistance
                                        mileStoneDistance = 0
                                        milestonesCounter++
                                } else if (start && name.equals(wordTag)) {
                                        mileStoneDistance++
                                } else if (name.equals(startTag)) {
                                        start = true
                                }
                        }
                        // the last segment is closed by the end of the document
                        milestonesLength[milestonesCounter] = mileStoneDistance
                } finally {
                        // always release the reader and the stream, even when the parsing fails
                        try {
                                if (reader != null) reader.close()
                        } finally {
                                if (inputData != null) inputData.close()
                        }
                }

                // without this reset the words preceding the first milestone would be
                // numbered starting from the length of the last segment
                resetPassState()
                //println milestonesLength
        }

        /** Puts the counters shared by the two passes back to their initial values. */
        private void resetPassState() {
                start = false
                mileStoneDistance = 0
                milestonesCounter = 0
        }

        /**
         * Second pass: called for each start element. After the inherited
         * processing (presumably the copy of the start tag to the output - confirm
         * against StaxIdentityParser), updates the milestone state or appends the
         * three milestone attributes to the current word element.
         */
        public void processStartElement() {
                super.processStartElement()

                if (start && localname.equals(mileStoneTag)) {
                        mileStoneDistance = 0
                        mileStoneID = parser.getAttributeValue(null, "id")
                        // getAttributeValue expects a namespace URI, not the "xml" prefix
                        if (mileStoneID == null) mileStoneID = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id")
                        if (mileStoneID == null) mileStoneID = "0"

                        milestonesCounter++
                } else if (start && localname.equals(wordTag)) {
                        // println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance
                        // words left in the segment after this one
                        int remaining = milestonesLength[milestonesCounter] - mileStoneDistance - 1
                        writer.writeAttribute(msEndAttributeName, Integer.toString(remaining))
                        writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
                        writer.writeAttribute(msIdAttributeName, mileStoneID)

                        mileStoneDistance++
                } else if (localname.equals(startTag)) {
                        start = true
                }
        }

        /** Manual test entry point working on a hard-coded corpus of the user home directory. */
        public static void main(String[] args) {
                File inputFile = new File(System.getProperty("user.home"), "TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/test.xml")
                File outputFile = new File(System.getProperty("user.home"), "TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/result.xml")

                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", "w", "lb");
                println "Sucess: "+msp.process(outputFile)
        }
}