Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / MileStoneProjection.groovy @ 479

History | View | Annotate | Download (3.3 kB)

1
package org.txm.importer
2

    
3
import java.io.File;
4
import javax.xml.stream.XMLStreamConstants;
5
import javax.xml.stream.XMLOutputFactory;
6
import javax.xml.stream.XMLStreamWriter;
7
import java.io.File;
8
import java.net.URL;
9
import javax.xml.stream.*;
10
import org.txm.importer.filters.*;
11
import org.txm.importer.graal.PersonalNamespaceContext;
12

    
13
/**
14
 * Adds attributes to each word element encoding its distance (in words) to the previous milestone and to the next milestone, plus the id of the previous milestone.
15
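 * Both distance attributes (milestone tag + "start" and milestone tag + "end") are written; the lengths needed for the "end" distance are collected in a preliminary pass over the input.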
16
 * 
17
 * @author mdecorde
18
 *
19
 */
20
class MileStoneProjection extends StaxIdentityParser {
    /** Local name of the elements that receive the projected attributes. */
    String wordTag
    /** Local name of the milestone elements. */
    String mileStoneTag
    /** Local name of the element that enables counting once it has been opened. */
    String startTag
    /** True once the start tag has been seen during the current pass. */
    boolean start = false

    /** Number of word elements seen since the last milestone (or since the start tag). */
    int mileStoneDistance = 0

    /** Id written on words; set to mileStoneTag + "_0" by the constructor until a milestone is read. */
    String mileStoneID = ""
    /**
     * Word count of each segment: index 0 holds the words found between the start tag
     * and the first milestone, index i the words following the i-th milestone.
     */
    def milestonesLength = []
    /** Index in milestonesLength of the segment currently being written. */
    int milestonesCounter = 0
    /** Not read anywhere in this class. */
    boolean secondPass = false

    /** Name of the attribute receiving the id of the previous milestone. */
    String msIdAttributeName
    /** Name of the attribute receiving the distance to the previous milestone. */
    String msStartAttributeName
    /** Name of the attribute receiving the distance to the next milestone. */
    String msEndAttributeName

    /**
     * Builds the projection and immediately runs the preliminary pass that measures
     * every milestone segment of the input file.
     *
     * @param inputFile the XML file to read
     * @param startTag local name of the element after which words and milestones are counted
     * @param wordTag local name of the word elements
     * @param mileStoneTag local name of the milestone elements
     */
    public MileStoneProjection(File inputFile, String startTag, String wordTag, String mileStoneTag) {
        super(inputFile)

        this.wordTag = wordTag
        this.mileStoneTag = mileStoneTag
        this.startTag = startTag
        this.mileStoneID = mileStoneTag + "_0"
        this.start = false

        msIdAttributeName = mileStoneTag + "id"
        msStartAttributeName = mileStoneTag + "start"
        msEndAttributeName = mileStoneTag + "end"

        fetchMilestoneLengths()
        start = false // reset
    }

    /**
     * Preliminary pass: reads the input once and stores in milestonesLength the number
     * of word elements contained in each milestone segment.
     *
     * The pass works on local variables and its own reader, so it does not disturb the
     * state used by the rewriting pass; when it returns, start, mileStoneDistance and
     * milestonesCounter are reset to their initial values.
     */
    public void fetchMilestoneLengths() {
        def lengths = []
        int distance = 0
        boolean started = false

        def inputData = inputurl.openStream()
        try {
            def reader = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
            try {
                for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
                    if (event != XMLStreamConstants.START_ELEMENT) continue

                    String name = reader.getLocalName()
                    if (started && name == mileStoneTag) {
                        // a milestone closes the current segment
                        lengths << distance
                        distance = 0
                    } else if (started && name == wordTag) {
                        distance++
                    } else if (name == startTag) {
                        started = true
                    }
                }
            } finally {
                reader.close()
            }
        } finally {
            // XMLStreamReader.close() does not close the underlying stream
            inputData.close()
        }
        // the last segment is closed by the end of the document
        lengths << distance

        milestonesLength = lengths
        start = false
        mileStoneDistance = 0 // otherwise words preceding the first milestone start from a stale count
        milestonesCounter = 0
    }

    /**
     * Rewriting pass hook: lets the parent class process the start element, then, for each
     * word element found after the start tag, writes the distance to the next milestone,
     * the distance to the previous milestone and the id of the previous milestone.
     */
    public void processStartElement() {
        super.processStartElement()

        if (start && localname == mileStoneTag) {
            mileStoneDistance = 0
            // try the un-namespaced lookup first, then the xml:id attribute, then a default value
            mileStoneID = parser.getAttributeValue(null, "id")
            if (mileStoneID == null) {
                // the first argument is a namespace URI, not a prefix
                mileStoneID = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id")
            }
            if (mileStoneID == null) {
                mileStoneID = "0"
            }

            milestonesCounter++
        } else if (start && localname == wordTag) {
            int distanceToNext = milestonesLength[milestonesCounter] - mileStoneDistance - 1
            writer.writeAttribute(msEndAttributeName, Integer.toString(distanceToNext))
            writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
            writer.writeAttribute(msIdAttributeName, mileStoneID)

            mileStoneDistance++
        } else if (localname == startTag) {
            start = true
        }
    }

    /**
     * Manual test entry point: projects the "lb" milestones onto the "w" elements found
     * after "body" in a hard-coded input file and prints the result of process().
     */
    public static void main(String[] args) {
        File inputFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/tokenized/1538_MarotAdole.xml")
        File outputFile = new File("/home/mdecorde/TEMP/tmp.xml")

        MileStoneProjection msp = new MileStoneProjection(inputFile, "body", "w", "lb")
        println "Sucess: "+msp.process(outputFile)
    }
}