Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / XMLStaxTokenizerAdapter2.groovy @ 1688

History | View | Annotate | Download (4.4 kB)

1
package org.txm.scripts.tal
2

    
3
import java.net.URL;
4

    
5
import javax.xml.stream.XMLInputFactory;
6
import javax.xml.stream.XMLOutputFactory;
7
import javax.xml.stream.XMLStreamConstants;
8
import javax.xml.stream.XMLStreamReader;
9
import javax.xml.stream.XMLStreamWriter;
10

    
11
import org.txm.importer.StaxIdentityParser;
12
import org.txm.scripts.importer.graal.PersonalNamespaceContext;
13
import org.txm.tal.melt.SxPipe
14

    
15

    
16
/**
 * Test of a tokenizer that collects the text of selected parts of an XML file.
 *
 * The file is read with a StAX parser. While reading, the class maintains a
 * simplified, slash-terminated path made of the local names of the open
 * elements (for example "/TEI/text/body/div/post/p/"). The text found inside
 * the elements whose path is listed in {@link #xpaths} is accumulated and
 * stored, chunk by chunk, in {@link #segments}.
 *
 * An instance reads its input once: {@link #process()} consumes and closes
 * the underlying stream.
 *
 * @author mdecorde
 *
 */
class XMLStaxTokenizerAdapter2 {
        /** Slash-terminated element paths whose text content must be collected. */
        def xpaths;
        /** When true, progress messages are printed on standard output. */
        boolean debug = false;
        /** True while the parser is inside an element selected by {@link #xpaths}. */
        boolean doTokenize = false
        /** Text chunks collected so far, in document order. */
        def segments = []
        /** Text accumulated since the last chunk was stored. */
        String str;

        /** URL of the parsed file. */
        protected URL inputurl;
        /** Stream opened on {@link #inputurl}; closed by {@link #process()}. */
        protected def inputData;
        protected XMLInputFactory factory;
        protected XMLStreamReader parser;

        // Namespace constants and context: declared for users of the class, not read by this class itself.
        public static String TXMNS = "http://textometrie.org/1.0";
        public static String TEINS = "http://www.tei-c.org/ns/1.0";
        protected static PersonalNamespaceContext Nscontext = new PersonalNamespaceContext();

        /** Path of the current element: "/" followed by each open element's local name and a "/". */
        String currentXPath = "/"
        /** Local name of the element of the last START_ELEMENT or END_ELEMENT event. */
        String localname;

        /** Number of chunks appended to {@link #segments}. */
        int nsegments = 0;

        /**
         * Opens the file and creates the StAX reader.
         *
         * @param infile the XML file to read
         * @param xpaths the slash-terminated element paths to collect text from,
         *        e.g. ["/TEI/text/body/div/post/p/"]; null is treated as "no path"
         */
        public XMLStaxTokenizerAdapter2(File infile, def xpaths) {
                this.inputurl = infile.toURI().toURL();
                this.xpaths = (xpaths != null) ? xpaths : []
                inputData = inputurl.openStream();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (Exception e) {
                        // the reader could not be created: do not leak the stream opened above
                        inputData.close();
                        throw e;
                }
        }

        /**
         * Reads the whole document and fills {@link #segments}.
         *
         * The reader and the input stream are always closed when this method
         * returns, whether parsing succeeded or not.
         *
         * @return true if the document was read to its end, false if an error
         *         occurred (the error is printed and logged)
         */
        public boolean process() {
                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName();
                                                currentXPath += localname+"/"
                                                processStartElement();
                                                break;
                                        case XMLStreamConstants.CHARACTERS:
                                                processCharacters();
                                                break;
                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName();
                                                processEndElement();
                                                // drop the trailing "<localname>/" to go back to the parent path
                                                currentXPath = currentXPath.substring(0, currentXPath.length() - localname.length() -1)
                                                break;
                                }
                        }
                        return true;
                } catch(Exception e) {
                        println("Error while parsing file "+inputurl);
                        println("Location "+parser?.getLocation());
                        org.txm.utils.logger.Log.printStackTrace(e);
                        return false;
                } finally {
                        closeResources();
                }
        }

        /**
         * Closes the StAX reader and the input stream.
         *
         * Each resource is closed independently so that a failure on the reader
         * does not prevent the stream from being closed. Closing errors are
         * logged and do not change the result of {@link #process()}.
         */
        protected void closeResources() {
                try {
                        if (parser != null) parser.close();
                } catch (Exception e) {
                        org.txm.utils.logger.Log.printStackTrace(e);
                }
                try {
                        if (inputData != null) inputData.close();
                } catch (Exception e) {
                        org.txm.utils.logger.Log.printStackTrace(e);
                }
        }

        /**
         * Called on each START_ELEMENT, after {@link #currentXPath} has been updated.
         *
         * Inside a selected element, a child element start stores the text
         * accumulated so far as a chunk. Outside, collection starts if the
         * current path is one of {@link #xpaths}.
         */
        protected void processStartElement() {
                if (debug) println "start element $currentXPath"
                if (doTokenize) {
                        if (debug) println "Found text element $currentXPath"
                        tokenizeCurrentText();
                } else {
                        if (xpaths.contains(currentXPath)) {
                                doTokenize = true;
                                str = "";
                        }
                }
        }

        /**
         * Called on each END_ELEMENT, before {@link #currentXPath} is moved back
         * to the parent. Stores the pending text and stops collecting when the
         * closed element is one of {@link #xpaths}.
         */
        protected void processEndElement() {
                if (xpaths.contains(currentXPath)) {
                        tokenizeCurrentText()
                        doTokenize = false;
                        if (debug) println "Found text element: stop tokenizing"
                }
        }

        /**
         * Called on each CHARACTERS event: appends the text to the pending
         * chunk while inside a selected element.
         */
        protected void processCharacters() {
                if (doTokenize) {
                        str += parser.getText();
                }
        }

        /**
         * Stores the pending text as a new chunk of {@link #segments} and resets
         * it. Does nothing when there is no pending text (null or empty).
         */
        protected tokenizeCurrentText() {
                if (str == null || str.length() == 0) return;

                segments << str
                nsegments++;
                str = "" // continue tokenizing
        }

        /**
         * Default tokenizer: delegates to Groovy's String.tokenize().
         * This class does not call it; override it to plug in another tokenizer.
         *
         * @param str the text to tokenize
         * @return the tokens of str
         */
        protected List<CharSequence> tokenize(String str) {
                return str.tokenize();
        }

        /**
         * Test entry point: collects the text segments of a file, writes them one
         * per line in "test1.txt", runs SxPipe on them and writes its result in
         * "test.txt".
         *
         * @param args optional: args[0] is the XML file to read, args[1] the
         *        directory receiving the output files; the original hard-coded
         *        locations are used when they are not given
         */
        public static void main(String[] args) {
                String inputPath = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/comere/cmrclavardage/cmr-getalp_org-actu-tei-v1.xml"
                String outputPath = (args != null && args.length > 1) ? args[1] : "/home/mdecorde/Bureau"
                File inputXMLFile = new File(inputPath);
                File outputDir = new File(outputPath);
                def firstPass = new XMLStaxTokenizerAdapter2(inputXMLFile, ["/TEI/text/body/div/post/p/"]);

                println "FIRST PASS"
                if (!firstPass.process()) {
                        // the segments would be incomplete: do not feed them to the tokenizer
                        println "FIRST PASS failed: aborting"
                        return
                }
                println firstPass.getSegments().size()

                println "TOKENIZING..."
                StringBuilder buffer = new StringBuilder();
                for (def segment : firstPass.getSegments()) {
                        // one segment per line
                        buffer.append(segment.replaceAll("\n", " ")).append("\n")
                }
                new File(outputDir, "test1.txt").withWriter {writer -> writer.println(buffer)}

                SxPipe sxpipe = new SxPipe()
                sxpipe.start()
                sxpipe.processText(buffer.toString())
                sxpipe.stop()

                new File(outputDir, "test.txt").withWriter {writer -> writer.println(sxpipe.getResult())}
                def tokenizedSegments = sxpipe.getResultSegments()
                println "tokenizedSegments size "+tokenizedSegments.size()
        }
}