Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / tal / XMLStaxTokenizerAdapter2.groovy @ 479

History | View | Annotate | Download (4.3 kB)

1
package org.txm.tal
2

    
3
import java.net.URL;
4

    
5
import javax.xml.stream.XMLInputFactory;
6
import javax.xml.stream.XMLOutputFactory;
7
import javax.xml.stream.XMLStreamConstants;
8
import javax.xml.stream.XMLStreamReader;
9
import javax.xml.stream.XMLStreamWriter;
10

    
11
import org.txm.importer.StaxIdentityParser;
12
import org.txm.importer.graal.PersonalNamespaceContext;
13
import org.txm.tal.melt.SxPipe
14

    
15

    
16
/**
 * Test of a tokenizer that collects the text of selected parts of an XML file.
 *
 * The file is streamed with a StAX parser. While streaming, a path made of the
 * local names of the currently open elements is maintained in
 * {@link #currentXPath} (for example "/TEI/text/body/div/post/p/", note the
 * trailing slash). Text found inside an element whose path is contained in
 * {@link #xpaths} is accumulated and stored in {@link #segments}.
 *
 * An instance wraps a single parser over a single input stream: call
 * {@link #process()} once, it releases both when it returns.
 *
 * @author mdecorde
 *
 */
class XMLStaxTokenizerAdapter2 {
    /** Paths to collect text from; compared to {@link #currentXPath} with contains(). */
    def xpaths;
    /** When true, progress messages are printed on standard output. */
    boolean debug = false;
    /** True while the parser is inside an element selected by {@link #xpaths}. */
    boolean doTokenize = false
    /** Text segments collected so far, in document order. */
    def segments = []
    /** Text accumulated for the segment currently being built. */
    String str;
    /** Number of segments appended to {@link #segments}. */
    int nsegments = 0;

    protected URL inputurl;
    /** Stream opened on {@link #inputurl}; closed by {@link #closeResources()}. */
    protected def inputData;
    protected XMLInputFactory factory;
    protected XMLStreamReader parser;

    // Namespace constants and context; not referenced by the code of this class.
    public static String TXMNS = "http://textometrie.org/1.0";
    public static String TEINS = "http://www.tei-c.org/ns/1.0";
    protected static PersonalNamespaceContext Nscontext = new PersonalNamespaceContext();

    /** Slash-terminated path of the local names of the currently open elements. */
    String currentXPath = "/"
    /** Local name of the element of the last START_ELEMENT/END_ELEMENT event. */
    String localname;

    /**
     * Opens the file and creates the StAX parser.
     *
     * @param infile the XML file to read
     * @param xpaths the element paths whose text must be collected; each value
     *        is compared as-is to the current path, so it must use the same
     *        form (local names separated and terminated by "/")
     */
    public XMLStaxTokenizerAdapter2(File infile, def xpaths) {
        this.inputurl = infile.toURI().toURL();
        this.xpaths = xpaths
        inputData = inputurl.openStream();
        try {
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (Exception e) {
            // the instance is unusable: do not leave the stream open behind it
            try {
                inputData.close();
            } catch (Exception ignored) {
                // the parser creation failure is the error worth reporting
            }
            throw e;
        }
    }

    /**
     * Reads the whole document and fills {@link #segments}.
     *
     * The parser and the input stream are always closed before returning, so
     * this method is meant to be called once per instance.
     *
     * @return true if the document was read until its end, false if an error
     *         occurred (the error is printed and logged)
     */
    public boolean process() {
        try {
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName();
                        currentXPath += localname + "/"
                        processStartElement();
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        processCharacters();
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        localname = parser.getLocalName();
                        // the hook must see the path of the element being closed...
                        processEndElement();
                        // ...then the trailing "<localname>/" is removed from the path
                        currentXPath = currentXPath.substring(0, currentXPath.length() - localname.length() - 1)
                        break;
                }
            }
            return true;
        } catch (Exception e) {
            println("Error while parsing file "+inputurl);
            println("Location "+parser.getLocation());
            org.txm.utils.logger.Log.printStackTrace(e);
            return false;
        } finally {
            closeResources();
        }
    }

    /**
     * Closes the parser and the input stream, ignoring close failures.
     *
     * XMLStreamReader.close() does not close the underlying input source, so
     * the stream has to be closed explicitly.
     */
    protected void closeResources() {
        try {
            if (parser != null) parser.close();
        } catch (Exception e) {
            if (debug) println "Error while closing parser: " + e
        }
        try {
            if (inputData != null) inputData.close();
        } catch (Exception e) {
            if (debug) println "Error while closing input stream: " + e
        }
    }

    /**
     * Called on each START_ELEMENT, after {@link #currentXPath} was extended.
     *
     * Inside a selected element, a child element start flushes the text
     * accumulated so far as a segment. Outside, tokenizing is switched on if
     * the current path is one of {@link #xpaths}.
     */
    protected void processStartElement() {
        if (debug) println "start element $currentXPath"
        if (doTokenize) {
            if (debug) println "Found text element $currentXPath"
            tokenizeCurrentText();
        } else {
            if (xpaths.contains(currentXPath)) {
                doTokenize = true;
                str = "";
            }
        }
    }

    /**
     * Called on each END_ELEMENT, before {@link #currentXPath} is shortened.
     *
     * When the closed element is one of {@link #xpaths}, the pending text is
     * flushed and tokenizing is switched off.
     */
    protected void processEndElement() {
        if (xpaths.contains(currentXPath)) {
            tokenizeCurrentText()
            doTokenize = false;
            if (debug) println "Found text element: stop tokenizing"
        }
    }

    /**
     * Called on each CHARACTERS event: appends the text to the pending
     * segment while tokenizing is switched on.
     */
    protected void processCharacters() {
        if (doTokenize) {
            str += parser.getText();
        }
    }

    /**
     * Stores the pending text as a new segment and resets the pending text.
     * Does nothing when there is no pending text.
     */
    protected tokenizeCurrentText() {
        // str stays null until a selected element has been entered
        if (str == null || str.length() == 0) return;

        segments << str
        nsegments++;
        str = "" // continue tokenizing
    }

    /**
     * Default tokenizer, based on Groovy's String.tokenize().
     * Not called by this class; override it to plug in another tokenizer.
     *
     * @param str the text to split
     * @return the tokens found in str
     */
    protected List<CharSequence> tokenize(String str) {
        return str.tokenize();
    }

    /**
     * Manual test: collects the text segments of a sample file, writes them
     * one per line to a text file, runs them through SxPipe and writes the
     * result. Input and output paths are hard-coded.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        File inputXMLFile = new File("/home/mdecorde/xml/comere/cmrclavardage/cmr-getalp_org-actu-tei-v1.xml");
        def firstPass = new XMLStaxTokenizerAdapter2(inputXMLFile, ["/TEI/text/body/div/post/p/"]);

        println "FIRST PASS"
        firstPass.process()
        println firstPass.getSegments().size()

        println "TOKENIZING..."
        StringBuffer buffer = new StringBuffer();
        for (def segment : firstPass.getSegments()) {
            // one segment per line: line breaks inside a segment become spaces
            buffer << segment.replaceAll("\n", " ") + "\n"
        }
        new File("/home/mdecorde/Bureau/test1.txt").withWriter {writer -> writer.println(buffer)}

        SxPipe sxpipe = new SxPipe()
        sxpipe.start()
        sxpipe.processText(buffer.toString())
        sxpipe.stop()

        new File("/home/mdecorde/Bureau/test.txt").withWriter {writer -> writer.println(sxpipe.getResult())}
        def tokenizedSegments = sxpipe.getResultSegments()
        println "tokenizedSegments size "+tokenizedSegments.size()
        //                for(def segment : firstPass.getSegments()) {
        //                        tokenizedSegments << segment.tokenize()
        //                }

        //                println "SECOND PASS"
        //                def secondPass = new XMLStaxTokenizerAdapter(inputXMLFile, ["/TEI/text/body/div/post/p/"], tokenizedSegments);
        //                secondPass.process(new File("/home/mdecorde/xml/comere/cmrclavardage/tmp.xml"))
    }
}