Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / SimpleTokenizerXml.groovy @ 1688

History | View | Annotate | Download (23.9 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
22 321 mdecorde
//
23 321 mdecorde
// This file is part of the TXM platform.
24 321 mdecorde
//
25 321 mdecorde
// The TXM platform is free software: you can redistribute it and/or modify
26 321 mdecorde
// it under the terms of the GNU General Public License as published by
27 321 mdecorde
// the Free Software Foundation, either version 3 of the License, or
28 321 mdecorde
// (at your option) any later version.
29 321 mdecorde
//
30 321 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
31 321 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32 321 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 321 mdecorde
// GNU General Public License for more details.
34 321 mdecorde
//
35 321 mdecorde
// You should have received a copy of the GNU General Public License
36 321 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37 321 mdecorde
//
38 321 mdecorde
//
39 321 mdecorde
//
40 321 mdecorde
// $LastChangedDate:$
41 321 mdecorde
// $LastChangedRevision:$
42 321 mdecorde
// $LastChangedBy:$
43 321 mdecorde
//
44 1000 mdecorde
package org.txm.scripts.filters.Tokeniser;
45 321 mdecorde
46 321 mdecorde
import static groovy.transform.TypeCheckingMode.SKIP
47 321 mdecorde
import groovy.transform.CompileStatic
48 321 mdecorde
49 321 mdecorde
import java.util.regex.Matcher
50 321 mdecorde
import java.util.regex.Pattern
51 321 mdecorde
52 321 mdecorde
import javax.xml.stream.*
53 321 mdecorde
54 986 mdecorde
import org.txm.scripts.importer.graal.PersonalNamespaceContext
55 321 mdecorde
import org.txm.tokenizer.TokenizerClasses
56 321 mdecorde
57 321 mdecorde
@CompileStatic
58 321 mdecorde
public class SimpleTokenizerXml {
59 321 mdecorde
60 321 mdecorde
        // ---- tokenizer classes, copied from TokenizerClasses when the instance is created ----

        /** Regular expression matched against element local names to detect elements that already hold a word. */
        String word_tags = TokenizerClasses.word_tags;
        /** Element name written in place of the original name when a pre-tagged word element is copied. */
        String word_element_to_create = TokenizerClasses.word_element_to_create;
        /** Compiled form of {@link #word_tags}; set by the constructor. */
        Pattern reg_word_tags;
        /** Intra-word tags definition taken from TokenizerClasses. */
        String intraword_tags = TokenizerClasses.intraword_tags;

        /** Strong punctuation definition taken from TokenizerClasses. */
        String punct_strong = TokenizerClasses.punct_strong;

        /** Punctuation definition taken from TokenizerClasses. */
        String punct_all = TokenizerClasses.punct_all;

        /** Word characters definition taken from TokenizerClasses. */
        String word_chars = TokenizerClasses.word_chars;

        /** Clitics pattern source compiled into {@link #regFClitics}; null (the default) means clitics are not managed. */
        String fclitics = null;
        /** Clitics pattern source used for {@link #regPClitics}; null (the default) means clitics are not managed. */
        String pclitics = null;

        // ---- element filters (regular expressions on element local names) ----

        /** Elements whose tag is written but whose content is not tokenized (see setNote). */
        String note_content = null;
        /** Elements whose tag and content are NOT removed but not tokenized. */
        String outside_text_tags_keep_content = null
        /** Elements whose tag and content are removed. */
        String outside_text_tags = null
        /** Element name pattern from which tokenization starts; null means start from the beginning of the document. */
        String startTag = null
        /** Compiled form of {@link #note_content}. */
        Pattern reg_note_content;
        /** Compiled form of {@link #outside_text_tags_keep_content}. */
        Pattern reg_outside_text_tags_keep_content;
        /** Compiled form of {@link #outside_text_tags}. */
        Pattern reg_outside_text_tags;
        /** Compiled start tag pattern, matched against element local names by process(). */
        Pattern reg_startTag;

        /** Matches any single Unicode punctuation character. */
        Pattern reg_punct_other = Pattern.compile("\\p{P}")

        /** When true, process() prints a warning for XML events it ignores. */
        public boolean DEBUG = false;

        // ---- input / output ----

        /** The XML file written by process(). */
        File outfile;

        /** The XML file read by process(). */
        File infile;

        /** The language code given to the constructor. */
        String lang;

        /** Accumulates the character data waiting to be tokenized. */
        StringBuffer buffer;

        /** StAX writer producing {@link #outfile}. */
        XMLStreamWriter writer;
        /** Stream underlying {@link #writer}. */
        BufferedOutputStream output;

        /** StAX reader over {@link #infile}. */
        XMLStreamReader parser

        /** Local name of the element currently handled. */
        String localname;

        /** Namespace prefix of the element currently handled. */
        String prefix;
        /** Name of {@link #infile} without its last extension; used to build word ids. */
        String filename;

        // ---- patterns built by the constructor ----

        def regElision = null;
        def reg3pts = null;
        def regPunct;
        def regFClitics = null ;
        def regPClitics = null ;
        String whitespaces;
        Pattern regWhitespaces;
        Pattern regLN;
        Pattern regCTRL;
        Pattern regSplitWhiteSpaces;
129 321 mdecorde
130 321 mdecorde
        /**
         * Creates a tokenizer with an empty language code by delegating to the
         * three-argument constructor.
         *
         * @param infile the XML file to read
         * @param outfile the XML file to write
         */
        public SimpleTokenizerXml(File infile, File outfile) {
                this(infile, outfile, "")
        }
133 321 mdecorde
134 321 mdecorde
        /**
         * Instantiates a new simple tokenizer xml.
         *
         * Selects the clitics definitions from the language code, copies the
         * tokenizer classes from TokenizerClasses and compiles the regular
         * expressions used during tokenization.
         *
         * @param infile the XML file to read; its name without the last extension is used to build word ids
         * @param outfile the XML file to write
         * @param lang the language code; values starting with "en", "fr", "gl" or "it"
         *        enable clitics handling, any other value (or null) leaves it disabled
         */
        public SimpleTokenizerXml(File infile, File outfile, String lang) {
                this.lang = lang;
                if (lang != null) {
                        if (lang.startsWith("en")) {
                                fclitics = TokenizerClasses.FClitic_en
                        } else if (lang.startsWith("fr")) {
                                fclitics = TokenizerClasses.FClitic_fr
                                pclitics = TokenizerClasses.PClitic_fr
                        } else if (lang.startsWith("gl")) {
                                fclitics = TokenizerClasses.FClitic_gl
                        } else if (lang.startsWith("it")) {
                                pclitics = TokenizerClasses.PClitic_it
                        }
                }

                word_tags = TokenizerClasses.word_tags;
                reg_word_tags = Pattern.compile(word_tags);

                intraword_tags = TokenizerClasses.intraword_tags;
                punct_strong = TokenizerClasses.punct_strong;
                punct_all = TokenizerClasses.punct_all;
                word_chars = TokenizerClasses.word_chars;

                this.outfile = outfile;
                this.infile = infile;
                this.filename = infile.getName();
                // strip the last extension; a leading dot (index 0) is not treated as an extension
                int index = filename.lastIndexOf(".");
                if (index > 0) {
                        filename = filename.substring(0, index);
                }

                String strRegElision = TokenizerClasses.regElision;
                if (strRegElision != null && strRegElision.length() > 0) {
                        regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string  ^
                }
                reg3pts = ~/^(.*?)(\.\.\.)(.*)$/

                String strRegPunct = TokenizerClasses.regPunct;
                if (strRegPunct != null && strRegPunct.length() > 0) {
                        regPunct = ~/^(.*?)($strRegPunct)(.*)$/
                }

                if (fclitics != null && fclitics.length() > 0) {
                        regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $
                }

                if (pclitics != null && pclitics.length() > 0) {
                        // compiled with '~' like the other patterns; it used to be left as a plain GString
                        regPClitics = ~/^($pclitics)(.*)/ // the test must start with the start of string  ^
                }

                // both whitespace patterns are built from the same definition
                if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0) {
                        regWhitespaces = ~TokenizerClasses.whitespaces;
                        regSplitWhiteSpaces = Pattern.compile(TokenizerClasses.whitespaces);
                }

                // NOTE(review): the surrounding '/' characters are part of these two patterns, so they
                // match "/\n/" and "/<control char>/" literally rather than a bare newline or control
                // character. This looks like a slashy-string leftover; left unchanged because the code
                // using regLN and regCTRL is not visible here - confirm before correcting.
                regLN = Pattern.compile("/\n/");
                regCTRL = Pattern.compile("/\\p{C}/");
        }
192 321 mdecorde
193 321 mdecorde
        /**
         * Refreshes the cached element name and prefix from the parser.
         *
         * Nothing is updated unless the event is the start or the end of an element.
         *
         * @param event the current StAX event code
         * @return the java.lang. object
         */
        public fillInfos(int event) {
                boolean onElementBoundary = (event == XMLStreamConstants.START_ELEMENT) || (event == XMLStreamConstants.END_ELEMENT)
                if (onElementBoundary) {
                        localname = parser.getLocalName()
                        prefix = parser.getPrefix()
                }
        }
205 321 mdecorde
206 321 mdecorde
        /**
         * Donothing: just write what is read.
         *
         * Handles START_ELEMENT (tag, namespace declarations and attributes),
         * END_ELEMENT and CHARACTERS events; every other event is ignored.
         *
         * Special processing for words: when wordid is not null the element is
         * written with the name held by word_element_to_create and, if the source
         * element has no "id" attribute, an id "w_" + filename + "_" + wordcount is
         * added. An "n" attribute holding wordcount is added to any element named
         * like word_element_to_create that has none. While inside a word, the text
         * is trimmed and its \n and \t characters are replaced with spaces.
         *
         * @param event the current StAX event code
         * @param wordid the word number when the element is a pre-tagged word, null otherwise
         * @return the java.lang. object
         */
        public donothing(int event, Integer wordid) {
                if (event == XMLStreamConstants.START_ELEMENT) {
                        localname = parser.getLocalName();
                        if (wordid != null) localname = word_element_to_create;

                        // Read the prefix from the parser instead of trusting the cached field:
                        // process() calls this method without calling fillInfos() while it is
                        // still looking for the start tag, so the cached value could be stale.
                        prefix = parser.getPrefix();

                        if (prefix != null && prefix.length() > 0) {
                                writer.writeStartElement(prefix+":"+localname);
                        } else {
                                writer.writeStartElement(localname);
                        }

                        // copy the namespace declarations; an empty prefix declares the default namespace
                        for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                                String namespace_prefix = parser.getNamespacePrefix(i);
                                if ((namespace_prefix != null) && (namespace_prefix.length() > 0)) {
                                        writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
                                } else {
                                        writer.writeDefaultNamespace(parser.getNamespaceURI(i));
                                }
                        }

                        // copy the attributes and remember whether "id" and "n" are already there
                        boolean hasId = false;
                        boolean hasN = false;
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                String attname = parser.getAttributeLocalName(i);
                                String attrprefix = parser.getAttributePrefix(i);
                                if ("id".equals(attname)) hasId = true;
                                if ("n".equals(attname)) hasN = true;

                                if (attrprefix != null && attrprefix.length() > 0) {
                                        writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
                                } else {
                                        writer.writeAttribute(attname, parser.getAttributeValue(i))
                                }
                        }

                        if (wordid != null && !hasId && localname == word_element_to_create) {
                                writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
                        }

                        if (!hasN && localname == word_element_to_create) {
                                writer.writeAttribute("n", ""+wordcount);
                        }

                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                        writer.writeEndElement();
                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (insideword) { // ensure there is not \t or \n in the word form value
                                writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
                        } else {
                                writer.writeCharacters(parser.getText());
                        }
                }
        }
304 321 mdecorde
305 321 mdecorde
306 321 mdecorde
        /**
         * Converts the specified surrogates pair to scalar.
         *
         * The result is ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000,
         * which is what {@link Character#toCodePoint(char, char)} computes.
         * The arguments are not checked to be actual surrogates.
         *
         * @param highSurrogate the high (leading) surrogate
         * @param lowSurrogate the low (trailing) surrogate
         * @return the code point encoded by the pair
         */
        public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate) {
                return Character.toCodePoint(highSurrogate, lowSurrogate)
        }
315 321 mdecorde
316 321 mdecorde
        /**
         * Checks if the specified character is an high/leading surrogate,
         * i.e. lies in the range 0xD800 to 0xDBFF inclusive.
         *
         * @param character the character to test
         * @return true if the character is a high surrogate
         */
        public boolean isHighSurrogate(char character) {
                return Character.isHighSurrogate(character)
        }
324 321 mdecorde
325 321 mdecorde
326 321 mdecorde
327 321 mdecorde
        /** Number of words counted so far; used for the "id" and "n" attributes of word elements. */
        int wordcount = 0

        /**
         * When true, character data is copied as is instead of being tokenized.
         * Tokenization starts with element name matching $reg_startTag.
         */
        boolean ignorecontent = true
        /** True while the parser is inside an element matching the word tags pattern. */
        boolean insideword = false
333 321 mdecorde
        /**
         * Process.
         *
         * Reads infile with a StAX parser and writes the tokenized document to
         * outfile. Until an element matching the start tag pattern is met (when a
         * start tag is set) events are copied with donothing(). After that:
         * elements matching the word tags pattern are counted and copied,
         * elements matching the note or keep-content patterns are copied with
         * their content left untokenized, elements matching the outside-text
         * pattern are skipped with their content, and the remaining character
         * data is buffered and passed to processWord().
         *
         * Errors raised while reading or writing are printed on System.err and
         * reported through the return value. The streams opened by this call are
         * closed on every exit path.
         *
         * @return true, if successful; false if infile does not exist or an error occurred
         */
        public boolean process()
        {
                if (!infile.exists()) {
                        println "$infile does not exists"
                        return false;
                }

                // forget the handles of a previous run so that the cleanup below only touches this run's resources
                writer = null;
                output = null;
                parser = null;
                InputStream inputData = null;
                boolean closed = false;
                try {
                        XMLOutputFactory factory = XMLOutputFactory.newInstance();
                        output = new BufferedOutputStream(new FileOutputStream(outfile))
                        writer = factory.createXMLStreamWriter(output, "UTF-8")
                        writer.setNamespaceContext(new PersonalNamespaceContext());

                        inputData = infile.toURI().toURL().openStream();
                        XMLInputFactory inputfactory = XMLInputFactory.newInstance();
                        parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
                        writer.writeStartDocument("UTF-8","1.0");
                        writer.writeCharacters("\n");

                        int previousEvent = 0;
                        boolean startProcess = (startTag == null); // if no startTag specified we process from the start
                        ignorecontent = !startProcess;

                        buffer = new StringBuffer();
                        try {
                                // NOTE(review): assumes a startTag assigned without its compiled pattern should
                                // simply be compiled here - confirm against the start tag setter.
                                if (!startProcess && reg_startTag == null) {
                                        reg_startTag = Pattern.compile(startTag);
                                }

                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (!startProcess) {
                                                if (event == XMLStreamConstants.START_ELEMENT) {
                                                        if (reg_startTag.matcher(parser.getLocalName()).matches()) {
                                                                startProcess = true
                                                                ignorecontent = false;
                                                        }
                                                }
                                                if (!startProcess) {
                                                        // still before the start tag: copy the event untouched
                                                        donothing(event, null);
                                                        continue;
                                                }
                                        }

                                        if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
                                                processWord(); // tokenize now!
                                                buffer.setLength(0);
                                        }
                                        fillInfos(event);//get localname and prefix
                                        if (event == XMLStreamConstants.START_ELEMENT) {
                                                localname = parser.getLocalName()
                                                if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
                                                        wordcount++;
                                                        donothing(event, wordcount);
                                                        insideword = true;
                                                } else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
                                                        donothing(event, null); // write the tag
                                                        ignorecontent = true;
                                                } else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
                                                        donothing(event, null); // write the tag
                                                        ignorecontent = true;
                                                } else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) { // ignore the tag and its content of the tag
                                                        goToEndOfElement(localname); // parse until the end of the element is passed
                                                } else {
                                                        donothing(event, null);
                                                }
                                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                                localname = parser.getLocalName()
                                                if (reg_word_tags.matcher(localname).matches()) {
                                                        insideword = false;
                                                        writer.writeEndElement();
                                                        writer.writeCharacters("\n");
                                                } else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
                                                        ignorecontent = false;
                                                        donothing(event, null);
                                                } else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
                                                        ignorecontent = false;
                                                        donothing(event, null);
                                                } else {
                                                        donothing(event, null);
                                                }
                                        } else if (event == XMLStreamConstants.CHARACTERS) {
                                                if (ignorecontent || insideword) {
                                                        // do not tokenize: copy the text
                                                        donothing(event, null);
                                                } else {
                                                        buffer.append(parser.getText());
                                                        // flush a long buffer early, but only when it ends with a space
                                                        if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
                                                                processWord();
                                                                buffer = new StringBuffer();
                                                        }
                                                }
                                        } else if (event == XMLStreamConstants.COMMENT) {
                                                writer.writeComment(parser.getText())
                                        } else if (event == XMLStreamConstants.DTD) {
                                                // the DTD is not copied
                                        } else {
                                                if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
                                        }
                                        previousEvent = event;
                                }

                                // regular shutdown: a failure here (e.g. while flushing) is reported as an error
                                parser.close()
                                writer.close();
                                output.close();
                                inputData.close();
                                closed = true;
                        } catch (Exception e) {
                                System.err.println("Error : "+infile);
                                e.printStackTrace();
                                return false;
                        }
                        return true;
                } finally {
                        if (!closed) {
                                // best-effort cleanup: each resource is released independently so that
                                // one failing close() cannot leak the others or hide the original error
                                try {
                                        if (writer != null) writer.close();
                                } catch (Exception ignored) { }
                                try {
                                        if (output != null) output.close();
                                } catch (Exception ignored) { }
                                try {
                                        if (parser != null) parser.close();
                                } catch (Exception ignored) { }
                                try {
                                        if (inputData != null) inputData.close();
                                } catch (Exception ignored) { }
                        }
                }
        }
467 321 mdecorde
468 321 mdecorde
        /**
         * Defines the elements to drop: elements whose local name matches the
         * given regular expression are skipped together with their content.
         *
         * @param regexp regular expression matched against element local names
         */
        public void setOutSideTextTags(String regexp) {
                outside_text_tags = regexp
                reg_outside_text_tags = Pattern.compile(regexp)
        }
477 321 mdecorde
478 321 mdecorde
        /**
         * Defines the elements whose content must NOT be tokenized: the tag is
         * written and its content is copied as is.
         *
         * @param regexp regular expression matched against element local names
         */
        public void setNote(String regexp) {
                note_content = regexp
                reg_note_content = Pattern.compile(regexp)
        }
487 321 mdecorde
488 321 mdecorde
        /**
         * Sets the regular expression naming the elements to ignore while
         * keeping their content.
         *
         * The raw expression is stored in {@code outside_text_tags_keep_content}
         * and its compiled form in {@code reg_outside_text_tags_keep_content}.
         *
         * @param regexp the regular expression source; it is handed to
         *               {@link Pattern#compile(String)} as is
         */
        public void setOutSideTextTagsAndKeepContent(String regexp) {
                // keep the source text first, then its compiled counterpart
                this.outside_text_tags_keep_content = regexp
                this.reg_outside_text_tags_keep_content = Pattern.compile(regexp)
        }
497 321 mdecorde
498 321 mdecorde
        /**
         * Advances {@code parser} until the END_ELEMENT event that closes the
         * element called {@code name}, discarding every event read on the way.
         *
         * Elements opened while skipping are tracked so that an inner end tag
         * is not mistaken for the one being looked for. The method also stops,
         * without any signal, when END_DOCUMENT is reached first.
         *
         * @param name local name of the element whose end tag is searched
         */
        protected void goToEndOfElement(String name) {
                // names of the elements opened (and not yet closed) while skipping
                def pendingTags = []
                int event
                while ((event = parser.next()) != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.START_ELEMENT) {
                                pendingTags << parser.getLocalName()
                                continue
                        }
                        if (event != XMLStreamConstants.END_ELEMENT) {
                                continue // text, comments, ... are simply dropped
                        }
                        if (pendingTags.size() == 0 && name == parser.getLocalName()) {
                                return // nothing left open: this is the end tag we wanted
                        }
                        pendingTags.pop()
                }
        }
515 321 mdecorde
516 321 mdecorde
        /** Single space; processWord() substitutes it for every regLN match. */
        public final static String WHITESPACE = " ";
517 321 mdecorde
        /** Empty string; processWord() substitutes it for every regCTRL match, i.e. deletes the match. */
        public final static String EMPTY = "";
518 321 mdecorde
        /**
         * Tokenizes the text currently held in {@code buffer}.
         *
         * Every {@code regLN} match is replaced by a single space and every
         * {@code regCTRL} match (control characters) is removed. The result is
         * then cut with {@code regSplitWhiteSpaces} when that pattern is set,
         * and each piece is handed to {@link #iterate(String)}; without the
         * pattern the whole text is handed over in one piece.
         */
        protected void processWord() {
                String withoutLineBreaks = regLN.matcher(buffer.toString()).replaceAll(WHITESPACE)
                String cleaned = regCTRL.matcher(withoutLineBreaks).replaceAll(EMPTY)

                if (regSplitWhiteSpaces == null) {
                        iterate(cleaned)
                        return
                }

                // separate with unicode white spaces
                for (String chunk : regSplitWhiteSpaces.split(cleaned)) {
                        iterate(chunk)
                }
        }
535 321 mdecorde
536 321 mdecorde
        /**
         * Tokenizes one chunk of text.
         *
         * The chunk is passed to {@link #standardChecks(String)} again and
         * again, each time replaced by the remainder that call returns, until
         * the remainder is null or empty.
         *
         * @param s the chunk to tokenize; null or empty does nothing
         */
        protected iterate(String s) {
                String remaining = s
                while (remaining != null && !remaining.isEmpty()) {
                        remaining = standardChecks(remaining)
                }
        }
548 321 mdecorde
549 321 mdecorde
        /**
         * Processes the head of one chunk of text and writes the word
         * element(s) found.
         *
         * The patterns are tried in this order and only the first one that
         * matches is applied:
         * <ol>
         * <li>{@code regFClitics} (only when {@code fclitics} is set): group 1
         * is re-tokenized with {@link #iterate(String)}, then group 2 is written
         * as a word;</li>
         * <li>{@code regPClitics} (only when {@code pclitics} is set): group 1
         * is written as a word, then group 2 is re-tokenized;</li>
         * <li>{@code regElision}: group 1 is written as a word, then group 2 is
         * re-tokenized;</li>
         * <li>{@code reg3pts}: group 1 is re-tokenized, then the literal "..."
         * is written as a word;</li>
         * <li>{@code regPunct}: group 1 is re-tokenized, then group 2 is written
         * as a word;</li>
         * <li>otherwise the whole string is written as one word.</li>
         * </ol>
         *
         * @param s the chunk to analyse
         * @return what is left for the caller to process: group 3 of the match
         *         for {@code reg3pts} and {@code regPunct} (may be null), the
         *         empty string in every other case
         */
        //        @CompileStatic(SKIP)
        public String standardChecks(String s) {
                Matcher m;
                if (fclitics != null && (m = s =~ regFClitics) ) {
                        iterate(m.group(1))
                        writeWordElement(m.group(2))
                        return "";
                } else if (pclitics != null && (m = s =~ regPClitics) ) {
                        if (DEBUG) println "PCLITIC found: $s ->"+ m
                        writeWordElement(m.group(1))
                        iterate(m.group(2))
                        return "";
                } else if (regElision != null && (m = s =~ regElision) ) {
                        writeWordElement(m.group(1))
                        iterate(m.group(2))
                        return "";
                } else if (reg3pts != null && (m = s =~ reg3pts) ) {
                        iterate(m.group(1))
                        // the matched dots are always written as three ASCII dots
                        writeWordElement("...")
                        return m.group(3);
                } else if (regPunct != null && (m = s =~ regPunct) ) {
                        if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
                        iterate(m.group(1));
                        writeWordElement(m.group(2))
                        return m.group(3);
                } else {
                        writeWordElement(s)
                        return "";
                }
        }

        /**
         * Counts one more word and writes it as a {@code word_element_to_create}
         * element carrying the attributes of {@link #writeWordAttributes()},
         * followed by a line break.
         *
         * @param form the text content of the word element
         */
        private void writeWordElement(String form) {
                wordcount++;        // must be incremented before the attributes are written: they embed it
                writer.writeStartElement(word_element_to_create);
                writeWordAttributes();// id
                writer.writeCharacters(form);
                writer.writeEndElement();
                writer.writeCharacters("\n");
        }
648 321 mdecorde
649 321 mdecorde
        /**
         * Writes the attributes of the word element currently being written:
         * {@code id}, built as "w_" + filename + "_" + wordcount, and {@code n},
         * the current value of wordcount.
         */
        protected writeWordAttributes() {
                String position = "" + wordcount
                writer.writeAttribute("id", "w_" + filename + "_" + position)
                writer.writeAttribute("n", position)
        }
658 321 mdecorde
659 321 mdecorde
        /**
         * Sets the start tag regular expression.
         *
         * The raw expression is stored in {@code startTag} and its compiled
         * form in {@code reg_startTag}.
         *
         * @param tag the regular expression source; it is handed to
         *            {@link Pattern#compile(String)} as is
         */
        public void setStartTag(String tag)
        {
                // keep the source text first, then its compiled counterpart
                this.startTag = tag
                this.reg_startTag = Pattern.compile(tag)
        }
664 321 mdecorde
665 321 mdecorde
        /**
         * Cuts a string on whitespace.
         *
         * The delimiters are the default ones of {@link StringTokenizer}
         * (space, tab, newline, carriage return and form feed); empty tokens
         * are never produced.
         *
         * @param str the string to cut
         * @return the tokens, in order of appearance; empty when str holds no
         *         token
         */
        public List<String> tokenize(String str)
        {
                List<String> tokens = new ArrayList<String>()
                StringTokenizer cutter = new StringTokenizer(str)
                while (cutter.hasMoreTokens()) {
                        tokens.add(cutter.nextToken())
                }
                return tokens
        }
675 321 mdecorde
676 321 mdecorde
        /**
         * Developer test entry point.
         *
         * Runs "svn update" on the "xml/tokenizernum" directory of the user
         * home, then tokenizes its "baye1.xml" file into "baye1-t.xml" with the
         * "fr" language, DEBUG enabled, "note" as non-tokenized content and
         * "teiHeader" ignored but with its content kept. The result of
         * process() is printed.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");

                // The command is given as separate arguments: Runtime.exec("svn", "update", path)
                // has no overload taking the three strings as a command line, so "update" and
                // the path were never passed to svn as its arguments.
                // The update must also be finished before the input file is read.
                Process svn = new ProcessBuilder("svn", "update", SVNDIR.getAbsolutePath()).inheritIO().start();
                int svnStatus = svn.waitFor();
                if (svnStatus != 0) {
                        // best effort: the test still runs on whatever is in the working copy
                        System.err.println("Warning: 'svn update' exited with status "+svnStatus+" in "+SVNDIR);
                }

                File inputFile = new File(SVNDIR, "baye1.xml")
                File outputFile = new File(SVNDIR, "baye1-t.xml")
                SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
                tokenizer.DEBUG = true
                tokenizer.setNote("note")
                //tokenizer.setOutSideTextTags("teiHeader")
                tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
                println outputFile.toString() + " : "+tokenizer.process();
                // FIXME: tests UTF-8 XML ranges
                //                File inputDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")
                //        File inputFile = new File(inputDir, "problem2.xml");

                /*
                File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
                File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
                File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
                File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
                File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
                outputDir.deleteDir()
                outputDir2.deleteDir()
                outputDir3.deleteDir()
                outputDir.mkdir()
                outputDir2.mkdir()
                outputDir3.mkdir()
                long time;
                for (String lang : ["none"]) {
                        println "LANG: $lang"

                        time = System.currentTimeMillis();
                        File outputFile = new File(outputDir, "${lang}.xml")
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
                        tokenizer.process();
                        println "FAST $lang: "+(System.currentTimeMillis()-time)

//                        time = System.currentTimeMillis();
//                        File outputFile3 = new File(outputDir3, "${lang}.xml")
//                        FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
//                        tokenizer3.process();
//                        println "FAST2 $lang: "+(System.currentTimeMillis()-time)
//
//                        time = System.currentTimeMillis();
//                        File outputFile2 = new File(outputDir2, "${lang}.xml")
//                        OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
//                        tokenizer2.process();
//                        println "SLOW $lang: "+(System.currentTimeMillis()-time)
                }
*/
                println ""
                println "Done"

                //                String lang = "en"
                //                File inDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant")
                //                File outDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant-t")
                //                outDir.deleteDir()
                //                outDir.mkdir()
                //
                //                println "processing "+inDir.listFiles().size()+" files."
                //                for (def infile : inDir.listFiles()) {
                //                        if (!infile.getName().endsWith(".xml")) continue;
                //
                //                        print "."
                //                        File outfile = new File(outDir, infile.getName())
                //                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
                //                        tokenizer.setDEBUG false
                //                        tokenizer.process();
                //                }
                //                println ""
                //                println "Done"

        }
755 321 mdecorde
}