Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / SimpleTokenizerXml.groovy @ 1688

History | View | Annotate | Download (23.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.filters.Tokeniser;
45

    
46
import static groovy.transform.TypeCheckingMode.SKIP
47
import groovy.transform.CompileStatic
48

    
49
import java.util.regex.Matcher
50
import java.util.regex.Pattern
51

    
52
import javax.xml.stream.*
53

    
54
import org.txm.scripts.importer.graal.PersonalNamespaceContext
55
import org.txm.tokenizer.TokenizerClasses
56

    
57
@CompileStatic
58
public class SimpleTokenizerXml {
59

    
60
        /** Regex source compiled into {@code reg_word_tags}; matched against element local names in process(). */
        String word_tags = TokenizerClasses.word_tags;
        /** Name of the element written for each word (see donothing() and standardChecks()). */
        String word_element_to_create = TokenizerClasses.word_element_to_create;
        /** Compiled form of {@code word_tags}, built in the constructor. */
        Pattern reg_word_tags;
        /** Copied from TokenizerClasses.intraword_tags. */
        String intraword_tags = TokenizerClasses.intraword_tags;

        /** Copied from TokenizerClasses.punct_strong. */
        String punct_strong = TokenizerClasses.punct_strong;

        /** Copied from TokenizerClasses.punct_all. */
        String punct_all = TokenizerClasses.punct_all;

        /** Copied from TokenizerClasses.word_chars. */
        String word_chars = TokenizerClasses.word_chars;

        /** Clitic expression tested at the end of a token; null means clitics are not handled. */
        String fclitics = null;
        /** Clitic expression tested at the start of a token; null means clitics are not handled. */
        String pclitics = null;

        /** Regex source set by setNote(): matching elements are written but their text is not tokenized. */
        String note_content = null;
        /** Regex source set by setOutSideTextTagsAndKeepContent(): tag and content are kept but not tokenized. */
        String outside_text_tags_keep_content = null
        /** Regex source set by setOutSideTextTags(): tag and content are removed. */
        String outside_text_tags = null
        /** When non-null, process() only starts tokenizing at the first element matching {@code reg_startTag}. */
        String startTag = null
        /** Compiled form of {@code note_content}. */
        Pattern reg_note_content;
        /** Compiled form of {@code outside_text_tags_keep_content}. */
        Pattern reg_outside_text_tags_keep_content;
        /** Compiled form of {@code outside_text_tags}. */
        Pattern reg_outside_text_tags;
        /** Pattern tested against element local names in process() to find the start tag. */
        Pattern reg_startTag;

        /** Matches a single Unicode punctuation character. */
        Pattern reg_punct_other = Pattern.compile("\\p{P}")

        /** Enables the diagnostic println calls. */
        public boolean DEBUG = false;

        /** File the tokenized XML is written to. */
        File outfile;

        /** XML file to tokenize. */
        File infile;

        /** Language code given to the constructor; selects the clitic expressions. */
        String lang;

        /** Accumulates character data until processWord() tokenizes it. */
        StringBuffer buffer;

        /** StAX writer producing {@code outfile}. */
        XMLStreamWriter writer;
        /** Stream underlying {@code writer}. */
        BufferedOutputStream output;

        /** StAX reader over {@code infile}. */
        XMLStreamReader parser

        /** Local name of the element last inspected by fillInfos()/donothing(). */
        String localname;

        /** Namespace prefix of the element last inspected by fillInfos(). */
        String prefix;
        /** Name of {@code infile} without its last extension; used to build word ids. */
        String filename;

        // Patterns built in the constructor and applied to each token by standardChecks().
        def regElision = null;
        def reg3pts = null;
        def regPunct;
        def regFClitics = null ;
        def regPClitics = null ;
        String whitespaces;
        Pattern regWhitespaces;
        /** Applied by processWord(); matches are replaced with WHITESPACE. */
        Pattern regLN;
        /** Applied by processWord(); matches are replaced with EMPTY. */
        Pattern regCTRL;
        /** When non-null, processWord() splits the buffered text with this pattern. */
        Pattern regSplitWhiteSpaces;
129

    
130
        /**
         * Instantiates a new simple tokenizer xml with an empty language code,
         * i.e. without any clitic handling.
         *
         * @param infile the XML file to tokenize
         * @param outfile the file to write the tokenized XML to
         */
        public SimpleTokenizerXml(File infile, File outfile) {
                this(infile, outfile, "");
        }

        /**
         * Instantiates a new simple tokenizer xml.
         *
         * The language code selects the clitic expressions ("en", "fr", "gl" and "it"
         * prefixes are recognized); all token-level patterns are compiled here from the
         * expressions exposed by TokenizerClasses.
         *
         * @param infile the XML file to tokenize
         * @param outfile the file to write the tokenized XML to
         * @param lang the language code, may be null
         */
        public SimpleTokenizerXml(File infile, File outfile, String lang) {
                this.lang = lang;
                if (lang != null) {
                        if (lang.startsWith("en")) {
                                fclitics = TokenizerClasses.FClitic_en
                        } else if (lang.startsWith("fr")) {
                                fclitics = TokenizerClasses.FClitic_fr
                                pclitics = TokenizerClasses.PClitic_fr
                        } else if (lang.startsWith("gl")) {
                                fclitics = TokenizerClasses.FClitic_gl
                        } else if (lang.startsWith("it")) {
                                pclitics = TokenizerClasses.PClitic_it
                        }
                }

                // word_tags & co. already hold the TokenizerClasses values through their field initializers
                reg_word_tags = Pattern.compile(word_tags);

                this.outfile = outfile;
                this.infile = infile;
                this.filename = infile.getName();
                int index = filename.lastIndexOf(".");
                if (index > 0) filename = filename.substring(0, index); // strip the last extension

                String strRegElision = TokenizerClasses.regElision;
                if (strRegElision != null && strRegElision.length() > 0)
                        regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string  ^
                reg3pts = ~/^(.*?)(\.\.\.)(.*)$/

                String strRegPunct = TokenizerClasses.regPunct;
                if (strRegPunct != null && strRegPunct.length() > 0)
                        regPunct = ~/^(.*?)($strRegPunct)(.*)$/

                if (fclitics != null && fclitics.length() > 0)
                        regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $

                // compiled with ~ like the other expressions (it used to be stored as a plain GString)
                if (pclitics != null && pclitics.length() > 0)
                        regPClitics = ~/^($pclitics)(.*)/ // the test must start with the start of string  ^

                // both whitespace patterns come from the same expression: compile it once
                String strWhitespaces = TokenizerClasses.whitespaces;
                if (strWhitespaces != null && strWhitespaces.length() > 0) {
                        regWhitespaces = Pattern.compile(strWhitespaces);
                        regSplitWhiteSpaces = regWhitespaces;
                }

                // "/\n/" only matched the 3-character sequence slash, newline, slash: the slashes of the
                // slashy-string notation had been copied into a regular string literal
                regLN = Pattern.compile("\n");
                // NOTE(review): same slash mistake here, so this pattern practically never matches.
                // Deliberately left unchanged: a bare \p{C} would also delete tabs and carriage returns
                // (gluing adjacent words together) as well as format and private-use characters.
                regCTRL = Pattern.compile("/\\p{C}/");
        }
192

    
193
        /**
         * Caches the name of the element the parser is positioned on.
         *
         * On START_ELEMENT and END_ELEMENT events the local name and the namespace prefix
         * are read from the parser into the {@code localname} and {@code prefix} fields;
         * any other event leaves both fields untouched.
         *
         * @param event the StAX event code
         * @return unspecified; the caller in process() ignores it
         */
        public fillInfos(int event) {
                boolean elementBoundary = (event == XMLStreamConstants.START_ELEMENT) || (event == XMLStreamConstants.END_ELEMENT);
                if (!elementBoundary) return null;

                localname = parser.getLocalName();
                prefix = parser.getPrefix();
        }
205

    
206
        /**
         * Copies the current parser event to the writer without tokenizing it.
         *
         * START_ELEMENT: the element is re-created with its namespace declarations and attributes.
         * When {@code wordid} is not null the element is renamed to {@code word_element_to_create}
         * and receives an "id" attribute ("w_" + file name + "_" + word count) unless it already has
         * one; any element named {@code word_element_to_create} receives an "n" attribute unless it
         * already has one.
         * END_ELEMENT: the element is closed.
         * CHARACTERS: the text is copied; inside a word it is trimmed and its \n and \t are
         * replaced with spaces.
         * Other events are ignored.
         *
         * @param event the StAX event code
         * @param wordid non-null when the element is a word, null otherwise
         * @return unspecified; callers in this file ignore it
         */
        public donothing(int event, Integer wordid) {
                if (event == XMLStreamConstants.START_ELEMENT) {
                        localname = parser.getLocalName();
                        // Read the prefix of THIS element: process() calls donothing() without fillInfos()
                        // for everything located before the start tag, so the field could be null or stale.
                        prefix = parser.getPrefix();
                        if (wordid != null) localname = word_element_to_create;

                        if (prefix != null && prefix.length() > 0) {
                                writer.writeStartElement(prefix+":"+localname);
                        } else {
                                writer.writeStartElement(localname);
                        }

                        // namespace declarations: an empty or null prefix declares the default namespace
                        for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                                String namespace_prefix = parser.getNamespacePrefix(i);
                                if (namespace_prefix != null && namespace_prefix.length() > 0) {
                                        writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
                                } else {
                                        writer.writeDefaultNamespace(parser.getNamespaceURI(i));
                                }
                        }

                        // attributes are copied as they are; remember whether "id" and "n" are already there
                        boolean hasId = false;
                        boolean hasN = false;
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                String attname = parser.getAttributeLocalName(i);
                                String attrprefix = parser.getAttributePrefix(i);
                                if ("id".equals(attname)) hasId = true;
                                if ("n".equals(attname)) hasN = true;

                                if (attrprefix != null && attrprefix.length() > 0) {
                                        writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
                                } else {
                                        writer.writeAttribute(attname, parser.getAttributeValue(i))
                                }
                        }

                        if (wordid != null && !hasId && localname == word_element_to_create)
                                writer.writeAttribute("id", "w_"+filename+"_"+wordcount);

                        if (!hasN && localname == word_element_to_create)
                                writer.writeAttribute("n", ""+wordcount);

                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                        writer.writeEndElement();
                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (insideword) { // ensure there is not \t or \n in the word form value
                                writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
                        } else {
                                writer.writeCharacters(parser.getText());
                        }
                }
        }
304

    
305

    
306
        /**
         * Converts the specified surrogates pair to the code point it encodes.
         *
         * @param highSurrogate the leading surrogate (0xD800..0xDBFF)
         * @param lowSurrogate the trailing surrogate (0xDC00..0xDFFF)
         * @return the Unicode scalar value of the pair
         */
        public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate) {
                // same arithmetic as ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
                return Character.toCodePoint(highSurrogate, lowSurrogate);
        }
315

    
316
        /**
         * Tells whether the specified character is a high (leading) surrogate,
         * i.e. lies in the range 0xD800..0xDBFF.
         *
         * @param character the UTF-16 code unit to test
         * @return true if the character is a high surrogate
         */
        public boolean isHighSurrogate(char character) {
                return Character.isHighSurrogate(character);
        }
324

    
325

    
326

    
327
        /** Number of words met so far; used to build the "id" and "n" attributes of the words. */
        int wordcount = 0;

        /** True while character data must be copied as is; tokenization starts with element name matching $reg_startTag. */
        boolean ignorecontent = true;
        /** True between the start and the end of an element matching {@code reg_word_tags}. */
        boolean insideword = false;

        /**
         * Reads {@code infile}, tokenizes its character data and writes the result to {@code outfile}.
         *
         * Elements matching {@code reg_word_tags} are kept as words; elements matching the note /
         * outside-text patterns are copied without tokenization or skipped entirely; everything
         * else is copied and its text is split into words by processWord(). If {@code startTag} is
         * set, everything located before the first matching element is copied untouched.
         *
         * Every stream opened here is closed before returning, whatever the outcome.
         *
         * @return true, if successful; false if the input file is missing or if any error
         *         occurred (the error is printed on System.err)
         */
        public boolean process()
        {
                if (!infile.exists()) {
                        println "$infile does not exists"
                        return false;
                }

                // forget the resources of a previous run so that the cleanup below only touches ours
                writer = null;
                output = null;
                parser = null;
                InputStream inputData = null;
                try {
                        XMLOutputFactory factory = XMLOutputFactory.newInstance();
                        output = new BufferedOutputStream(new FileOutputStream(outfile))
                        writer = factory.createXMLStreamWriter(output, "UTF-8")
                        writer.setNamespaceContext(new PersonalNamespaceContext());

                        inputData = infile.toURI().toURL().openStream();
                        XMLInputFactory inputfactory = XMLInputFactory.newInstance();
                        parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
                        writer.writeStartDocument("UTF-8","1.0");
                        writer.writeCharacters("\n");

                        // startTag may have been set without its compiled pattern
                        if (startTag != null && reg_startTag == null)
                                reg_startTag = Pattern.compile(startTag);

                        int previousEvent = 0;
                        boolean startProcess = (startTag == null); // if no startTag specified we process from the start
                        ignorecontent = !startProcess;

                        buffer = new StringBuffer();
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (!startProcess) {
                                        if (event == XMLStreamConstants.START_ELEMENT && reg_startTag.matcher(parser.getLocalName()).matches()) {
                                                startProcess = true
                                                ignorecontent = false;
                                        }
                                        if (!startProcess) { // still before the start tag: plain copy
                                                donothing(event, null);
                                                continue;
                                        }
                                }

                                // a run of character events just ended: tokenize what was buffered
                                if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
                                        processWord(); // tokenize now!
                                        buffer.setLength(0);
                                }
                                fillInfos(event); //get localname and prefix
                                if (event == XMLStreamConstants.START_ELEMENT) {
                                        localname = parser.getLocalName()
                                        if (reg_word_tags.matcher(localname).matches()) { // pre-tagged word: keep its content but keep counting
                                                wordcount++;
                                                donothing(event, wordcount);
                                                insideword = true;
                                        } else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
                                                donothing(event, null); // write the tag
                                                ignorecontent = true; // and do not tokenize its content
                                        } else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
                                                donothing(event, null); // write the tag
                                                ignorecontent = true; // and do not tokenize its content
                                        } else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) {
                                                goToEndOfElement(localname); // drop the tag and its content
                                        } else {
                                                donothing(event, null);
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) {
                                        localname = parser.getLocalName()
                                        if (reg_word_tags.matcher(localname).matches()) {
                                                insideword = false;
                                                writer.writeEndElement();
                                                writer.writeCharacters("\n");
                                        } else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
                                                ignorecontent = false; // tokenization resumes after the element
                                                donothing(event, null);
                                        } else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
                                                ignorecontent = false; // tokenization resumes after the element
                                                donothing(event, null);
                                        } else {
                                                donothing(event, null);
                                        }
                                } else if (event == XMLStreamConstants.CHARACTERS) {
                                        if (ignorecontent || insideword) {
                                                donothing(event, null);
                                        } else {
                                                buffer.append(parser.getText());
                                                // flush long buffers early, but only right after a space
                                                if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
                                                        processWord();
                                                        buffer.setLength(0);
                                                }
                                        }
                                } else if (event == XMLStreamConstants.COMMENT) {
                                        writer.writeComment(parser.getText())
                                } else if (event == XMLStreamConstants.DTD) {
                                        // the DTD is not copied
                                } else {
                                        if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
                                }
                                previousEvent = event;
                        }

                        // flush inside the try block so that a write failure is reported as an error
                        writer.flush();
                        output.flush();
                        return true;
                } catch (Exception e) {
                        System.err.println("Error : "+infile);
                        e.printStackTrace();
                        return false;
                } finally {
                        // each resource is closed on its own: a failing close() must not prevent the others
                        try { if (writer != null) writer.close(); } catch (Exception ignored) { /* best-effort cleanup */ }
                        try { if (output != null) output.close(); } catch (Exception ignored) { /* best-effort cleanup */ }
                        try { if (parser != null) parser.close(); } catch (Exception ignored) { /* best-effort cleanup */ }
                        try { if (inputData != null) inputData.close(); } catch (Exception ignored) { /* best-effort cleanup */ }
                }
        }
467

    
468
        /**
         * Set the element and content to ignore: elements whose local name matches
         * the expression are removed from the output together with their content.
         *
         * @param regexp the regular expression tested against element local names,
         *               or null to disable the filter
         */
        public void setOutSideTextTags(String regexp) {
                this.outside_text_tags = regexp;
                // null clears the filter instead of failing in Pattern.compile() with a half-updated state
                this.reg_outside_text_tags = (regexp == null) ? null : Pattern.compile(regexp);
        }
477
        
478
        /**
         * Set element content to NOT tokenize: elements whose local name matches
         * the expression are written, but their text is copied without tokenization.
         *
         * @param regexp the regular expression tested against element local names,
         *               or null to disable the filter
         */
        public void setNote(String regexp) {
                this.note_content = regexp;
                // null clears the filter instead of failing in Pattern.compile() with a half-updated state
                this.reg_note_content = (regexp == null) ? null : Pattern.compile(regexp);
        }
487
        
488
        /**
         * Set the elements that are kept in the output, with their content, but whose
         * content is not tokenized.
         *
         * @param regexp the regular expression tested against element local names,
         *               or null to disable the filter
         */
        public void setOutSideTextTagsAndKeepContent(String regexp) {
                this.outside_text_tags_keep_content = regexp;
                // null clears the filter instead of failing in Pattern.compile() with a half-updated state
                this.reg_outside_text_tags_keep_content = (regexp == null) ? null : Pattern.compile(regexp);
        }
497
        
498
        /**
         * Consumes parser events, writing nothing, until the end tag that closes the
         * element the parser has just entered.
         *
         * Elements opened on the way are tracked so that a nested element bearing the
         * same name does not end the skip too early. The method also returns when the
         * end of the document is reached.
         *
         * @param name the local name of the element to skip
         */
        protected void goToEndOfElement(String name) {
                def nested = [] // local names of the elements opened inside the skipped one
                int event = parser.next()
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.START_ELEMENT) {
                                nested << parser.getLocalName()
                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                boolean closesSkipped = nested.size() == 0 && name == parser.getLocalName()
                                if (closesSkipped) {
                                        return;
                                }
                                nested.pop()
                        }
                        event = parser.next()
                }
        }
515
        
516
        /** Replacement used by processWord() for the matches of {@code regLN}. */
        public final static String WHITESPACE = " ";
        /** Replacement used by processWord() for the matches of {@code regCTRL}. */
        public final static String EMPTY = "";

        /**
         * Tokenizes the content of {@code buffer}.
         *
         * Matches of {@code regLN} are replaced with WHITESPACE and matches of {@code regCTRL}
         * with EMPTY. The result is then split with {@code regSplitWhiteSpaces} when that pattern
         * is set, and each piece (or the whole text otherwise) is handed to iterate().
         */
        protected void processWord() {
                String chars = buffer.toString();
                chars = regLN.matcher(chars).replaceAll(WHITESPACE);
                chars = regCTRL.matcher(chars).replaceAll(EMPTY);

                if (regSplitWhiteSpaces == null) {
                        iterate(chars);
                        return;
                }

                String[] pieces = regSplitWhiteSpaces.split(chars); // separate with unicode white spaces
                for (int i = 0 ; i < pieces.length ; i++) {
                        iterate(pieces[i]);
                }
        }
535

    
536
        /**
         * Feeds a String to standardChecks() again and again: each call returns the
         * part of the text that is still to be processed, and the loop stops once
         * that remainder is null or empty.
         *
         * @param s the text to tokenize, may be null or empty
         * @return unspecified; callers in this file ignore it
         */
        protected iterate(String s) {
                String remaining = s;
                while (remaining != null && remaining.length() > 0) {
                        remaining = standardChecks(remaining);
                }
        }
548

    
549
        /**
550
         * Standard checks.
551
         *
552
         * @param s the s
553
         * @return the java.lang. object
554
         */
555
        //        @CompileStatic(SKIP)
556
        /**
         * Runs the standard splitting rules on one string and writes the resulting
         * word element(s) to the output writer.
         *
         * The rules are tried in a fixed order and only the first one that matches is
         * applied: fclitics, pclitics, elision, "..." and punctuation. A rule is skipped
         * when the value guarding it is null. When no rule matches, the whole string is
         * written as a single word.
         *
         * @param s the string to check
         * @return group 3 of the match for the "..." and punctuation rules, so the
         *         caller can go on with that remainder; "" in every other case
         */
        public String standardChecks(String s) {
                Matcher m;

                // fclitics rule: group 1 goes to iterate() first, then group 2 is written as a word.
                if (fclitics != null && (m = s =~ regFClitics)) {
                        iterate(m.group(1))
                        emitWordElement(m.group(2))
                        return "";
                }

                // pclitics rule: group 1 is written as a word first, then group 2 goes to iterate().
                if (pclitics != null && (m = s =~ regPClitics)) {
                        if (DEBUG) println "PCLITIC found: $s ->"+ m
                        emitWordElement(m.group(1))
                        iterate(m.group(2))
                        return "";
                }

                // Elision rule: same output order as the pclitics rule.
                if (regElision != null && (m = s =~ regElision)) {
                        emitWordElement(m.group(1))
                        iterate(m.group(2))
                        return "";
                }

                // "..." rule: group 1 goes to iterate(), a literal "..." word is written,
                // and group 3 is returned to the caller.
                if (reg3pts != null && (m = s =~ reg3pts)) {
                        iterate(m.group(1))
                        emitWordElement("...")
                        return m.group(3);
                }

                // Punctuation rule: group 1 goes to iterate(), group 2 is written as a word,
                // and group 3 is returned to the caller.
                if (regPunct != null && (m = s =~ regPunct)) {
                        if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
                        iterate(m.group(1));
                        emitWordElement(m.group(2))
                        return m.group(3);
                }

                // No rule matched: the string is a word as it stands.
                emitWordElement(s)
                return "";
        }

        /**
         * Writes one word element holding the given text, followed by a line break.
         *
         * The word counter is incremented before the element is opened, so the
         * attributes written by writeWordAttributes() carry the new count.
         *
         * @param form the character content of the word element
         */
        private void emitWordElement(String form) {
                wordcount++;
                writer.writeStartElement(word_element_to_create);
                writeWordAttributes();// id
                writer.writeCharacters(form);
                writer.writeEndElement();
                writer.writeCharacters("\n");
        }
648

    
649
        /**
         * Writes the attributes of the word element currently open on the writer:
         * "id", built as "w_" + filename + "_" + word count, and "n", the word count.
         *
         * The method is declared without a return type; it yields whatever the last
         * writeAttribute call yields.
         */
        protected writeWordAttributes() {
                String position = String.valueOf(wordcount);
                writer.writeAttribute("id", "w_" + filename + "_" + position);
                writer.writeAttribute("n", position);
        }
658

    
659
        /**
         * Sets the start tag and recompiles the regular expression built from it.
         *
         * @param tag the new start tag; it is compiled as a regular expression, so
         *            Pattern.compile rejects an invalid expression
         */
        public void setStartTag(String tag) {
                // The raw tag is stored first, then the compiled form is refreshed from the same value.
                this.startTag = tag;
                this.reg_startTag = Pattern.compile(tag);
        }
664

    
665
        /**
         * Cuts a string into tokens on whitespace, using Groovy's no-argument
         * tokenize().
         *
         * @param str the string to cut
         * @return the tokens of str, in order
         */
        public List<String> tokenize(String str) {
                List<String> tokens = str.tokenize()
                return tokens
        }
675

    
676
        /**
         * Developer entry point: updates the working copy in ~/xml/tokenizernum with
         * "svn update", then tokenizes baye1.xml from that directory into baye1-t.xml
         * and prints the result of process().
         *
         * @param args command-line arguments (unused)
         */
        public static void main(String[] args)
        {
                File svnDir = new File(System.getProperty("user.home"), "xml/tokenizernum");

                // Runtime.exec has no (String, String, String) overload; called that way, Groovy's
                // array-as-varargs dispatch can bind the extra strings to the envp parameter instead
                // of the command line. ProcessBuilder takes the whole command as one argument list.
                Process svnUpdate = new ProcessBuilder("svn", "update", svnDir.getAbsolutePath()).inheritIO().start();

                // Wait for the update to finish so the input file is not read while it is changing.
                int svnStatus = svnUpdate.waitFor();
                if (svnStatus != 0) {
                        // Best effort, as before: report the failure and still run on the files at hand.
                        println "svn update exited with status " + svnStatus
                }

                File inputFile = new File(svnDir, "baye1.xml")
                File outputFile = new File(svnDir, "baye1-t.xml")

                SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
                tokenizer.DEBUG = true
                tokenizer.setNote("note")
                tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
                println outputFile.toString() + " : "+tokenizer.process();
                // FIXME: tests UTF-8 XML ranges

                println ""
                println "Done"
        }
755
}