Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / SimpleTokenizerXml.groovy @ 1000

History | View | Annotate | Download (23.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.filters.Tokeniser;
45

    
46
import static groovy.transform.TypeCheckingMode.SKIP
47
import groovy.transform.CompileStatic
48

    
49
import java.util.regex.Matcher
50
import java.util.regex.Pattern
51

    
52
import javax.xml.stream.*
53

    
54
import org.txm.scripts.importer.graal.PersonalNamespaceContext
55
import org.txm.tokenizer.TokenizerClasses
56

    
57
@CompileStatic
58
public class SimpleTokenizerXml {
59

    
60
        /** Regular expression, taken from TokenizerClasses, matching the names of elements that already hold a word. */
        String word_tags = TokenizerClasses.word_tags
        /** Name of the element written for every word. */
        String word_element_to_create = "w"
        /** Compiled form of word_tags; assigned by the three-argument constructor. */
        Pattern reg_word_tags
        /** Taken from TokenizerClasses; not read in the visible part of this class. */
        String intraword_tags = TokenizerClasses.intraword_tags

        /** Taken from TokenizerClasses; not read in the visible part of this class. */
        String punct_strong = TokenizerClasses.punct_strong

        /** Taken from TokenizerClasses; not read in the visible part of this class. */
        String punct_all = TokenizerClasses.punct_all

        /** Taken from TokenizerClasses; not read in the visible part of this class. */
        String word_chars = TokenizerClasses.word_chars

        /** Clitics matched at the end of a token; null (the default) means they are not handled. */
        String fclitics
        /** Clitics matched at the start of a token; null (the default) means they are not handled. */
        String pclitics

        /** Element names whose tag is written and whose content is copied without being tokenized. */
        String note_content
        /** Element names whose tag and content are NOT removed but not tokenized. */
        String outside_text_tags_keep_content
        /** Element names whose tag and content are removed. */
        String outside_text_tags
        /** Element name at which tokenization starts; null means "from the beginning of the document". */
        String startTag
        /** Compiled form of note_content. */
        Pattern reg_note_content
        /** Compiled form of outside_text_tags_keep_content. */
        Pattern reg_outside_text_tags_keep_content
        /** Compiled form of outside_text_tags. */
        Pattern reg_outside_text_tags
        /** Pattern matched against element names to find the start tag. */
        Pattern reg_startTag

        /** Any Unicode punctuation character; not read in the visible part of this class. */
        Pattern reg_punct_other = Pattern.compile("\\p{P}")

        /** Enables the debug messages printed by this class. */
        public boolean DEBUG = false

        /** The XML file that is written. */
        File outfile

        /** The XML file that is read. */
        File infile

        /** Language code received by the constructor. */
        String lang

        /** Text accumulated from character events until it is tokenized. */
        StringBuffer buffer

        /** StAX writer producing the output document. */
        XMLStreamWriter writer
        /** Stream the writer writes to. */
        BufferedOutputStream output

        /** StAX reader over the input document. */
        XMLStreamReader parser

        /** Local name of the element last seen by the parser. */
        String localname

        /** Prefix of the element last seen by the parser. */
        String prefix
        /** Name of the input file without its extension; part of the generated word ids. */
        String filename

        // Patterns assigned by the three-argument constructor; a null value disables the matching check.
        def regElision
        def reg3pts
        def regPunct
        def regFClitics
        def regPClitics
        /** Never assigned in the visible part of this class. */
        String whitespaces
        /** Compiled from TokenizerClasses.whitespaces when that expression is not empty. */
        Pattern regWhitespaces
        /** Pattern whose matches processWord() replaces with WHITESPACE. */
        Pattern regLN
        /** Pattern whose matches processWord() replaces with EMPTY. */
        Pattern regCTRL
        /** Pattern processWord() uses to split text into chunks; null disables the split. */
        Pattern regSplitWhiteSpaces
129

    
130
        /**
         * Creates a tokenizer without clitic handling by delegating to the
         * three-argument constructor with an empty language code.
         *
         * @param infile the XML file to read
         * @param outfile the XML file to write
         */
        public SimpleTokenizerXml(File infile, File outfile) {
                this(infile, outfile, "")
        }
133

    
134
        /**
         * Creates a tokenizer for one XML file and compiles the regular expressions it relies on.
         *
         * Clitic handling depends on the start of the language code: "en" and "gl" set
         * fclitics (matched at the end of a token), "fr" sets fclitics and pclitics (matched at
         * the start of a token), "it" sets pclitics; any other value, including null, leaves
         * both unset.
         *
         * @param infile the XML file to read; its name without extension is kept in filename
         * @param outfile the XML file to write
         * @param lang the language code, may be null
         */
        public SimpleTokenizerXml(File infile, File outfile, String lang) {
                this.lang = lang
                if (lang != null) {
                        if (lang.startsWith("en")) {
                                fclitics = TokenizerClasses.FClitic_en
                        } else if (lang.startsWith("fr")) {
                                fclitics = TokenizerClasses.FClitic_fr
                                pclitics = TokenizerClasses.PClitic_fr
                        } else if (lang.startsWith("gl")) {
                                fclitics = TokenizerClasses.FClitic_gl
                        } else if (lang.startsWith("it")) {
                                pclitics = TokenizerClasses.PClitic_it
                        }
                }

                // word_tags, intraword_tags, punct_strong, punct_all and word_chars are already
                // set from TokenizerClasses by their field initialisers; only the pattern is built here.
                reg_word_tags = Pattern.compile(word_tags)

                this.outfile = outfile
                this.infile = infile
                this.filename = infile.getName()
                int index = filename.lastIndexOf(".")
                if (index > 0) {
                        filename = filename.substring(0, index) // drop the extension
                }

                String strRegElision = TokenizerClasses.regElision
                if (strRegElision != null && strRegElision.length() > 0) {
                        // anchored at the start of the string
                        regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/
                }
                reg3pts = ~/^(.*?)(\.\.\.)(.*)$/

                String strRegPunct = TokenizerClasses.regPunct
                if (strRegPunct != null && strRegPunct.length() > 0) {
                        regPunct = ~/^(.*?)($strRegPunct)(.*)$/
                }

                if (fclitics != null && fclitics.length() > 0) {
                        // anchored at the end of the string
                        regFClitics = ~/(.+)($fclitics)$/
                }

                if (pclitics != null && pclitics.length() > 0) {
                        // anchored at the start of the string; compiled once with '~' like the
                        // other patterns instead of being stored as a GString
                        regPClitics = ~/^($pclitics)(.*)/
                }

                String strWhitespaces = TokenizerClasses.whitespaces
                if (strWhitespaces != null && strWhitespaces.length() > 0) {
                        // Pattern instances are immutable, so both fields can share one
                        regWhitespaces = Pattern.compile(strWhitespaces)
                        regSplitWhiteSpaces = regWhitespaces
                }

                // A Java string is not a slashy string: the former "/\n/" only matched a newline
                // surrounded by two literal slashes.
                regLN = Pattern.compile("\n")
                // NOTE(review): the slashes are literal here too, so this only matches
                // '/' + control character + '/'. Left unchanged on purpose: "\\p{C}" would also
                // delete tabs and similar separators before processWord() splits on whitespace,
                // which could merge adjacent words. Confirm the intended behaviour before changing it.
                regCTRL = Pattern.compile("/\\p{C}/")
        }
191

    
192
        /**
         * Stores the local name and the prefix of the element the parser is positioned on
         * into the localname and prefix fields. Events other than START_ELEMENT and
         * END_ELEMENT leave both fields untouched.
         *
         * @param event the StAX event code returned by the parser
         * @return nothing meaningful; the call site in process() ignores it
         */
        public fillInfos(int event) {
                boolean onElement = (event == XMLStreamConstants.START_ELEMENT) || (event == XMLStreamConstants.END_ELEMENT)
                if (!onElement) {
                        return null
                }
                localname = parser.getLocalName()
                prefix = parser.getPrefix()
        }
204

    
205
        /**
         * Copies the current parser event to the writer without tokenizing it.
         *
         * START_ELEMENT: writes the start tag with its namespace declarations and attributes,
         * then a line break. When wordid is not null the element is written under the name
         * word_element_to_create and, if it has no "id" attribute, receives
         * id="w_" + filename + "_" + wordcount. An element named word_element_to_create without
         * an "n" attribute receives n=wordcount.
         * END_ELEMENT: closes the current element and writes a line break.
         * CHARACTERS: writes the text; inside a word the text is trimmed and its line breaks
         * and tabs are replaced by spaces.
         * Any other event is ignored.
         *
         * @param event the StAX event code returned by the parser
         * @param wordid non-null when the element is a word; only its nullity is tested, the
         *               attribute values come from the wordcount field
         * @return nothing meaningful; the call sites in process() ignore it
         */
        public donothing(int event, Integer wordid) {
                if (event == XMLStreamConstants.START_ELEMENT) {
                        localname = parser.getLocalName()
                        // Refresh the prefix as well: process() calls this method without
                        // fillInfos() for everything located before the start tag, and the
                        // stale (initially null) prefix made those elements lose theirs.
                        prefix = parser.getPrefix()
                        if (wordid != null) {
                                localname = word_element_to_create
                        }

                        if (prefix != null && prefix.length() > 0) {
                                writer.writeStartElement(prefix + ":" + localname)
                        } else {
                                writer.writeStartElement(localname)
                        }

                        // repeat the namespace declarations carried by the element
                        int namespaceCount = parser.getNamespaceCount()
                        for (int i = 0; i < namespaceCount; i++) {
                                String namespacePrefix = parser.getNamespacePrefix(i)
                                if (namespacePrefix != null && namespacePrefix.length() > 0) {
                                        writer.writeNamespace(namespacePrefix, parser.getNamespaceURI(i))
                                } else {
                                        writer.writeDefaultNamespace(parser.getNamespaceURI(i))
                                }
                        }

                        // copy the attributes, remembering whether "id" and "n" are present
                        boolean hasId = false
                        boolean hasN = false
                        int attributeCount = parser.getAttributeCount()
                        for (int i = 0; i < attributeCount; i++) {
                                String attname = parser.getAttributeLocalName(i)
                                String attrprefix = parser.getAttributePrefix(i)
                                if ("id".equals(attname)) hasId = true
                                if ("n".equals(attname)) hasN = true

                                if (attrprefix != null && attrprefix.length() > 0) {
                                        writer.writeAttribute(attrprefix + ":" + attname, parser.getAttributeValue(i))
                                } else {
                                        writer.writeAttribute(attname, parser.getAttributeValue(i))
                                }
                        }

                        boolean isWordElement = (localname == word_element_to_create)
                        if (wordid != null && !hasId && isWordElement) {
                                writer.writeAttribute("id", "w_" + filename + "_" + wordcount)
                        }
                        if (!hasN && isWordElement) {
                                writer.writeAttribute("n", "" + wordcount)
                        }

                        writer.writeCharacters("\n")
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                        writer.writeEndElement()
                        writer.writeCharacters("\n")
                } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (insideword) {
                                // ensure there is no \t or \n in the word form value
                                writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "))
                        } else {
                                writer.writeCharacters(parser.getText())
                        }
                }
        }
303

    
304

    
305
        /**
         * Combines a UTF-16 surrogate pair into the code point it encodes.
         * The arguments are not validated.
         *
         * @param highSurrogate the leading code unit of the pair
         * @param lowSurrogate the trailing code unit of the pair
         * @return the combined code point
         */
        public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate) {
                return Character.toCodePoint(highSurrogate, lowSurrogate)
        }
314

    
315
        /**
         * Tells whether a UTF-16 code unit is a high (leading) surrogate,
         * i.e. lies between 0xD800 and 0xDBFF inclusive.
         *
         * @param character the code unit to test
         * @return true for a high surrogate, false otherwise
         */
        public boolean isHighSurrogate(char character) {
                return Character.isHighSurrogate(character)
        }
323

    
324

    
325

    
326
        /** Number of words met so far; supplies the "id" and "n" attribute values of word elements. */
        int wordcount = 0

        /** True while content is copied without being tokenized; tokenization starts with the element whose name matches reg_startTag. */
        boolean ignorecontent = true
        /** True between the start and the end tag of an element matching reg_word_tags. */
        boolean insideword = false
332
        /**
         * Reads infile, tokenizes its text and writes the result to outfile.
         *
         * Everything located before the first element matching reg_startTag is copied as is;
         * without a startTag the whole document is processed. From there on:
         * elements matching reg_word_tags are counted as words and their content is copied;
         * elements matching reg_note_content or reg_outside_text_tags_keep_content are written
         * and their content is copied without being tokenized;
         * elements matching reg_outside_text_tags are skipped together with their content;
         * any other text is buffered and handed to processWord().
         *
         * Exceptions raised while the output and input are being opened propagate to the
         * caller; exceptions raised while the document is processed are printed and reported
         * through the return value. The parser, the writer and both streams are released in
         * every case.
         *
         * @return true if the document was processed and closed without error, false if the
         *         input file does not exist or an error occurred
         */
        public boolean process() {
                if (!infile.exists()) {
                        println "$infile does not exists"
                        return false
                }

                XMLOutputFactory factory = XMLOutputFactory.newInstance()
                output = new BufferedOutputStream(new FileOutputStream(outfile))
                InputStream inputData = null
                boolean success = false
                try {
                        writer = factory.createXMLStreamWriter(output, "UTF-8")
                        writer.setNamespaceContext(new PersonalNamespaceContext())

                        inputData = infile.toURI().toURL().openStream()
                        XMLInputFactory inputfactory = XMLInputFactory.newInstance()
                        parser = inputfactory.createXMLStreamReader(inputData, "UTF-8")
                        writer.writeStartDocument("UTF-8", "1.0")
                        writer.writeCharacters("\n")

                        int previousEvent = 0
                        // if no startTag is specified we process from the start
                        boolean startProcess = (startTag == null)
                        ignorecontent = !startProcess
                        buffer = new StringBuffer()

                        try {
                                // startTag and reg_startTag are independent properties: compile the
                                // pattern here when only the expression was provided
                                if (!startProcess && reg_startTag == null) {
                                        reg_startTag = Pattern.compile(startTag)
                                }

                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (!startProcess) {
                                                if (event == XMLStreamConstants.START_ELEMENT && reg_startTag.matcher(parser.getLocalName()).matches()) {
                                                        startProcess = true
                                                        ignorecontent = false
                                                } else {
                                                        // still before the start tag: plain copy
                                                        donothing(event, null)
                                                        continue
                                                }
                                        }

                                        if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
                                                processWord() // the text run is over: tokenize now
                                                buffer.setLength(0)
                                        }

                                        fillInfos(event) // refreshes localname and prefix on element events
                                        if (event == XMLStreamConstants.START_ELEMENT) {
                                                if (reg_word_tags.matcher(localname).matches()) {
                                                        // pre-tagged word: keep its content but keep counting
                                                        wordcount++
                                                        donothing(event, wordcount)
                                                        insideword = true
                                                } else if ((reg_note_content != null && reg_note_content.matcher(localname).matches())
                                                                || (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches())) {
                                                        // write the tag, then copy its content without tokenizing it
                                                        donothing(event, null)
                                                        ignorecontent = true
                                                } else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) {
                                                        // drop the tag and its content
                                                        goToEndOfElement(localname)
                                                } else {
                                                        donothing(event, null)
                                                }
                                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                                if (reg_word_tags.matcher(localname).matches()) {
                                                        insideword = false
                                                        writer.writeEndElement()
                                                        writer.writeCharacters("\n")
                                                } else {
                                                        boolean closesUntokenized =
                                                                        (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) ||
                                                                        (reg_note_content != null && reg_note_content.matcher(localname).matches())
                                                        if (closesUntokenized) {
                                                                ignorecontent = false // tokenization resumes after this element
                                                        }
                                                        donothing(event, null)
                                                }
                                        } else if (event == XMLStreamConstants.CHARACTERS) {
                                                if (ignorecontent || insideword) {
                                                        donothing(event, null)
                                                } else {
                                                        buffer.append(parser.getText())
                                                        // flush long runs early, but only when the buffer ends with a space
                                                        if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
                                                                processWord()
                                                                buffer.setLength(0)
                                                        }
                                                }
                                        } else if (event == XMLStreamConstants.COMMENT) {
                                                writer.writeComment(parser.getText())
                                        } else if (event == XMLStreamConstants.DTD) {
                                                // the DTD is not copied
                                        } else {
                                                if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
                                        }
                                        previousEvent = event
                                }

                                parser.close()
                                writer.close()
                                output.close()
                                inputData.close()
                                success = true
                        } catch (Exception e) {
                                System.err.println("Error : "+infile)
                                e.printStackTrace()
                        }
                } finally {
                        if (!success) {
                                // best-effort cleanup so that a failure does not leak the parser,
                                // the writer or the file handles
                                try { if (parser != null) parser.close() } catch (Exception ignored) { }
                                try { if (writer != null) writer.close() } catch (Exception ignored) { }
                                try { output.close() } catch (Exception ignored) { }
                                try { if (inputData != null) inputData.close() } catch (Exception ignored) { }
                        }
                }
                return success
        }
462

    
463
        /**
         * Declares the elements that process() skips together with their content.
         *
         * @param regexp regular expression matched against element local names
         */
        public void setOutSideTextTags(String regexp) {
                outside_text_tags = regexp
                reg_outside_text_tags = Pattern.compile(regexp)
        }
472
        
473
        /**
         * Declares the elements that process() writes and whose content it copies
         * without tokenizing it.
         *
         * @param regexp regular expression matched against element local names
         */
        public void setNote(String regexp) {
                note_content = regexp
                reg_note_content = Pattern.compile(regexp)
        }
482
        
483
        /**
         * Declares the elements whose tag and content are kept in the output but whose
         * content is not tokenized.
         *
         * @param regexp regular expression matched against element local names
         */
        public void setOutSideTextTagsAndKeepContent(String regexp) {
                outside_text_tags_keep_content = regexp
                reg_outside_text_tags_keep_content = Pattern.compile(regexp)
        }
492
        
493
        /**
         * Advances the parser until the end tag closing the element that was just opened
         * has been read, writing nothing. Also returns when the end of the document is reached.
         *
         * @param name local name of the element to leave
         */
        protected void goToEndOfElement(String name) {
                // names of the elements opened since the call; only its size is consulted
                List<String> nested = new ArrayList<String>()
                int event = parser.next()
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.START_ELEMENT) {
                                nested.add(parser.getLocalName())
                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                if (nested.isEmpty() && name == parser.getLocalName()) {
                                        return // this end tag closes the skipped element
                                }
                                nested.pop()
                        }
                        event = parser.next()
                }
        }
510
        
511
        /** Replacement used by processWord() for the matches of regLN. */
        public static final String WHITESPACE = " "
        /** Replacement used by processWord() for the matches of regCTRL. */
        public static final String EMPTY = ""
513
        /**
         * Tokenizes the buffered text: the matches of regLN are replaced by WHITESPACE and
         * the matches of regCTRL by EMPTY, then the text is split with regSplitWhiteSpaces
         * and each chunk is passed to iterate(). Without a split pattern the whole text is
         * passed in one piece. The buffer itself is left untouched.
         */
        protected void processWord() {
                String text = regLN.matcher(buffer.toString()).replaceAll(WHITESPACE)
                text = regCTRL.matcher(text).replaceAll(EMPTY)
                if (regSplitWhiteSpaces == null) {
                        iterate(text)
                        return
                }
                for (String chunk : regSplitWhiteSpaces.split(text)) {
                        iterate(chunk)
                }
        }
530

    
531
        /**
         * Feeds a chunk of text to standardChecks() again and again, each call receiving
         * what the previous one returned, until the remainder is null or empty.
         *
         * @param s the text to consume, may be null or empty
         * @return nothing meaningful
         */
        protected iterate(String s) {
                String remaining = s
                while (remaining != null && !remaining.isEmpty()) {
                        remaining = standardChecks(remaining)
                }
        }
543

    
544
        /**
545
         * Standard checks.
546
         *
547
         * @param s the s
548
         * @return the java.lang. object
549
         */
550
        //        @CompileStatic(SKIP)
551
        /**
         * Applies the tokenizer rules to one text segment and writes the word
         * element(s) found to the XML writer.
         *
         * The rules are tried in a fixed order and only the first one that matches
         * is applied:
         * <ol>
         * <li>regFClitics (only when fclitics is set): group 1 is handed to
         * iterate(), then group 2 is written as a word;</li>
         * <li>regPClitics (only when pclitics is set): group 1 is written as a
         * word, then group 2 is handed to iterate();</li>
         * <li>regElision: group 1 is written as a word, then group 2 is handed to
         * iterate();</li>
         * <li>reg3pts: group 1 is handed to iterate(), the literal "..." is written
         * as a word and group 3 is returned;</li>
         * <li>regPunct: group 1 is handed to iterate(), group 2 is written as a
         * word and group 3 is returned;</li>
         * <li>otherwise the whole segment is written as a single word.</li>
         * </ol>
         *
         * @param s the segment to check
         * @return group 3 of the match for the reg3pts and regPunct rules, "" in
         *         every other case
         */
        public String standardChecks(String s) {
                // In Groovy a Matcher used as a condition is true when find() succeeds,
                // so after a successful test the groups of m can be read directly.
                Matcher m;
                if (fclitics != null && (m = s =~ regFClitics) ) {
                        // what precedes the clitic goes through iterate() first, so that
                        // its words are numbered before the clitic itself
                        iterate(m.group(1))
                        writeWordElement(m.group(2))
                        return "";
                } else if (pclitics != null && (m = s =~ regPClitics) ) {
                        if (DEBUG) println "PCLITIC found: $s ->"+ m
                        // the leading part is a word of its own, the rest is re-examined
                        writeWordElement(m.group(1))
                        iterate(m.group(2))
                        return "";
                } else if (regElision != null && (m = s =~ regElision) ) {
                        // same split as above: leading part written, rest re-examined
                        writeWordElement(m.group(1))
                        iterate(m.group(2))
                        return "";
                } else if (reg3pts != null && (m = s =~ reg3pts) ) {
                        iterate(m.group(1))
                        // the literal "..." is written, whatever group 2 matched
                        writeWordElement("...")
                        return m.group(3);
                } else if (regPunct != null && (m = s =~ regPunct) ) {
                        if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
                        iterate(m.group(1));
                        writeWordElement(m.group(2))
                        return m.group(3);
                } else {
                        // no rule applies: the segment is one word
                        writeWordElement(s)
                        return "";
                }
        }

        /**
         * Writes one word element: increments the word counter, opens an element
         * named by word_element_to_create, writes its attributes with
         * writeWordAttributes(), writes the given text, closes the element and
         * adds a line break after it.
         *
         * @param form the text content of the word element
         */
        private void writeWordElement(String form) {
                // the counter is incremented first: writeWordAttributes() reads it
                wordcount++;
                writer.writeStartElement(word_element_to_create);
                writeWordAttributes();// id
                //writer.writeAttribute("type", "w");
                writer.writeCharacters(form);
                writer.writeEndElement();
                writer.writeCharacters("\n");
        }
643

    
644
        /**
         * Writes the "id" and "n" attributes of the word element being written.
         * The id is built from the file name and the current word count; "n" is
         * the current word count.
         */
649
        protected writeWordAttributes() {
                // position of the word, as text; reused in the identifier
                String position = "" + wordcount
                String wordId = "w_" + filename + "_" + position

                writer.writeAttribute("id", wordId)
                writer.writeAttribute("n", position)
        }
653

    
654
        public void setStartTag(String tag) {
                // keep the raw value, then the pattern compiled from it
                startTag = tag
                reg_startTag = Pattern.compile(startTag)
        }
659

    
660
        /**
         * Splits a string into tokens separated by whitespace.
         *
         * @param str the string to split
         * @return the tokens, in order of appearance
         */
666
        public List<String> tokenize(String str) {
                // Groovy's no-argument tokenize() cuts on whitespace
                List<String> tokens = str.tokenize()
                return tokens
        }
670

    
671
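        /**
         * Manual test entry point: tokenizes the sample file "baye1.xml" found in
         * the "xml/tokenizernum" directory of the user home into "baye1-t.xml" and
         * prints the result of the process.
         *
         * @param args not used
         */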
676
        public static void main(String[] args)
        {
                File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");

                // Refresh the working copy before reading from it. The command is given
                // as an explicit argument list and awaited. Runtime.exec("svn", "update", path)
                // is avoided: it has no matching overload and only resolves through
                // Groovy's vararg handling, which can bind the trailing strings to the
                // envp parameter of exec(String, String[]); it also returns without
                // waiting for the update to finish.
                int svnStatus = new ProcessBuilder("svn", "update", SVNDIR.getAbsolutePath()).inheritIO().start().waitFor()
                if (svnStatus != 0) {
                        // best effort: the files already present are still tokenized
                        println "svn update exited with status "+svnStatus
                }

                File inputFile = new File(SVNDIR, "baye1.xml")
                File outputFile = new File(SVNDIR, "baye1-t.xml")
                SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
                tokenizer.DEBUG = true
                tokenizer.setNote("note")
                //tokenizer.setOutSideTextTags("teiHeader")
                tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
                println outputFile.toString() + " : "+tokenizer.process();
                // FIXME: tests UTF-8 XML ranges
                //                File inputDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")
                //        File inputFile = new File(inputDir, "problem2.xml");

                /*
                File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
                File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
                File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
                File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
                File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
                outputDir.deleteDir()
                outputDir2.deleteDir()
                outputDir3.deleteDir()
                outputDir.mkdir()
                outputDir2.mkdir()
                outputDir3.mkdir()
                long time;
                for (String lang : ["none"]) {
                        println "LANG: $lang"

                        time = System.currentTimeMillis();
                        File outputFile = new File(outputDir, "${lang}.xml")
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
                        tokenizer.process();
                        println "FAST $lang: "+(System.currentTimeMillis()-time)

//                        time = System.currentTimeMillis();
//                        File outputFile3 = new File(outputDir3, "${lang}.xml")
//                        FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
//                        tokenizer3.process();
//                        println "FAST2 $lang: "+(System.currentTimeMillis()-time)
//
//                        time = System.currentTimeMillis();
//                        File outputFile2 = new File(outputDir2, "${lang}.xml")
//                        OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
//                        tokenizer2.process();
//                        println "SLOW $lang: "+(System.currentTimeMillis()-time)
                }
*/
                println ""
                println "Done"

                //                String lang = "en"
                //                File inDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant")
                //                File outDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant-t")
                //                outDir.deleteDir()
                //                outDir.mkdir()
                //
                //                println "processing "+inDir.listFiles().size()+" files."
                //                for (def infile : inDir.listFiles()) {
                //                        if (!infile.getName().endsWith(".xml")) continue;
                //
                //                        print "."
                //                        File outfile = new File(outDir, infile.getName())
                //                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
                //                        tokenizer.setDEBUG false
                //                        tokenizer.process();
                //                }
                //                println ""
                //                println "Done"

        }
750
}