Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / filters / Tokeniser / SimpleTokenizerXml.groovy @ 187

History | View | Annotate | Download (23.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package filters.Tokeniser;
45

    
46
import static groovy.transform.TypeCheckingMode.SKIP
47
import groovy.transform.CompileStatic
48

    
49
import java.util.regex.Matcher
50
import java.util.regex.Pattern
51

    
52
import javax.xml.stream.*
53

    
54
import org.txm.importer.graal.PersonalNamespaceContext
55
import org.txm.tokenizer.TokenizerClasses
56

    
57
@CompileStatic
58
public class SimpleTokenizerXml {
59

    
60
        /** Regular expression source for the local names of pre-tagged word elements; compiled into reg_word_tags by the constructor. */
61
        String word_tags = TokenizerClasses.word_tags;
62
        // Compiled form of word_tags; process() matches element local names against it.
        Pattern reg_word_tags;
63
        /** Copied from TokenizerClasses.intraword_tags; not read anywhere else in the visible part of this class. */
64
        String intraword_tags = TokenizerClasses.intraword_tags;
65

    
66
        /** Copied from TokenizerClasses.punct_strong; not read anywhere else in the visible part of this class. */
67
        String punct_strong = TokenizerClasses.punct_strong;
68

    
69
        /** Copied from TokenizerClasses.punct_all; not read anywhere else in the visible part of this class. */
70
        String punct_all = TokenizerClasses.punct_all;
71

    
72
        /** Copied from TokenizerClasses.word_chars; not read anywhere else in the visible part of this class. */
73
        String word_chars = TokenizerClasses.word_chars;
74

    
75
        String fclitics = null; // regex source for clitics matched at the end of a token; null (default) = clitics are not handled
76
        String pclitics = null; // regex source for clitics matched at the start of a token; null (default) = clitics are not handled
77

    
78
        /** Regular expression source (set by setNote) for elements whose tags are copied but whose content is not tokenized. */
79
        String note_content = null;
80
        String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
81
        String outside_text_tags = null // tag and content removed
82
        // When non-null, process() copies the document untouched until an element matching reg_startTag opens.
        String startTag = null
83
        // Compiled forms of the four regular expression sources above (null = filter disabled).
        Pattern reg_note_content;
84
        Pattern reg_outside_text_tags_keep_content; 
85
        Pattern reg_outside_text_tags;
86
        Pattern reg_startTag;
87
        
88
        // Matches one Unicode punctuation character; not read in the visible part of this class.
        Pattern reg_punct_other = Pattern.compile("\\p{P}")
89

    
90
        /** When true, extra diagnostic messages are printed while tokenizing. */
91
        public boolean DEBUG = false;
92

    
93
        /** File the tokenized XML is written to. */
94
        File outfile;
95

    
96
        /** XML file to tokenize. */
97
        File infile;
98

    
99
        // Language code given to the constructor; selects the clitic regular expressions.
        String lang;
100

    
101
        /** Character data collected by process() and waiting to be tokenized by processWord(). */
102
        StringBuffer buffer;
103

    
104
        /** StAX writer producing the output document; created by process() on top of {@code output}. */
105
        XMLStreamWriter writer;
106
        BufferedOutputStream output;
107

    
108
        /** StAX reader over the input file; created by process(). */
109
        XMLStreamReader parser
110

    
111
        /** Local name of the element the parser is currently positioned on. */
112
        String localname;
113

    
114
        /** Namespace prefix of the element the parser is currently positioned on. */
115
        String prefix;
116
        // Name of infile up to its first '.'; used to build generated word ids.
        String filename;
117

    
118
        // Regular expressions built by the constructor and applied to each token by standardChecks().
        def regElision = null;
119
        def reg3pts = null;
120
        def regPunct;
121
        def regFClitics = null ;
122
        def regPClitics = null ;
123
        // Never assigned in the visible part of this class.
        String whitespaces;
124
        // Compiled from TokenizerClasses.whitespaces by the constructor.
        Pattern regWhitespaces;
125
        // processWord() replaces every match of this pattern with WHITESPACE.
        Pattern regLN;
126
        // processWord() deletes every match of this pattern.
        Pattern regCTRL;
127
        // processWord() splits the cleaned text on this pattern; when null the text is tokenized as one piece.
        Pattern regSplitWhiteSpaces;
128

    
129
        /**
         * Creates a tokenizer without language-specific clitic handling.
         * Delegates to the three-argument constructor with an empty language code.
         *
         * @param infile the XML file to tokenize
         * @param outfile the file the tokenized XML is written to
         */
        public SimpleTokenizerXml(File infile, File outfile) {
                this(infile, outfile, "");
        }

        /**
         * Creates a tokenizer and compiles every regular expression it needs.
         *
         * The language code selects the clitic expressions: "en" and "gl" get
         * final clitics, "it" gets initial clitics, "fr" gets both; any other
         * value (or null) disables clitic handling.
         *
         * @param infile the XML file to tokenize
         * @param outfile the file the tokenized XML is written to
         * @param lang language code, tested with startsWith(); may be null
         */
        public SimpleTokenizerXml(File infile, File outfile, String lang) {
                this.lang = lang;
                if (lang != null) {
                        if (lang.startsWith("en")) {
                                fclitics = TokenizerClasses.FClitic_en
                        } else if (lang.startsWith("fr")) {
                                fclitics = TokenizerClasses.FClitic_fr
                                pclitics = TokenizerClasses.PClitic_fr
                        } else if (lang.startsWith("gl")) {
                                fclitics = TokenizerClasses.FClitic_gl
                        } else if (lang.startsWith("it")) {
                                pclitics = TokenizerClasses.PClitic_it
                        }
                }

                word_tags = TokenizerClasses.word_tags;
                reg_word_tags = Pattern.compile(word_tags);
                intraword_tags = TokenizerClasses.intraword_tags;
                punct_strong = TokenizerClasses.punct_strong;
                punct_all = TokenizerClasses.punct_all;
                word_chars = TokenizerClasses.word_chars;

                this.outfile = outfile;
                this.infile = infile;
                // generated word ids use the file name up to its first '.'
                this.filename = infile.getName();
                int index = filename.indexOf(".");
                if (index > 0) {
                        filename = filename.substring(0, index);
                }

                String strRegElision = TokenizerClasses.regElision;
                if (strRegElision != null && strRegElision.length() > 0) {
                        regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string  ^
                }
                reg3pts = ~/^(.*?)(\.\.\.)(.*)$/

                String strRegPunct = TokenizerClasses.regPunct;
                if (strRegPunct != null && strRegPunct.length() > 0) {
                        regPunct = ~/^(.*?)($strRegPunct)(.*)$/
                }

                if (fclitics != null && fclitics.length() > 0) {
                        regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $
                }

                if (pclitics != null && pclitics.length() > 0) {
                        // compiled once like the other expressions instead of being kept as a GString
                        regPClitics = ~/^($pclitics)(.*)/ // the test must start with the start of string  ^
                }

                String strWhitespaces = TokenizerClasses.whitespaces;
                if (strWhitespaces != null && strWhitespaces.length() > 0) {
                        // Pattern is immutable, so one compiled instance serves both fields
                        regWhitespaces = Pattern.compile(strWhitespaces);
                        regSplitWhiteSpaces = regWhitespaces;
                }

                // A Java string has no regex delimiters: the former "/\n/" only matched
                // a newline enclosed in two literal slashes, i.e. never in practice.
                regLN = Pattern.compile("\n");
                // Control characters (category Cc) except whitespace ones, which must
                // survive as token separators. The wider category C is deliberately not
                // used: it also covers format, private-use and unassigned code points.
                regCTRL = Pattern.compile("[\\p{Cc}&&[^\\s]]");
        }
190

    
191
        /**
         * Caches the name of the element the parser is positioned on.
         *
         * Only start-element and end-element events carry a name; every other
         * event leaves {@code localname} and {@code prefix} untouched.
         *
         * @param event the StAX event code just returned by the parser
         * @return the element prefix for element events, null otherwise
         */
        public fillInfos(int event) {
                boolean onElementBoundary = (event == XMLStreamConstants.START_ELEMENT) || (event == XMLStreamConstants.END_ELEMENT)
                if (!onElementBoundary) {
                        return null
                }
                localname = parser.getLocalName()
                prefix = parser.getPrefix()
        }
203

    
204
        /**
         * Copies the current parser event to the writer without tokenizing it.
         *
         * Start elements are written with their namespace declarations and
         * attributes, followed by a line break. Elements named "w" additionally
         * get an "n" attribute when they have none, and an "id" attribute of the
         * form w_&lt;filename&gt;_&lt;wordcount&gt; when they have none and a word id is given.
         * End elements are closed and followed by a line break. Character data is
         * written as is, except inside a pre-tagged word where it is trimmed and
         * its line breaks and tabulations are replaced by spaces.
         * Any other event is ignored.
         *
         * @param event the StAX event code the parser is positioned on
         * @param wordid non-null when the element is a word that may need a generated id
         * @return nothing meaningful; callers ignore the result
         */
        public donothing(int event, Integer wordid) {
                if (event == XMLStreamConstants.START_ELEMENT) {
                        localname = parser.getLocalName();
                        // process() calls this method before fillInfos() while it is still
                        // looking for the start tag, so the cached prefix may be stale
                        prefix = parser.getPrefix();
                        if (prefix != null && prefix.length() > 0) {
                                writer.writeStartElement(prefix+":"+localname);
                        } else {
                                writer.writeStartElement(localname);
                        }

                        for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                                String namespace_prefix = parser.getNamespacePrefix(i);
                                if (namespace_prefix != null && namespace_prefix.length() > 0) {
                                        writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
                                } else {
                                        writer.writeDefaultNamespace(parser.getNamespaceURI(i));
                                }
                        }

                        boolean hasId = false;
                        boolean hasN = false;
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                String attname = parser.getAttributeLocalName(i);
                                String attrprefix = parser.getAttributePrefix(i);
                                if ("id".equals(attname)) hasId = true;
                                if ("n".equals(attname)) hasN = true;

                                if (attrprefix != null && attrprefix.length() > 0) {
                                        writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
                                } else {
                                        writer.writeAttribute(attname, parser.getAttributeValue(i))
                                }
                        }

                        if (wordid != null && !hasId && localname == "w") {
                                writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
                        }

                        if (!hasN && localname == "w") {
                                writer.writeAttribute("n", ""+wordcount);
                        }

                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                        writer.writeEndElement();
                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (insideword) { // ensure there is not \t or \n in the word form value
                                writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
                        } else {
                                writer.writeCharacters(parser.getText());
                        }
                }
        }
301

    
302

    
303
        /**
         * Combines a UTF-16 surrogate pair into the code point it encodes.
         *
         * The arguments are not validated: values outside the surrogate ranges
         * go through the same arithmetic.
         *
         * @param highSurrogate the leading code unit of the pair
         * @param lowSurrogate the trailing code unit of the pair
         * @return the supplementary code point encoded by the pair
         */
        public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)        {
                return Character.toCodePoint(highSurrogate, lowSurrogate);
        }
312

    
313
        /**
         * Tells whether a UTF-16 code unit is a high (leading) surrogate,
         * i.e. lies between U+D800 and U+DBFF inclusive.
         *
         * @param character the code unit to test
         * @return true if the code unit is a high surrogate
         */
        public boolean isHighSurrogate(char character)        {
                return Character.isHighSurrogate(character);
        }
321

    
322

    
323

    
324
        /** Number of words seen so far; used for generated "n" and "id" attribute values. */
325
        int wordcount = 0;
326

    
327
        /** While true, process() copies character data unchanged instead of tokenizing it. */
328
        boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
329
        // True between the start and end tag of a pre-tagged word; donothing() then normalizes the text it writes.
        boolean insideword = false;
330
        /**
         * Tokenizes the input file and writes the result to the output file.
         *
         * The document is read event by event. Until an element matching the
         * start tag expression is opened (or from the very beginning when no
         * start tag is set) everything is copied unchanged. After that:
         * <ul>
         * <li>pre-tagged word elements are counted and copied, their text is not tokenized;</li>
         * <li>elements matching the note or "keep content" expressions are copied and
         * their text is left untokenized until the element closes;</li>
         * <li>elements matching the "outside text" expression are skipped with their content;</li>
         * <li>all other character data is buffered and handed to processWord().</li>
         * </ul>
         * Comments are copied; DTD events are dropped; remaining event kinds are
         * ignored, with a warning when DEBUG is set.
         *
         * @return true when the whole document was processed; false when the
         *         input file does not exist or an exception occurred while
         *         reading or writing
         */
        public boolean process()
        {
                if (!infile.exists()) {
                        println "$infile does not exists"
                        return false;
                }

                InputStream inputData = null;
                try {
                        XMLOutputFactory factory = XMLOutputFactory.newInstance();
                        output = new BufferedOutputStream(new FileOutputStream(outfile))
                        writer = factory.createXMLStreamWriter(output, "UTF-8")
                        writer.setNamespaceContext(new PersonalNamespaceContext());

                        inputData = infile.toURI().toURL().openStream();
                        XMLInputFactory inputfactory = XMLInputFactory.newInstance();
                        parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
                        writer.writeStartDocument("UTF-8","1.0");
                        writer.writeCharacters("\n");

                        int previousEvent = 0;
                        // if no startTag specified we process from the start
                        boolean startProcess = (startTag == null);
                        if (!startProcess && reg_startTag == null) {
                                // startTag was set as a plain property: compile it now
                                // rather than failing on a null pattern in the loop
                                reg_startTag = Pattern.compile(startTag);
                        }
                        ignorecontent = !startProcess;

                        buffer = new StringBuffer();
                        try {
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (!startProcess) {
                                                if (event == XMLStreamConstants.START_ELEMENT && reg_startTag.matcher(parser.getLocalName()).matches()) {
                                                        startProcess = true
                                                        ignorecontent = false;
                                                } else {
                                                        // still before the start tag: copy and move on
                                                        donothing(event, null);
                                                        continue;
                                                }
                                        }

                                        if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
                                                processWord(); // tokenize now!
                                                buffer.setLength(0);
                                        }
                                        fillInfos(event);//get localname and prefix
                                        if (event == XMLStreamConstants.START_ELEMENT) {
                                                localname = parser.getLocalName()
                                                if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
                                                        wordcount++;
                                                        donothing(event, wordcount);
                                                        // Only insideword is raised here. Raising ignorecontent as well
                                                        // left it stuck after the word closed, so the rest of the text
                                                        // was never tokenized; lowering it at the end tag would instead
                                                        // wrongly re-enable tokenization inside an enclosing note.
                                                        insideword = true;
                                                } else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
                                                        donothing(event, null); // write the tag
                                                        ignorecontent = true; // but do not tokenize its content
                                                } else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
                                                        donothing(event, null); // write the tag
                                                        ignorecontent = true; // but do not tokenize its content
                                                } else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) { // ignore the tag and its content
                                                        goToEndOfElement(localname); // parse until the end of the element is passed
                                                } else {
                                                        donothing(event, null);
                                                }
                                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                                localname = parser.getLocalName()
                                                if (reg_word_tags.matcher(localname).matches()) {
                                                        insideword = false;
                                                        writer.writeEndElement();
                                                        writer.writeCharacters("\n");
                                                } else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
                                                        ignorecontent = false;
                                                        donothing(event, null);
                                                } else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
                                                        ignorecontent = false;
                                                        donothing(event, null);
                                                } else {
                                                        donothing(event, null);
                                                }
                                        } else if (event == XMLStreamConstants.CHARACTERS) {
                                                if (ignorecontent || insideword) {
                                                        // untokenized content or text of a pre-tagged word: copy
                                                        donothing(event, null);
                                                } else {
                                                        buffer.append(parser.getText());
                                                        // flush long runs of text, but only on a space so that
                                                        // no word is cut in two
                                                        if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
                                                                processWord();
                                                                buffer = new StringBuffer();
                                                        }
                                                }
                                        } else if (event == XMLStreamConstants.COMMENT) {
                                                writer.writeComment(parser.getText())
                                        } else if (event == XMLStreamConstants.DTD) {
                                                // the DTD is not copied to the output
                                        } else {
                                                if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
                                        }
                                        previousEvent = event;
                                }

                                // closed here as well so that a failing flush is reported as an error
                                parser.close()
                                writer.close();
                                output.close();
                                inputData.close();
                        } catch (Exception e) {
                                System.err.println("Error : "+infile);
                                e.printStackTrace();
                                return false;
                        }
                        return true;
                } finally {
                        // Release the file handles on every path. Closing an already
                        // closed stream is harmless, failures here are not actionable.
                        if (output != null) {
                                try {
                                        output.close();
                                } catch (IOException ignored) {
                                }
                        }
                        if (inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                }
                        }
                }
        }
460

    
461
        /**
         * Sets the elements that are left out of the output together with
         * their whole content.
         *
         * @param regexp regular expression matched against element local names;
         *               null disables the filter
         */
        public void setOutSideTextTags(String regexp) {
                this.outside_text_tags = regexp;
                // a null expression used to fail in Pattern.compile after the field was already changed
                this.reg_outside_text_tags = (regexp == null) ? null : Pattern.compile(regexp);
        }
470
        
471
        /**
         * Sets the elements whose tags are copied to the output but whose
         * content is not tokenized.
         *
         * @param regexp regular expression matched against element local names;
         *               null disables the filter
         */
        public void setNote(String regexp) {
                this.note_content = regexp;
                // a null expression used to fail in Pattern.compile after the field was already changed
                this.reg_note_content = (regexp == null) ? null : Pattern.compile(regexp);
        }
480
        
481
        /**
         * Sets the elements that are kept in the output, tags and content,
         * without their content being tokenized.
         *
         * @param regexp regular expression matched against element local names;
         *               null disables the filter
         */
        public void setOutSideTextTagsAndKeepContent(String regexp) {
                this.outside_text_tags_keep_content = regexp;
                // a null expression used to fail in Pattern.compile after the field was already changed
                this.reg_outside_text_tags_keep_content = (regexp == null) ? null : Pattern.compile(regexp);
        }
490
        
491
        /**
         * Advances the parser past the end tag of the element it has just
         * entered, writing nothing.
         *
         * Elements opened on the way are remembered so that an inner end tag
         * carrying the same name is not mistaken for the one being looked for.
         *
         * @param name local name of the element to leave
         */
        protected void goToEndOfElement(String name) {
                List<String> nestedNames = new ArrayList<String>()
                for (int code = parser.next(); code != XMLStreamConstants.END_DOCUMENT; code = parser.next()) {
                        if (code == XMLStreamConstants.START_ELEMENT) {
                                nestedNames.add(parser.getLocalName())
                                continue
                        }
                        if (code != XMLStreamConstants.END_ELEMENT) {
                                continue
                        }
                        // nothing left open below us: this end tag closes the skipped element
                        if (nestedNames.isEmpty() && name == parser.getLocalName()) {
                                return
                        }
                        nestedNames.pop()
                }
        }
508
        
509
        // Replacement used by processWord() for every match of regLN.
        public final static String WHITESPACE = " ";
510
        // Replacement used by processWord() for every match of regCTRL, i.e. the match is deleted.
        public final static String EMPTY = "";
511
        /**
         * Tokenizes the text currently held in the buffer.
         *
         * Matches of regLN are replaced by a space and matches of regCTRL are
         * removed; the result is then cut on regSplitWhiteSpaces and every piece
         * is handed to iterate(). Without a split pattern the cleaned text is
         * handed over in one piece. The buffer itself is not modified.
         */
        protected void processWord() {
                String withoutLineBreaks = regLN.matcher(buffer.toString()).replaceAll(WHITESPACE)
                String cleaned = regCTRL.matcher(withoutLineBreaks).replaceAll(EMPTY)

                if (regSplitWhiteSpaces == null) {
                        iterate(cleaned)
                        return
                }

                String[] pieces = regSplitWhiteSpaces.split(cleaned)
                for (int i = 0; i < pieces.length; i++) {
                        iterate(pieces[i])
                }
        }
528

    
529
        /**
         * Runs standardChecks() on a piece of text over and over, each time on
         * the remainder it returned, until nothing is left.
         *
         * @param s the text to tokenize; null or empty does nothing
         * @return always null
         */
        protected iterate(String s) {
                String remaining = s
                while (remaining != null && !remaining.isEmpty()) {
                        remaining = standardChecks(remaining)
                }
        }
541

    
542
        /**
543
         * Standard checks.
544
         *
545
         * @param s the s
546
         * @return the java.lang. object
547
         */
548
        //        @CompileStatic(SKIP)
549
        /**
         * Runs the standard segmentation checks on one chunk of text and writes
         * the resulting word as a <w> element on the output writer.
         *
         * The checks are tried in a fixed order and the first one that applies
         * wins: final clitics, initial clitics, elision, ellipsis ("..."),
         * punctuation, and finally the plain-word fallback. Every branch
         * increments wordcount, writes exactly one <w> element followed by a
         * newline, and may hand other parts of the chunk to iterate().
         *
         * @param s the chunk of text to check
         * @return the part of the chunk that still has to be processed by the
         *         caller: regex group 3 for the ellipsis and punctuation cases,
         *         the empty string in every other case
         */
        public String standardChecks(String s) {
                Matcher found;

                // Final clitic: group 1 goes to iterate() first, then group 2 is
                // written as its own word.
                if (fclitics != null && (found = s =~ regFClitics)) {
                        iterate(found.group(1))

                        wordcount++;
                        writer.writeStartElement("w");
                        writeWordAttributes(); // id and n; no "type" attribute is written
                        writer.writeCharacters(found.group(2));
                        writer.writeEndElement();
                        writer.writeCharacters("\n");

                        return "";
                }

                // Initial clitic: group 1 is written as its own word, then group 2
                // goes to iterate().
                if (pclitics != null && (found = s =~ regPClitics)) {
                        if (DEBUG) println "PCLITIC found: $s ->"+ found

                        wordcount++;
                        writer.writeStartElement("w");
                        writeWordAttributes(); // id and n; no "type" attribute is written
                        writer.writeCharacters(found.group(1));
                        writer.writeEndElement();
                        writer.writeCharacters("\n");

                        iterate(found.group(2))

                        return "";
                }

                // Elision: same shape as the initial-clitic case. Group 1 is written,
                // group 2 goes to iterate().
                if (regElision != null && (found = s =~ regElision)) {
                        wordcount++;
                        writer.writeStartElement("w");
                        writeWordAttributes(); // id and n; no "type" attribute is written
                        writer.writeCharacters(found.group(1));
                        writer.writeEndElement();
                        writer.writeCharacters("\n");

                        iterate(found.group(2))

                        return "";
                }

                // Ellipsis: group 1 goes to iterate(), a literal "..." is written
                // (group 2 itself is not used), and group 3 is returned to the caller.
                if (reg3pts != null && (found = s =~ reg3pts)) {
                        iterate(found.group(1))

                        wordcount++;
                        writer.writeStartElement("w");
                        writeWordAttributes(); // id and n; no "type" attribute is written
                        writer.writeCharacters("...");
                        writer.writeEndElement();
                        writer.writeCharacters("\n");

                        return found.group(3);
                }

                // Punctuation: group 1 goes to iterate(), group 2 is written as a
                // word, and group 3 is returned to the caller.
                if (regPunct != null && (found = s =~ regPunct)) {
                        if(DEBUG){println "PUNCT found: $s ->"+found.group(1)+" + "+found.group(2)+" + "+found.group(3)}
                        iterate(found.group(1));

                        wordcount++;
                        writer.writeStartElement("w");
                        writeWordAttributes(); // id and n; no "type" attribute is written
                        writer.writeCharacters(found.group(2));
                        writer.writeEndElement();
                        writer.writeCharacters("\n");

                        return found.group(3);
                }

                // Fallback: nothing matched, so the whole chunk is written as one word.
                wordcount++;
                writer.writeStartElement("w");
                writeWordAttributes(); // id and n; no "type" attribute is written
                writer.writeCharacters(s);
                writer.writeEndElement();
                writer.writeCharacters("\n");

                return "";
        }
641

    
642
        /**
         * Writes the attributes of the word element currently being written.
         *
         * Two attributes are emitted, both derived from the current value of
         * wordcount: "id" ("w_" + filename + "_" + wordcount) and "n"
         * (wordcount as text). Within this class it is called right after
         * writer.writeStartElement("w").
         *
         * The method is untyped and has no meaningful return value.
         */
        protected writeWordAttributes() {
                String ordinal = "" + wordcount
                writer.writeAttribute("id", "w_" + filename + "_" + ordinal);
                writer.writeAttribute("n", ordinal);
        }
651

    
652
        /**
         * Sets the start tag and recompiles the pattern derived from it.
         *
         * The value is stored in startTag first and then compiled into
         * reg_startTag, so it must be a valid regular expression.
         *
         * @param tag the start tag, used as a regular expression
         */
        public void setStartTag(String tag) {
                this.startTag = tag;
                Pattern compiledTag = Pattern.compile(this.startTag);
                this.reg_startTag = compiledTag;
        }
657

    
658
        /**
         * Splits a string into whitespace-separated chunks.
         *
         * Uses Groovy's no-argument String.tokenize(), which splits on the
         * default java.util.StringTokenizer delimiters (space, tab, newline,
         * carriage return, form feed) and never yields empty tokens.
         *
         * @param str the text to split
         * @return the chunks, in their original order
         */
        public List<String> tokenize(String str) {
                List<String> chunks = str.tokenize()
                return chunks
        }
668

    
669
        /**
         * Developer test entry point.
         *
         * Updates the tokenizer test corpus from SVN, then tokenizes
         * test1.xml (French) into test1-t.xml with DEBUG enabled and prints
         * the result of process().
         *
         * The paths are hard-coded to the SVN working copy under the user's
         * home directory; this is a manual test harness, not a general CLI.
         *
         * @param args the arguments (unused)
         */
        public static void main(String[] args)
        {
                File SVNDIR = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer");

                // Refresh the working copy BEFORE reading the test file.
                // Runtime.exec has no (String, String, String) overload; with
                // Groovy's varargs-style dispatch the extra strings can end up
                // bound to the envp array, so "svn" may run without its arguments.
                // The previous call also did not wait for the process to finish.
                // An explicit argv with waitFor() avoids both problems.
                Process svnUpdate = new ProcessBuilder("svn", "update", SVNDIR.getAbsolutePath()).inheritIO().start();
                int svnStatus = svnUpdate.waitFor();
                if (svnStatus != 0) {
                        // Best effort: keep going with whatever is in the working copy.
                        println "WARNING: 'svn update' exited with status " + svnStatus
                }

                File inputFile = new File(SVNDIR, "test1.xml")
                File outputFile = new File(SVNDIR, "test1-t.xml")
                SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
                tokenizer.DEBUG = true
                tokenizer.setNote("note")
                tokenizer.setOutSideTextTags("outside")
                tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
                println outputFile.toString() + " : "+tokenizer.process();
                // FIXME: tests UTF-8 XML ranges
                //                File inputDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")
                //        File inputFile = new File(inputDir, "problem2.xml");

                // Disabled timing harness, kept for reference.
                /*
                File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
                File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
                File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
                File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
                File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
                outputDir.deleteDir()
                outputDir2.deleteDir()
                outputDir3.deleteDir()
                outputDir.mkdir()
                outputDir2.mkdir()
                outputDir3.mkdir()
                long time;
                for (String lang : ["none"]) {
                        println "LANG: $lang"

                        time = System.currentTimeMillis();
                        File outputFile = new File(outputDir, "${lang}.xml")
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
                        tokenizer.process();
                        println "FAST $lang: "+(System.currentTimeMillis()-time)

//                        time = System.currentTimeMillis();
//                        File outputFile3 = new File(outputDir3, "${lang}.xml")
//                        FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
//                        tokenizer3.process();
//                        println "FAST2 $lang: "+(System.currentTimeMillis()-time)
//
//                        time = System.currentTimeMillis();
//                        File outputFile2 = new File(outputDir2, "${lang}.xml")
//                        OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
//                        tokenizer2.process();
//                        println "SLOW $lang: "+(System.currentTimeMillis()-time)
                }
*/
                println ""
                println "Done"

                // Disabled batch run over a directory of XML files, kept for reference.
                //                String lang = "en"
                //                File inDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant")
                //                File outDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant-t")
                //                outDir.deleteDir()
                //                outDir.mkdir()
                //
                //                println "processing "+inDir.listFiles().size()+" files."
                //                for (def infile : inDir.listFiles()) {
                //                        if (!infile.getName().endsWith(".xml")) continue;
                //
                //                        print "."
                //                        File outfile = new File(outDir, infile.getName())
                //                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
                //                        tokenizer.setDEBUG false
                //                        tokenizer.process();
                //                }
                //                println ""
                //                println "Done"

        }
748
}