Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / FastSimpleTokenizerXml.groovy @ 1688

History | View | Annotate | Download (17.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.filters.Tokeniser;
45

    
46
import static groovy.transform.TypeCheckingMode.SKIP
47
import groovy.transform.CompileStatic
48

    
49
import java.util.regex.Matcher
50
import java.util.regex.Pattern
51

    
52
import javax.xml.stream.*
53

    
54
import org.txm.scripts.importer.graal.PersonalNamespaceContext
55
import org.txm.tokenizer.TokenizerClasses
56
@CompileStatic
57
public class FastSimpleTokenizerXml {
58

    
59
        /** Regex matched against element local names; process() copies matching elements as already-tokenized words. */
60
        String word_tags = TokenizerClasses.word_tags;
61

    
62
        /** Copy of TokenizerClasses.intraword_tags; not referenced by the methods of this class. */
63
        String intraword_tags = TokenizerClasses.intraword_tags;
64

    
65
        /** Copy of TokenizerClasses.punct_strong; not referenced by the methods of this class. */
66
        def punct_strong = TokenizerClasses.punct_strong;
67

    
68
        /** Copy of TokenizerClasses.punct_all; only mentioned in a commented-out line of standardChecks(). */
69
        def punct_all = TokenizerClasses.punct_all;
70

    
71
        /** Copy of TokenizerClasses.word_chars; not referenced by the methods of this class. */
72
        def word_chars = TokenizerClasses.word_chars;
73
        
74
        def fclitics = null; // set by the constructor for "en", "fr" and "gl"; null (default) disables the clitic rule of standardChecks()
75
        def pclitics = null; // set by the constructor for "fr" and "it"; not read anywhere else in this class
76

    
77
        /** Regex matched against element local names; process() copies the content of matching elements without tokenizing it. */
78
        String ignorable_tags = "";
79
        String startTag = null // regex for the element local name where tokenization starts; null means from the first event
80

    
81
        /** When true, trace messages are printed while tokenizing. */
82
        boolean DEBUG = false;
83

    
84
        /** The file the tokenized XML is written to. */
85
        File outfile;
86

    
87
        /** The XML file that is read. */
88
        File infile;
89
        
90
        String lang; // language code received by the constructor
91

    
92
        /** Character data collected by process() and consumed by processWord(). */
93
        StringBuffer buffer;
94

    
95
        /** StAX writer producing the output document; created in process(). */
96
        XMLStreamWriter writer;
97
        BufferedOutputStream output; // stream on outfile that the writer writes to
98
        
99
        /** StAX reader over infile; created in process(). */
100
        XMLStreamReader parser
101

    
102
        /** Local name of the element last seen by fillInfos() or donothing(). */
103
        String localname;
104

    
105
        /** Prefix of the element last seen by fillInfos(). */
106
        String prefix;
107
        String filename; // name of infile without its last extension; part of the generated word ids
108
        
109
        def regElision; // built in the constructor; standardChecks() uses it to cut a token into before / match / after
110
        def reg3pts; // built in the constructor; matches a token containing "..."
111
        def regPunct; // built in the constructor from TokenizerClasses.regPunct
112
        def regFClitics = null ; // built in the constructor only when fclitics is set
113
        String whitespaces; // set in the constructor from TokenizerClasses.whitespaces; only mentioned in a commented-out line
114
        Pattern regLN; // every match is replaced by a space in processWord()
115
        Pattern regCTRL; // every match is removed in processWord()
116
        Pattern regSplitWhiteSpaces; // processWord() splits the text on this pattern
117
        
118
        /**
         * Creates a tokenizer that uses an empty language code.
         *
         * Delegates to the three-argument constructor with "" as language.
         *
         * @param infile the XML file to read
         * @param outfile the file the tokenized XML is written to
         */
        public FastSimpleTokenizerXml(File infile, File outfile) {
                this(infile, outfile, "")
        }
122
        
123
        /**
         * Creates a tokenizer for one XML file and prepares its regular
         * expressions.
         *
         * The language code selects the clitic patterns copied from
         * TokenizerClasses: a code starting with "en" or "gl" sets fclitics,
         * "fr" sets fclitics and pclitics, "it" sets pclitics. Any other
         * value, including null, leaves both unset.
         *
         * @param infile the XML file to read; its name without the last
         *               extension is used in generated word ids
         * @param outfile the file the tokenized XML is written to
         * @param lang the language code, may be null
         */
        public FastSimpleTokenizerXml(File infile, File outfile, String lang) {
                this.lang = lang;
                if (lang != null) {
                        if (lang.startsWith("en")) {
                                fclitics = TokenizerClasses.FClitic_en
                        } else if (lang.startsWith("fr")) {
                                fclitics = TokenizerClasses.FClitic_fr
                                pclitics = TokenizerClasses.PClitic_fr
                        } else if (lang.startsWith("gl")) {
                                fclitics = TokenizerClasses.FClitic_gl
                        } else if (lang.startsWith("it")) {
                                pclitics = TokenizerClasses.PClitic_it
                        }
                }

                this.outfile = outfile;
                this.infile = infile;
                this.filename = infile.getName();
                // index > 0 keeps names such as ".hidden" intact
                int index = filename.lastIndexOf(".");
                if (index > 0) filename = filename.substring(0, index);

                String strRegElision = TokenizerClasses.regElision;
                String strRegPunct = TokenizerClasses.regPunct;
                regElision = ~/\A(.*?)(\p{L}++$strRegElision)(.*)\Z/
                reg3pts = ~/\A(.*)(\.\.\.)(.*)\Z/
                regPunct = ~/\A(.*)($strRegPunct)(.*)\Z/
                if (fclitics != null) {
                        regFClitics = ~/(.+)($fclitics)/
                }
                whitespaces = ~TokenizerClasses.whitespaces;

                regSplitWhiteSpaces = Pattern.compile(TokenizerClasses.whitespaces);
                // Pattern.compile() takes a plain regex: the former "/\n/" and
                // "/\\p{C}/" contained literal slashes and never matched.
                // Tabs and carriage returns are turned into spaces together with
                // line feeds so that the control-character removal below cannot
                // glue together two words that were separated by one of them.
                regLN = Pattern.compile("[\\n\\r\\t]");
                regCTRL = Pattern.compile("\\p{C}");
        }
164

    
165
        /**
         * Remembers the name of the element the parser is positioned on.
         *
         * For START_ELEMENT and END_ELEMENT events the local name and the
         * prefix reported by the parser are stored in the localname and
         * prefix fields. Every other event leaves both fields untouched.
         *
         * @param event the StAX event code of the current parser position
         * @return no meaningful value; process() discards it
         */
        public fillInfos(int event) {
                boolean onElementBoundary = (XMLStreamConstants.START_ELEMENT == event) || (XMLStreamConstants.END_ELEMENT == event)
                if (onElementBoundary) {
                        localname = parser.getLocalName()
                        prefix = parser.getPrefix()
                }
        }
179

    
180
        /**
         * Copies the current parser event to the output without tokenizing it.
         *
         * START_ELEMENT: the element is re-emitted with its namespace
         * declarations and attributes. A "w" element additionally receives
         * the attributes it lacks: "id" (only when wordid is not null),
         * "type" and "n", the last two derived from the wordcount field.
         * END_ELEMENT: the currently open element is closed.
         * CHARACTERS: the text is written as is, except inside a pre-tagged
         * word where it is trimmed and its line feeds and tabs become spaces.
         * A line feed is written after every start and end tag. All other
         * events are ignored.
         *
         * @param event the StAX event code of the current parser position
         * @param wordid non-null when the element is a pre-tagged word that
         *               may need a generated id
         * @return no meaningful value; callers in this class discard it
         */
        public donothing(int event, Integer wordid) {
                if (event == XMLStreamConstants.START_ELEMENT) {
                        // Read both name parts from the parser itself: process() also
                        // calls this method before fillInfos() has run, in which case
                        // the prefix field would be null or belong to another element.
                        localname = parser.getLocalName();
                        prefix = parser.getPrefix();
                        if (prefix != null && prefix.length() > 0) {
                                writer.writeStartElement(prefix+":"+localname);
                        } else {
                                writer.writeStartElement(localname);
                        }

                        for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                                String namespace_prefix = parser.getNamespacePrefix(i);
                                if (namespace_prefix != null && namespace_prefix.length() > 0) {
                                        writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
                                } else {
                                        writer.writeDefaultNamespace(parser.getNamespaceURI(i));
                                }
                        }

                        boolean hasId = false;
                        boolean hasType = false;
                        boolean hasN = false;
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                String attname = parser.getAttributeLocalName(i);
                                String attrprefix = parser.getAttributePrefix(i);
                                if ("id".equals(attname)) hasId = true;
                                if ("type".equals(attname)) hasType = true;
                                if ("n".equals(attname)) hasN = true;

                                if (attrprefix != null && attrprefix.length() > 0) {
                                        writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i));
                                } else {
                                        writer.writeAttribute(attname, parser.getAttributeValue(i));
                                }
                        }

                        // complete pre-tagged words with the attributes generated words get
                        boolean isWord = "w".equals(localname);
                        if (wordid != null && !hasId && isWord) {
                                writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
                        }
                        if (!hasType && isWord) {
                                writer.writeAttribute("type", "w");
                        }
                        if (!hasN && isWord) {
                                writer.writeAttribute("n", ""+wordcount);
                        }

                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                        writer.writeEndElement();
                        writer.writeCharacters("\n");
                } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (insideword) { // ensure there is not \t or \n in the word form value
                                writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
                        } else {
                                writer.writeCharacters(parser.getText());
                        }
                }
        }
282
        
283
        
284
        /**
         * Combines a UTF-16 surrogate pair into the code point it encodes.
         *
         * No validation is done: the arithmetic is applied to whatever two
         * chars are given.
         *
         * @param highSurrogate the leading char of the pair (0xD800..0xDBFF for a valid pair)
         * @param lowSurrogate the trailing char of the pair (0xDC00..0xDFFF for a valid pair)
         * @return 0x10000 + (highSurrogate - 0xD800) * 0x400 + (lowSurrogate - 0xDC00)
         */
        public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate) {
                int leadOffset = highSurrogate - 0xD800
                int trailOffset = lowSurrogate - 0xDC00
                return (leadOffset * 0x400) + trailOffset + 0x10000
        }
293
        
294
        /**
         * Tells whether a char is a high (leading) UTF-16 surrogate.
         *
         * @param character the char to test
         * @return true when the char lies in the range 0xD800..0xDBFF
         */
        public boolean isHighSurrogate(char character) {
                // the JDK check covers exactly the range 0xD800..0xDBFF
                return Character.isHighSurrogate(character)
        }
302
        
303
        
304

    
305
        /** Number of words seen so far; incremented for every pre-tagged and every generated word and used for the "id" and "n" attributes. */
306
        int wordcount = 0;
307

    
308
        /** When true, process() copies character data without tokenizing it. */
309
        boolean ignorecontent = true;//original note: tokenize starting from <body>; process() resets this flag before reading
310
        boolean insideword = false; // true while process() is inside an element matching word_tags
311
        /**
         * Reads infile, tokenizes its text and writes the result to outfile.
         *
         * Events found before the first element whose local name matches
         * startTag are copied unchanged (when startTag is null tokenization
         * starts immediately). After that point:
         * - elements matching word_tags are counted and copied as words,
         *   their text is not tokenized;
         * - the content of elements matching ignorable_tags is copied
         *   without tokenization;
         * - other character data is buffered and handed to processWord()
         *   when a non-character event follows, or as soon as the buffer
         *   holds at least 128 chars and ends with a space;
         * - comments are copied, DTD events are dropped, any other event is
         *   ignored (a warning is printed when DEBUG is set).
         *
         * All streams are released before the method returns, whatever the
         * outcome. A failure while closing the output on the normal path is
         * reported as a failure of the whole run.
         *
         * @return true if the whole document was processed and written,
         *         false if infile does not exist or an error occurred while
         *         reading or writing
         */
        public boolean process() {
                if (!infile.exists()) {
                        println "$infile does not exists"
                        return false;
                }

                // forget handles of a previous run so that the cleanup below
                // only ever touches resources opened by this call
                writer = null;
                output = null;
                parser = null;
                InputStream inputData = null;
                boolean closedNormally = false;
                try {
                        XMLOutputFactory factory = XMLOutputFactory.newInstance();
                        output = new BufferedOutputStream(new FileOutputStream(outfile));
                        writer = factory.createXMLStreamWriter(output, "UTF-8")
                        writer.setNamespaceContext(new PersonalNamespaceContext());

                        inputData = infile.toURI().toURL().openStream();
                        XMLInputFactory inputfactory = XMLInputFactory.newInstance();
                        parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
                        writer.writeStartDocument("UTF-8","1.0");
                        writer.writeCharacters("\n");

                        int previousEvent = 0;
                        // if no startTag is specified we process from the start
                        boolean startProcess = (startTag == null);
                        buffer = new StringBuffer();
                        ignorecontent = !startProcess;
                        try {
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (!startProcess) {
                                                if (event == XMLStreamConstants.START_ELEMENT) {
                                                        if (parser.getLocalName().matches(startTag)) {
                                                                startProcess = true
                                                                ignorecontent = false;
                                                        }
                                                }
                                                if (!startProcess) {
                                                        donothing(event, null);
                                                        continue;
                                                }
                                        }

                                        // a run of character events just ended: tokenize what was collected
                                        if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
                                                processWord();
                                                buffer = new StringBuffer();
                                        }
                                        fillInfos(event);//get localname and prefix
                                        if (event == XMLStreamConstants.START_ELEMENT) {
                                                if (localname.matches(word_tags)) { // ignore the content of the word but keep counting
                                                        wordcount++;
                                                        donothing(event, wordcount);
                                                        ignorecontent = true;
                                                        insideword = true;
                                                } else if (localname.matches(ignorable_tags)) { // ignore the content of the element
                                                        donothing(event, null);
                                                        ignorecontent = true;
                                                } else {
                                                        donothing(event, null);
                                                }
                                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                                if (localname.matches(word_tags)) {
                                                        ignorecontent = false;
                                                        insideword = false;
                                                        writer.writeEndElement();
                                                        writer.writeCharacters("\n");
                                                } else if (localname.matches(ignorable_tags)) {
                                                        ignorecontent = false;
                                                        donothing(event, null);
                                                } else {
                                                        donothing(event, null);
                                                }
                                        } else if (event == XMLStreamConstants.CHARACTERS) {
                                                if (ignorecontent) {
                                                        donothing(event, null);
                                                } else {
                                                        buffer.append(parser.getText());
                                                        // flush early, but only at a space so that no word is cut in two
                                                        if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
                                                                processWord();
                                                                buffer = new StringBuffer();
                                                        }
                                                }
                                        } else if (event == XMLStreamConstants.COMMENT) {
                                                writer.writeComment(parser.getText())
                                        } else if (event == XMLStreamConstants.DTD) {
                                                // the DTD is not copied to the output
                                        } else {
                                                if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
                                        }
                                        previousEvent = event;
                                }

                                // Closing stays inside the try block: a failure to flush
                                // the output must make the run fail.
                                parser.close()
                                writer.close();
                                output.close();
                                inputData.close();
                                closedNormally = true;
                        } catch (Exception e) {
                                System.err.println("Error : "+infile);
                                e.printStackTrace();
                                return false;
                        }
                        return true;
                } finally {
                        if (!closedNormally) {
                                releaseQuietly(inputData);
                        }
                }
        }

        /**
         * Closes the writer, the output stream, the parser and the given
         * input stream, ignoring null handles and any error raised by a close.
         *
         * @param inputData the stream opened on infile, may be null
         */
        private void releaseQuietly(InputStream inputData) {
                try {
                        if (writer != null) writer.close();
                } catch (Exception ignored) {
                        // already on a failure path, nothing more to report
                }
                try {
                        if (output != null) output.close();
                } catch (Exception ignored) {
                        // already on a failure path, nothing more to report
                }
                try {
                        if (parser != null) parser.close();
                } catch (Exception ignored) {
                        // already on a failure path, nothing more to report
                }
                try {
                        if (inputData != null) inputData.close();
                } catch (Exception ignored) {
                        // already on a failure path, nothing more to report
                }
        }
436

    
437

    
438
        /**
         * Tokenizes the text currently held in the buffer.
         *
         * Every match of regLN is replaced by a space, every match of
         * regCTRL is removed, the result is split with regSplitWhiteSpaces
         * and each piece is handed to iterate(). The buffer itself is not
         * cleared here; process() replaces it after the call.
         */
        protected void processWord() {
                String text = buffer.toString()
                if (DEBUG) println "-- chars: "+text+"--";

                String cleaned = regCTRL.matcher(regLN.matcher(text).replaceAll(" ")).replaceAll("")
                String[] pieces = regSplitWhiteSpaces.split(cleaned)
                for (int i = 0; i < pieces.length; i++) {
                        String s = pieces[i]
                        if (DEBUG){println "process $s"}
                        iterate(s)
                }
        }
455

    
456
        /**
         * Writes all the words contained in one whitespace-free piece of text.
         *
         * The piece is passed to standardChecks(), then whatever
         * standardChecks() returns is passed again, until the result is null
         * or empty.
         *
         * @param s the text to cut into words, may be null or empty
         * @return no meaningful value; callers in this class discard it
         */
        protected iterate(String s) {
                for (; s != null && !s.isEmpty(); s = standardChecks(s)) {
                        if (DEBUG){println "  > $s"}
                }
        }
469

    
470

    
471
        /**
         * Writes the first word found in a piece of text and returns what
         * remains to be processed.
         *
         * The rules are tried in this order and the first that matches wins:
         * clitic (only when fclitics is set), elision, "...", punctuation,
         * and finally "the whole piece is one word". For every rule except
         * the last, the text placed before the match is written first through
         * iterate(), then the matched part is written as one word, and the
         * text placed after the match is returned.
         *
         * @param s the text to examine, without whitespace
         * @return the text that follows the word just written; an empty
         *         string when nothing is left
         */
        protected String standardChecks(String s) {
                Matcher m;
                if (fclitics != null && (m = s =~ regFClitics) ) {
                        // The part before the clitic may itself hold several words:
                        // iterate() writes them all, where a single standardChecks()
                        // call would drop everything after the first one.
                        iterate(m.group(1));
                        writeGeneratedWord("w", m.group(2));
                        // regFClitics is not anchored: keep what follows the clitic
                        return s.substring(m.end());
                } else if ((m = s =~ regElision) ) {
                        iterate(m.group(1));
                        writeGeneratedWord("w", m.group(2));
                        return m.group(3);
                } else if ((m = s =~ reg3pts) ) {
                        iterate(m.group(1));
                        writeGeneratedWord("pon", "...");
                        return m.group(3);
                } else if ((m = s =~ regPunct) ) {
                        iterate(m.group(1));
                        writeGeneratedWord("pon", m.group(2));
                        return m.group(3);
                } else {
                        if(DEBUG){println "Other : "+s}
                        // a piece made of one punctuation character is typed "pon"
                        String type = s.matches(/\p{P}/) ? "pon" : "w";
                        writeGeneratedWord(type, s);
                        return "";
                }
        }

        /**
         * Counts and writes one "w" element followed by a line feed.
         *
         * @param type the value of the "type" attribute ("w" or "pon")
         * @param form the text content of the element
         */
        private void writeGeneratedWord(String type, String form) {
                wordcount++;
                writer.writeStartElement("w");
                writeWordAttributes();// id and n
                writer.writeAttribute("type", type);
                writer.writeCharacters(form);
                writer.writeEndElement();
                writer.writeCharacters("\n");
        }
560

    
561
        /**
         * Writes the "id" and "n" attributes of the word element being
         * written.
         *
         * The id is "w_" + filename + "_" + wordcount and n is wordcount.
         *
         * @return no meaningful value; callers in this class discard it
         */
        protected writeWordAttributes() {
                String ordinal = String.valueOf(wordcount)
                writer.writeAttribute("id", "w_" + filename + "_" + ordinal)
                writer.writeAttribute("n", ordinal)
        }
571

    
572
        /**
         * Sets the element from which process() starts tokenizing.
         *
         * @param tag regex matched against element local names; null makes
         *            process() tokenize from the beginning of the document
         */
        public void setStartTag(String tag) {
                startTag = tag
        }
576

    
577
        /**
         * Cuts a string on whitespace.
         *
         * Uses the default delimiters of StringTokenizer (space, tab, line
         * feed, carriage return, form feed); empty pieces are never returned.
         *
         * @param str the string to cut, must not be null
         * @return the pieces, in order; empty when str holds no token
         */
        public List<String> tokenize(String str) {
                List<String> pieces = new ArrayList<String>()
                StringTokenizer cutter = new StringTokenizer(str)
                while (cutter.hasMoreTokens()) {
                        pieces.add(cutter.nextToken())
                }
                return pieces
        }
587

    
588
        /**
         * Manual test entry point: tokenizes one XML file once per test
         * language ("none", "en", "fr" and null) and writes the results as
         * test-&lt;lang&gt;.xml next to the input file.
         *
         * @param args optional; args[0] is the path of the XML file to
         *             tokenize. Without argument the historical test file
         *             of the original developer is used.
         */
        public static void main(String[] args) {
                // FIXME: tests UTF-8 XML ranges
                File inputFile;
                if (args != null && args.length > 0) {
                        // absolute form so that the parent directory is never null
                        inputFile = new File(args[0]).getAbsoluteFile();
                } else {
                        File defaultDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")
                        inputFile = new File(defaultDir, "problem2.xml");
                }
                File inputDir = inputFile.getParentFile();

                int failures = 0;
                for (String lang : ["none", "en", "fr", null]) {
                        print "."
                        File outputFile = new File(inputDir, "test-${lang}.xml")
                        FastSimpleTokenizerXml tokenizer = new FastSimpleTokenizerXml(inputFile, outputFile, lang)
                        if (!tokenizer.process()) {
                                failures++;
                        }
                }

                println ""
                if (failures > 0) {
                        System.err.println(failures + " run(s) failed for " + inputFile);
                }
                println "Done"
        }
638
}