Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / OldSimpleTokenizerXml.groovy @ 1000

History | View | Annotate | Download (17.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.filters.Tokeniser;
45

    
46
import javax.xml.parsers.DocumentBuilder;
47
import javax.xml.parsers.DocumentBuilderFactory;
48
import javax.xml.parsers.ParserConfigurationException;
49
import javax.xml.transform.OutputKeys;
50
import javax.xml.transform.Result;
51
import javax.xml.transform.Source;
52
import javax.xml.transform.Transformer;
53
import javax.xml.transform.TransformerFactory;
54
import javax.xml.transform.dom.DOMSource;
55
import javax.xml.transform.stream.StreamResult;
56

    
57
import org.txm.scripts.importer.graal.PersonalNamespaceContext;
58
import org.txm.objects.Base;
59
import org.w3c.dom.Document;
60
import org.w3c.dom.Element;
61
import org.w3c.dom.NodeList;
62
import org.xml.sax.SAXException;
63

    
64
import java.io.File;
65
import java.io.FileInputStream;
66
import java.io.InputStreamReader;
67

    
68
import javax.xml.stream.*;
69

    
70
import java.net.URL;
71

    
72
import org.txm.tokenizer.TokenizerClasses;
73

    
74
/**
 * Legacy StAX tokenizer: copies an XML file from {@code infile} to
 * {@code outfile} and wraps every token found in the character data in a
 * {@code <w>} element carrying {@code id}, {@code n} and {@code type}
 * attributes.
 *
 * Elements whose local name matches {@code word_tags} are treated as words
 * that are already tagged: they are counted and copied, and their content is
 * not tokenized again.
 *
 * All regular expressions and tag classes come from {@link TokenizerClasses}.
 */
public class OldSimpleTokenizerXml {

    /** Regexp of element names that are already words (content is not re-tokenized). */
    String word_tags = TokenizerClasses.word_tags;

    /** The intraword_tags (read from TokenizerClasses; not used by this class). */
    String intraword_tags = TokenizerClasses.intraword_tags;

    /** The punct_strong (read from TokenizerClasses; not used by this class). */
    def punct_strong = TokenizerClasses.punct_strong;

    /** The punct_all (read from TokenizerClasses; not used by this class). */
    def punct_all = TokenizerClasses.punct_all;

    /** The word_chars (read from TokenizerClasses; not used by this class). */
    def word_chars = TokenizerClasses.word_chars;

    /** Clitic expression selected from the language; null means clitics are not managed. */
    def fclitics = null;
    /** Proclitic expression selected from the language; null means clitics are not managed. */
    def pclitics = null;

    /** Regexp of element names that are copied as-is without tokenizing their content. */
    String ignorable_tags
    /** Regexp of element names whose tags are dropped but whose content is kept. */
    String outside_text_tags_keep_content
    /** Regexp of element names that are dropped together with their content. */
    String outside_text_tags
    /** Regexp of the element name from which tokenization starts; null means from the start. */
    String startTag = null

    /** When true, traces of the tokenization are printed on standard output. */
    public boolean DEBUG = false;

    /** The outfile. */
    File outfile;

    /** The infile. */
    File infile;

    /** Language code given to the constructor. */
    String lang;

    /** Accumulates the character data that still has to be tokenized. */
    StringBuffer buffer;

    /** The writer. */
    XMLStreamWriter writer;
    /** Byte stream under {@link #writer}; it has to be closed separately from the writer. */
    BufferedOutputStream output;

    /** The parser. */
    XMLStreamReader parser

    /** Local name of the element the parser last reported. */
    String localname;

    /** Prefix of the element the parser last reported. */
    String prefix;
    /** Name of the input file without its extension; used to build word ids. */
    String filename;

    def regElision;
    def reg3pts;
    def regPunct;
    def regFClitics = null;
    def regPClitics = null;
    /** Regexp used to split the text into candidate tokens. */
    String whitespaces;

    /** Number of words written or seen so far; also the source of the id and n attributes. */
    int wordcount = 0;

    /**
     * True while character data must be copied verbatim instead of tokenized.
     * Original note: tokenize from &lt;body&gt; onwards; process() resets this
     * flag according to startTag.
     */
    boolean ignorecontent = true;
    /** True while the parser is inside an element matching {@link #word_tags}. */
    boolean insideword = false;

    /**
     * Instantiates a new simple tokenizer xml with an empty language code,
     * i.e. without clitic management.
     *
     * @param infile the infile
     * @param outfile the outfile
     */
    public OldSimpleTokenizerXml(File infile, File outfile)
    {
        this(infile, outfile, "");
    }

    /**
     * Instantiates a new simple tokenizer xml.
     *
     * @param infile the infile
     * @param outfile the outfile
     * @param lang language code; codes starting with "en", "fr", "gl" or "it"
     *             select clitic expressions, any other value (or null) selects none
     */
    public OldSimpleTokenizerXml(File infile, File outfile, String lang)
    {
        this.lang = lang;
        if (lang != null) {
            if (lang.startsWith("en")) {
                fclitics = TokenizerClasses.FClitic_en
            } else if (lang.startsWith("fr")) {
                fclitics = TokenizerClasses.FClitic_fr
                pclitics = TokenizerClasses.PClitic_fr
            } else if (lang.startsWith("gl")) {
                fclitics = TokenizerClasses.FClitic_gl
            } else if (lang.startsWith("it")) {
                pclitics = TokenizerClasses.PClitic_it
            }
        }

        this.outfile = outfile;
        this.infile = infile;
        this.filename = infile.getName();
        int index = filename.lastIndexOf(".");
        if (index > 0) filename = filename.substring(0, index);

        String strRegElision = TokenizerClasses.regElision;
        String strRegPunct = TokenizerClasses.regPunct;
        regElision = /\A(.*?)(\p{L}++$strRegElision)(.*)\Z/
        reg3pts = /\A(.*)(\.\.\.)(.*)\Z/
        regPunct = /\A(.*)($strRegPunct)(.*)\Z/
        if (fclitics != null)
            regFClitics = /(.+)($fclitics)(.*)/
        if (pclitics != null)
            regPClitics = /(.+)($pclitics)(.*)/ // the '$' was missing: the pattern matched the literal text "pclitics"
        whitespaces = TokenizerClasses.whitespaces;
    }

    /**
     * Stores the local name and prefix of the current element when the event
     * is an element start or end; does nothing for other events.
     *
     * @param event the current StAX event code
     */
    public fillInfos(int event)
    {
        if (event == XMLStreamConstants.START_ELEMENT || event == XMLStreamConstants.END_ELEMENT) {
            localname = parser.getLocalName();
            prefix = parser.getPrefix();
        }
    }

    /**
     * Copies the current parser event to the writer without tokenizing it.
     * Only element starts, element ends and character data are handled; any
     * other event is silently skipped.
     *
     * For a start element named "w", the attributes id, type and n are added
     * when the source element does not already carry them.
     *
     * @param event the current StAX event code
     * @param wordid non-null to allow the generation of a missing id on a
     *               "w" element; the id value itself is built from wordcount
     */
    public donothing(int event, Integer wordid)
    {
        if (event == XMLStreamConstants.START_ELEMENT) {
            // Read both values from the parser: process() calls this method
            // without fillInfos() while it is still looking for the start tag.
            localname = parser.getLocalName();
            prefix = parser.getPrefix();
            if (prefix != null && prefix.length() > 0)
                writer.writeStartElement(prefix+":"+localname);
            else
                writer.writeStartElement(localname);

            // copy the namespace declarations of the element
            for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                String namespace_prefix = parser.getNamespacePrefix(i);
                if ((namespace_prefix != null) && (namespace_prefix.length() > 0)) {
                    writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
                } else {
                    writer.writeDefaultNamespace(parser.getNamespaceURI(i));
                }
            }

            // copy the attributes and remember which of id/type/n are present
            boolean hasId = false;
            boolean hasType = false;
            boolean hasN = false;
            for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                String attname = parser.getAttributeLocalName(i);
                String attrprefix = parser.getAttributePrefix(i);
                if ("id".equals(attname)) hasId = true;
                if ("type".equals(attname)) hasType = true;
                if ("n".equals(attname)) hasN = true;

                if (attrprefix != null && attrprefix.length() > 0)
                    writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
                else
                    writer.writeAttribute(attname, parser.getAttributeValue(i))
            }

            boolean isWord = (localname == "w");
            if (isWord && wordid != null && !hasId)
                writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
            if (isWord && !hasType)
                writer.writeAttribute("type", "w");
            if (isWord && !hasN)
                writer.writeAttribute("n", ""+wordcount);

            writer.writeCharacters("\n");
        } else if (event == XMLStreamConstants.END_ELEMENT) {
            writer.writeEndElement();
            writer.writeCharacters("\n");
        } else if (event == XMLStreamConstants.CHARACTERS) {
            if (insideword) { // ensure there is not \t or \n in the word form value
                writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
            } else {
                writer.writeCharacters(parser.getText());
            }
        }
    }

    /**
     * Tokenizes {@code infile} into {@code outfile}.
     *
     * Errors raised while the streams are being opened propagate to the
     * caller; errors raised while the document is processed are reported on
     * standard error and turn the result into false. In every failure case
     * the streams opened by this call are closed.
     *
     * @return true, if successful
     */
    public boolean process()
    {
        if (!infile.exists()) {
            println "$infile does not exists"
            return false;
        }

        def inputData = null;
        boolean done = false;
        try {
            XMLOutputFactory factory = XMLOutputFactory.newInstance();
            output = new BufferedOutputStream(new FileOutputStream(outfile))
            writer = factory.createXMLStreamWriter(output, "UTF-8")
            writer.setNamespaceContext(new PersonalNamespaceContext());

            inputData = infile.toURI().toURL().openStream();
            def inputfactory = XMLInputFactory.newInstance();
            parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
            writer.writeStartDocument("UTF-8","1.0");
            writer.writeCharacters("\n");

            try {
                runEventLoop();

                parser.close()
                writer.close();
                output.close();
                inputData.close();
                done = true;
            } catch (Exception e) {
                System.err.println("Error : "+infile);
                e.printStackTrace();
            }
        } finally {
            if (!done) {
                // Failure path: release whatever was opened. XMLStreamWriter.close()
                // does not close the underlying stream, hence the separate calls.
                closeQuietly(parser);
                closeQuietly(writer);
                closeQuietly(output);
                closeQuietly(inputData);
            }
        }
        return done;
    }

    /** Reads the parser up to the end of the document and dispatches each event. */
    private void runEventLoop()
    {
        int previousEvent = 0;
        boolean startProcess = (startTag == null); // without a start tag, tokenize from the beginning
        buffer = new StringBuffer();
        ignorecontent = !startProcess;

        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (!startProcess) {
                if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName().matches(startTag)) {
                    startProcess = true
                    ignorecontent = false;
                } else {
                    // still before the start tag: plain copy
                    donothing(event, null);
                    continue;
                }
            }

            // a run of character events just ended: tokenize what was buffered
            if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
                processWord();
                buffer = new StringBuffer();
            }

            fillInfos(event); // get localname and prefix
            if (event == XMLStreamConstants.START_ELEMENT) {
                handleStartElement(event);
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                handleEndElement(event);
            } else if (event == XMLStreamConstants.CHARACTERS) {
                if (ignorecontent) {
                    donothing(event, null);
                } else {
                    buffer.append(parser.getText());
                    // flush early, but only on a space so that no token is cut in two
                    if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
                        processWord();
                        buffer = new StringBuffer();
                    }
                }
            } else if (event == XMLStreamConstants.COMMENT) {
                writer.writeComment(parser.getText())
            } else if (event == XMLStreamConstants.DTD) {
                // the DTD is not copied
            } else {
                if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
            }
            previousEvent = event;
        }
    }

    /** Applies the tag classes (word, keep-content, ignorable, outside-text) to an element start. */
    private void handleStartElement(int event)
    {
        if (localname.matches(word_tags)) {
            // pre-tagged word: keep counting but do not tokenize its content
            wordcount++;
            donothing(event, wordcount);
            ignorecontent = true;
            insideword = true;
        } else if (outside_text_tags_keep_content != null && localname.matches(outside_text_tags_keep_content)) {
            // the tag itself is dropped, its content is processed normally
        } else if (ignorable_tags != null && localname.matches(ignorable_tags)) {
            // the tag is copied and its content is not tokenized
            donothing(event, null);
            ignorecontent = true;
        } else if (outside_text_tags != null && localname.matches(outside_text_tags)) {
            // the tag and all of its content are dropped
            goToEndOfElement(localname);
        } else {
            donothing(event, null);
        }
    }

    /** Applies the tag classes (word, ignorable, keep-content) to an element end. */
    private void handleEndElement(int event)
    {
        if (localname.matches(word_tags)) {
            ignorecontent = false;
            insideword = false;
            writer.writeEndElement();
            writer.writeCharacters("\n");
        } else if (ignorable_tags != null && localname.matches(ignorable_tags)) {
            ignorecontent = false;
            donothing(event, null);
        } else if (outside_text_tags_keep_content != null && localname.matches(outside_text_tags_keep_content)) {
            // the start tag was dropped, so the end tag is dropped too
        } else {
            donothing(event, null);
        }
    }

    /** Closes a stream, reader or writer, ignoring a null argument and any error raised by close(). */
    private static void closeQuietly(def resource)
    {
        if (resource == null) return;
        try {
            resource.close();
        } catch (Exception ignored) {
            // best effort: the failure that led here has already been reported
        }
    }

    /**
     * Sets the regexp of the elements that are removed together with their content.
     *
     * @param regexp regexp matched against element local names
     */
    public void setOutSideElements(String regexp) {
        this.outside_text_tags = regexp;
    }

    /**
     * Sets the regexp of the elements whose tags are removed but whose content is kept.
     *
     * @param regexp regexp matched against element local names
     */
    public void setOutSideTextTags(String regexp) {
        this.outside_text_tags_keep_content = regexp;
    }

    /**
     * Consumes parser events up to and including the end tag that closes the
     * element whose start tag was just read. Nothing is written.
     *
     * @param name local name of the element being skipped
     */
    protected void goToEndOfElement(String name) {
        int depth = 0; // number of nested elements currently open inside the skipped one
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                depth++;
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                if (depth == 0 && name == parser.getLocalName()) {
                    return;
                }
                depth--;
            }
        }
    }

    /**
     * Tokenizes the content of {@link #buffer}: line breaks become spaces,
     * characters of the Unicode category C are removed, and the text is split
     * on {@link #whitespaces}; each piece is then handed to iterate().
     */
    protected void processWord()
    {
        String text = buffer.toString();
        if (DEBUG) println "-- chars: "+text+"--";
        text = text.replaceAll("\n", " ");
        text = text.replaceAll("\\p{C}", "");
        for (String s : text.split(whitespaces)) {
            if (DEBUG){println "process $s"}
            iterate(s);
        }
    }

    /**
     * Applies standardChecks() repeatedly until the whole string is consumed.
     *
     * @param s a whitespace-free piece of text; null or empty does nothing
     */
    protected iterate(String s)
    {
        while (s != null && s.length() > 0) {
            if (DEBUG){println "  > $s"}
            s = standardChecks(s);
        }
    }

    /**
     * Writes the next word found in {@code s}. The rules are tried in this
     * order: clitic, elision, "...", punctuation, plain word. When a rule
     * splits the string, the part before the match is tokenized first, the
     * match is written, and the part after the match is returned.
     *
     * NOTE(review): regPClitics is built by the constructor but is not applied
     * here, so proclitics are never split. Confirm whether this is intended.
     *
     * @param s the text to tokenize
     * @return the text that remains to be tokenized, "" when nothing is left
     */
    protected standardChecks(String s)
    {
        def m;
        if (fclitics != null && regFClitics != null && (m = s =~ regFClitics)) {
            def g = m[0];
            if (DEBUG) println "CLITIC found: $s ->"+ g[1]+" + "+g[2]+" + "+g[3]
            if (g[1] != null && g[1].length() > 0) iterate(g[1]); // process first part of the string
            writeWord(g[2], "w");
            return g[3];
        } else if ((m = s =~ regElision)) {
            def g = m[0];
            if (DEBUG) println "Elision found: $s ->"+ g[1]+" + "+g[2]+" + "+g[3]
            iterate(g[1])
            writeWord(g[2], "w");
            return g[3];
        } else if ((m = s =~ reg3pts)) {
            def g = m[0];
            if(DEBUG){println "REG '...' found: $s -> "+g[1]+" + "+g[2]+" + "+g[3]}
            iterate(g[1])
            writeWord("...", "pon");
            return g[3];
        } else if ((m = s =~ regPunct)) {
            def g = m[0];
            if(DEBUG){println "PUNCT found: $s ->"+g[1]+" + "+g[2]+" + "+g[3]}
            iterate(g[1]);
            writeWord(g[2], "pon");
            return g[3];
        } else {
            if(DEBUG){println "Other found: "+s}
            // a lone punctuation character is typed "pon", anything else "w"
            writeWord(s, s.matches(/\p{P}/) ? "pon" : "w");
            return "";
        }
    }

    /** Increments the word counter and writes one {@code <w>} element with the given form and type. */
    private void writeWord(String form, String type)
    {
        wordcount++;
        writer.writeStartElement("w");
        writeWordAttributes(); // id and n
        writer.writeAttribute("type", type);
        writer.writeCharacters(form);
        writer.writeEndElement();
        writer.writeCharacters("\n");
    }

    /**
     * Writes the id and n attributes of the word being written, both derived
     * from the current value of wordcount.
     */
    protected writeWordAttributes()
    {
        writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
        writer.writeAttribute("n",""+wordcount);
    }

    /**
     * Sets the regexp of the element from which tokenization starts.
     *
     * @param tag regexp matched against element local names; null to tokenize from the start
     */
    public void setStartTag(String tag)
    {
        this.startTag = tag;
    }

    /**
     * Tokenize.
     *
     * @param str the str
     * @return the pieces of str separated by whitespace
     */
    public List<String> tokenize(String str)
    {
        return str.tokenize()
    }

    /**
     * Developer entry point: tokenizes a hard-coded test file once with the
     * "fr" language and once without language, with DEBUG traces enabled.
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args)
    {
        File inputDir = new File("/home/mdecorde/xml/annotation/")
        File inputFile = new File(inputDir, "test.xml");
        for (String lang : ["fr", null]) {
            print "."
            File outputFile = new File(inputDir, "test-${lang}.xml")

            OldSimpleTokenizerXml tokenizer = new OldSimpleTokenizerXml(inputFile, outputFile, lang)
            tokenizer.DEBUG = true
            tokenizer.process();
        }

        println ""
        println "Done"
    }
}