Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / TagSentences / TagSentencesXml.groovy @ 1000

History | View | Annotate | Download (13.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.filters.TagSentences;
45

    
46
import javax.xml.parsers.DocumentBuilder;
47
import javax.xml.parsers.ParserConfigurationException;
48
import javax.xml.transform.OutputKeys;
49
import javax.xml.transform.Result;
50
import javax.xml.transform.Source;
51
import javax.xml.transform.Transformer;
52
import javax.xml.transform.TransformerFactory;
53
import javax.xml.transform.dom.DOMSource;
54
import javax.xml.transform.stream.StreamResult;
55

    
56
import org.txm.importer.ValidateXml;
57
import org.txm.tokenizer.TokenizerClasses;
58
import org.txm.utils.AsciiUtils;
59
import org.w3c.dom.Document;
60
import org.w3c.dom.Element;
61
import org.w3c.dom.NodeList;
62
import org.xml.sax.SAXException;
63

    
64
import java.io.File;
65
import java.io.FileInputStream;
66
import java.io.InputStreamReader;
67

    
68
import javax.xml.stream.*;
69
import java.net.URL;
70

    
71
// TODO: Auto-generated Javadoc
72
/**
 * StAX filter that copies an XML file while wrapping runs of {@code <w>}
 * elements into {@code <s id="s_N">} sentence elements.
 *
 * The input is copied verbatim (see {@link #writeEvent(int)}) until the start
 * tag named by {@link #startTag} has been seen. From there the filter is a two
 * state machine driven by the {@link #IN} flag:
 * <ul>
 * <li>OUT ({@link #processOut(int)}): a {@code <w>} or a flow tag opens a new
 * sentence; an existing {@code <s>} is copied as is;</li>
 * <li>IN ({@link #processIn(int)}): a div tag (opening or closing), an existing
 * {@code <s>}, or a word whose text matches {@link #strongPunct} closes the
 * sentence.</li>
 * </ul>
 *
 * An instance reads its input once: the parser is created in the constructor
 * and consumed by {@link #process(File)}.
 */
class TagSentencesXml {

    /** URL of the input file. */
    private def url;

    /** Input stream opened on {@link #url}; it feeds {@link #parser}. */
    private def inputData;

    /** StAX reader over {@link #inputData}. */
    private XMLStreamReader parser;

    /** Factory used to create {@link #writer}. */
    XMLOutputFactory factory = XMLOutputFactory.newInstance();

    /**
     * Stream the result is written to. Declared as OutputStream because
     * createOutput() stores a BufferedOutputStream in it: with the former
     * FileOutputStream type Groovy refused the assignment with a
     * GroovyCastException, so no output could ever be created.
     */
    OutputStream output;

    /** StAX writer over {@link #output}. */
    XMLStreamWriter writer;

    /** True while a sentence element is open in the output. */
    boolean IN = false;

    /** Not read or written by this class. */
    boolean WORD = false;

    /** True when the word being copied contains a strong punctuation: the sentence is closed at the end of that word. */
    boolean pending = false;

    /** Names of the elements that cannot be inside a sentence: they close the current one. */
    List<String> div_tags = ["front","body","div","head","trailer","p","ab","sp","speaker","list","item"];

    /** Names of the elements that may appear inside a sentence. */
    List<String> flow_tags = ["seg","foreign","hi","title","name","supplied","subst","add","damage", "unclear", "corr", "sic"];

    /** Local name of the element after which sentence tagging starts. */
    String startTag = "text";

    /** Marker looked up in the "rend" attribute of a flow tag: when present the flow tag is handled like a div tag. */
    String rendMulti = "MULTI_S";

    /** Set when a flow tag carrying {@link #rendMulti} has been met; never reset by this class. */
    boolean multi = false;

    /** True once the {@link #startTag} start element has been copied. */
    boolean startProcess = false;

    /** Regular expression a word text must match to end the sentence. */
    String strongPunct = TokenizerClasses.punct_strong;

    /** Local name of the last element inspected. */
    String localname;

    /** When true, XML comments tracing the state machine are written in the output. */
    boolean DEBUG = false;

    /** Number used for the id of the next created sentence. */
    int s_id = 1;

    /** Prefix of the last event given to {@link #writeEvent(int)}. */
    String prefix;

    /**
     * Instantiates a new tag sentences xml and opens the input file.
     *
     * @param discoursxml the XML file to read
     */
    public TagSentencesXml(File discoursxml)
    {
        this.url = discoursxml.toURI().toURL();
        //corr_tags.addAll(corr_tags_no_seg);

        inputData = url.openStream();
        try {
            XMLInputFactory inputFactory = XMLInputFactory.newInstance();
            parser = inputFactory.createXMLStreamReader(inputData);
        } catch (Exception e) {
            // do not leave the file opened if the reader can't be created
            closeQuietly(inputData);
            throw e;
        }
    }

    /**
     * Creates the output stream and writer, then writes the XML declaration.
     *
     * @param outfile the file to write
     * @return true, if successful
     */
    private boolean createOutput(File outfile){
        OutputStream stream = null;
        try {
            stream = new BufferedOutputStream(new FileOutputStream(outfile));
            XMLStreamWriter created = factory.createXMLStreamWriter(stream, "UTF-8");//create a new file
            created.writeStartDocument("UTF-8", "1.0");
            created.writeCharacters("\n");
            output = stream;
            writer = created;
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            closeQuietly(stream); // release the file if it has been opened
            return false;
        }
    }

    /**
     * Calls close() on the given resource, ignoring null and any failure.
     * Used on paths where a close failure must not hide the original error.
     *
     * @param resource an object with a close() method, or null
     */
    private void closeQuietly(def resource)
    {
        if (resource == null) return;
        try {
            resource.close();
        } catch (Exception ignored) {
            // best effort: nothing more can be done here
        }
    }

    /** Opens a new sentence element in the output and gives it the next id. */
    private void openSentence()
    {
        writer.writeStartElement("s");
        writer.writeAttribute("id", "s_"+s_id++);
    }

    /**
     * Reads the whole input and writes the sentence-tagged copy in outfile.
     *
     * @param outfile the file to write
     * @return true, if successful; false if the output can't be created or
     * if any exception occurs while reading or writing
     */
    public boolean process(File outfile)
    {
        s_id = 1;
        if (!createOutput(outfile))
            return false;

        try {
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                if (startProcess) {
                    if (IN) {
                        processIn(event);
                    } else {
                        processOut(event);
                    }
                } else {
                    writeEvent(event);
                    //detect start tag
                    if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == startTag)
                        startProcess = true;
                }
            }

            // a failure here means the result may be truncated: let it reach the catch block
            writer.close()
            output.close()
            parser.close()
            // closing the reader does not close the stream it reads
            closeQuietly(inputData);
        } catch (Exception ex) {
            System.err.println(parser.getLocation().toString());
            ex.printStackTrace();

            closeQuietly(writer);
            closeQuietly(output);
            closeQuietly(parser);
            closeQuietly(inputData);
            return false;
        }

        return true;
    }

    /**
     * Copies the content of an existing s element until it is closed.
     * This sentence may contain other sentences.
     * The s start element must have been written before calling this method.
     */
    public void processContentExistingSentence() // we're IN
    {
        int depth = 0; // number of elements opened since the existing sentence started
        if(DEBUG) writer.writeComment("SENT+");
        assert(IN == true);
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            switch (event) {
                case XMLStreamConstants.START_ELEMENT:
                    localname = parser.getLocalName();
                    depth++;
                    writeEvent(event);
                    break;

                case XMLStreamConstants.END_ELEMENT:
                    localname = parser.getLocalName();
                    writeEvent(event);

                    if (localname == "s" && depth == 0) // it's the end of the existing sentence
                    {
                        IN = false; // we're now OUT
                        return;
                    }
                    depth--;
                    break;

                case XMLStreamConstants.CHARACTERS:
                    writeEvent(event);
                    break;
                case XMLStreamConstants.COMMENT:
                    writer.writeComment(parser.getText());
                    break;
            }
        }
        // only reached if the document ends before the sentence is closed
        if(DEBUG) writer.writeComment("+SENT");
    }

    /**
     * Copies the content of a word up to and including its end tag.
     * The w start element must have been written before calling this method.
     * If one of the text nodes of the word matches {@link #strongPunct}, the
     * current sentence is closed right after the word.
     */
    public void processWordContent() // we're IN
    {
        if(DEBUG) writer.writeComment("WORD");
        assert(IN == true);
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            switch (event) {
                case XMLStreamConstants.START_ELEMENT:
                    localname = parser.getLocalName();
                    if(localname == "w")
                        println "Error: found a word in a word: "+parser.getLocation();
                    writeEvent(event);
                    break;

                case XMLStreamConstants.END_ELEMENT:
                    localname = parser.getLocalName();
                    boolean endOfWord = (localname == "w");

                    writeEvent(event);
                    if (endOfWord && pending) // we've found a strong punct in a word
                    {
                        IN = false;
                        pending = false; // reset pending
                        writer.writeEndElement(); // close the sentence after the word
                    }

                    if (endOfWord) {
                        //        println "end of word: "+parser.getLocation().getLineNumber();
                        return; // we've finish to write the word
                    }
                    break;

                case XMLStreamConstants.CHARACTERS: // detect end of sentence
                    String txt = parser.getText();
                    if (txt.matches(strongPunct)) {
                        pending = true;
                    }
                    writeEvent(event);
                    break;
                case XMLStreamConstants.COMMENT:
                    writer.writeComment(parser.getText());
                    break;
            }
        }
    }

    /**
     * Process an event read while no sentence is opened.
     *
     * @param event the event
     */
    public void processOut(int event)
    {
        if (DEBUG) writer.writeComment("OUT");
        switch (event) {
            case XMLStreamConstants.START_ELEMENT:
                localname = parser.getLocalName();
                if (localname == "s") { // existing sentence: copy it as is
                    IN = true;
                    writeEvent(event);
                    processContentExistingSentence();
                } else if (localname == "w") { // a word out of sentence: open one
                    IN = true;
                    openSentence();
                    writeEvent(event);
                    processWordContent();
                } else if (flow_tags.contains(localname)) {
                    String rend = getRendAttribute();
                    if (rend.contains(rendMulti)) {
                        multi = true;
                    } else { // the sentence starts before the flow tag
                        IN = true;
                        openSentence();
                    }
                    writeEvent(event);
                } else { // div tags and unknown tags are just copied
                    writeEvent(event);
                }
                break;

            case XMLStreamConstants.END_ELEMENT:
                localname = parser.getLocalName();
                if (localname == "s") {
                    System.out.println("Error: found a closing s but we are OUT");
                }
                writeEvent(event);
                break;

            case XMLStreamConstants.CHARACTERS:
                writeEvent(event);
                break;
            case XMLStreamConstants.COMMENT:
                writer.writeComment(parser.getText());
                break
        }
    }

    /**
     * Gets the value of the "rend" attribute of the current start element.
     *
     * @return the attribute value, or "" if the element has no "rend" attribute
     */
    public String getRendAttribute()
    {
        int count = parser.getAttributeCount();
        for (int i = 0 ; i < count ; i++) {
            if (parser.getAttributeLocalName(i) == "rend")
                return parser.getAttributeValue(i);
        }
        return "";
    }

    /**
     * Process an event read while a created sentence is opened.
     *
     * @param event the event
     */
    public void processIn(int event)
    {
        if(DEBUG) writer.writeComment("IN")
        switch (event) {
            case XMLStreamConstants.START_ELEMENT:
                localname = parser.getLocalName();
                if (localname == "s") // found opening s, we close ours and stay IN
                {
                    writer.writeEndElement(); // close our sentence
                    writeEvent(event); // write the start element
                    processContentExistingSentence(); // process its content
                    //println "ERROR "+parser.getLocation();
                } else if (localname == "w") // can't close s from here, see processWordContent
                {
                    writeEvent(event);
                    processWordContent();
                } else if (flow_tags.contains(localname)) // flow tag = we stay in the sentence
                {
                    String rend = getRendAttribute();
                    if (rend.contains(rendMulti)) // but this one must be seen as a div tag
                    {
                        writer.writeEndElement(); // close s
                        IN = false; // get out of the sentence
                        multi = true;
                    }
                    writeEvent(event);
                } else if (div_tags.contains(localname)) // div tag = we must get out the sentence
                {
                    writer.writeEndElement(); // close s
                    IN = false; // get out of the sentence
                    writeEvent(event);
                } else {
                    writeEvent(event);
                }
                break;

            case XMLStreamConstants.END_ELEMENT:
                localname = parser.getLocalName();
                if (localname == "s") // the sentence seems to already exists, we write the event and get OUT
                {
                    IN = false;
                    writeEvent(event);
                } else if (flow_tags.contains(localname)) // closing flow tag = we do nothing
                {
                    if (multi)
                        println "end of multi?"
                    writeEvent(event);
                } else if (div_tags.contains(localname)) // closing div tag = we close the current sentence
                {
                    //println "closing s because of "+localname+" "+parser.getLocation();
                    writer.writeEndElement(); // close s
                    IN = false;
                    writeEvent(event);
                } else {
                    writeEvent(event);
                }
                break;

            case XMLStreamConstants.CHARACTERS:
                writeEvent(event);
                break;
            case XMLStreamConstants.COMMENT:
                writer.writeComment(parser.getText());
                break
        }
    }

    /**
     * Writes the current event of the parser in the output.
     * Only start elements (with their attributes), end elements, characters
     * and comments are written; any other event is ignored.
     *
     * NOTE(review): namespace declarations of the start elements are not
     * copied (parser.getNamespaceCount() is never read) - confirm this is
     * intended for namespaced input.
     *
     * @param event the stax event
     */
    private void writeEvent(int event)
    {
        prefix = parser.getPrefix();
        switch (event) {
            case XMLStreamConstants.START_ELEMENT:
                localname = parser.getLocalName();
                boolean prefixed = (prefix != null && prefix.length() > 0);
                writer.writeStartElement(prefixed ? prefix+":"+localname : localname);

                int count = parser.getAttributeCount();
                for (int i = 0 ; i < count ; i++)
                {
                    String attrprefix = parser.getAttributePrefix(i);
                    String attrname = parser.getAttributeLocalName(i);
                    if (attrprefix != null && attrprefix.length() > 0)
                        attrname = attrprefix+":"+attrname;
                    writer.writeAttribute(attrname, parser.getAttributeValue(i));
                }
                break;

            case XMLStreamConstants.END_ELEMENT:
                writer.writeEndElement();
                break;

            case XMLStreamConstants.CHARACTERS:
                writer.writeCharacters(parser.getText());
                break;

            case XMLStreamConstants.COMMENT:
                writer.writeComment(parser.getText());
                break;
        }
    }

    /**
     * The main method: tags the sentences of every file of a hard-coded
     * directory and validates each result. Stops at the first failure.
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args)
    {
        File dir = new File ("/home/mdecorde/TXM/corpora/alceste/tokenized")
        File outdir = new File ("/home/mdecorde/TXM/corpora/alceste/tokenized/s")

        File[] listing = dir.listFiles();//scan directory split
        if (listing == null) { // dir does not exist or can't be read
            println "Error: cannot list directory: "+dir
            return;
        }
        // outdir is a child of dir: keep regular files only so it is not parsed as an input
        def files = listing.findAll { it.isFile() }
        outdir.mkdirs();

        println("Sentencing "+files.size()+" files")
        for (File infile : files) {
            print ".";
            File result = new File(outdir, infile.getName())
            TagSentencesXml sentencer = new TagSentencesXml(infile);
            if (!sentencer.process(result)) {
                println "error with: "+infile
                return;
            }

            //validation
            if (!ValidateXml.test(result)) {
                println "validation failed with: "+infile
                return;
            }
        }
        println "done"
    }
}