Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / TagSentences / TagSentencesXml.groovy @ 1688

History | View | Annotate | Download (13.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.filters.TagSentences;
45

    
46
import javax.xml.parsers.DocumentBuilder;
47
import javax.xml.parsers.ParserConfigurationException;
48
import javax.xml.transform.OutputKeys;
49
import javax.xml.transform.Result;
50
import javax.xml.transform.Source;
51
import javax.xml.transform.Transformer;
52
import javax.xml.transform.TransformerFactory;
53
import javax.xml.transform.dom.DOMSource;
54
import javax.xml.transform.stream.StreamResult;
55

    
56
import org.txm.importer.ValidateXml;
57
import org.txm.tokenizer.TokenizerClasses;
58
import org.txm.utils.AsciiUtils;
59
import org.w3c.dom.Document;
60
import org.w3c.dom.Element;
61
import org.w3c.dom.NodeList;
62
import org.xml.sax.SAXException;
63

    
64
import java.io.File;
65
import java.io.FileInputStream;
66
import java.io.InputStreamReader;
67

    
68
import javax.xml.stream.*;
69
import java.net.URL;
70

    
71
// TODO: Auto-generated Javadoc
72
/**
73
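 * Adds sentence markup to an XML file read with StAX.
 * Events are copied to the output file; once the start tag ("text" by default)
 * has been met, "w" elements that are not already inside an "s" element are
 * wrapped in new "s" elements identified s_1, s_2, ...
 * A created sentence is closed after a word whose text matches the strong
 * punctuation pattern, or when a block-level tag is opened or closed.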
74
 */
75
class TagSentencesXml {
76

    
77
        /** URL of the XML file being read (built from the constructor's file). */
        private def url;

        /** Input stream opened on the source file; the parser reads from it. */
        private def inputData;

        /** StAX reader positioned on the source document. */
        private XMLStreamReader parser;

        /** Factory used to create the StAX writer. */
        XMLOutputFactory factory = XMLOutputFactory.newInstance();

        /**
         * Byte stream of the result file.
         * Typed as OutputStream because createOutput() stores a BufferedOutputStream
         * here: with the narrower FileOutputStream type, Groovy's cast on assignment
         * throws a GroovyCastException and no output can ever be created.
         */
        OutputStream output;

        /** StAX writer that produces the result document. */
        XMLStreamWriter writer;

        /** True while the reader is inside a sentence (an existing one or one created by this class). */
        boolean IN = false;

        /** Not read or written by any method of this class. */
        boolean WORD = false;

        /** Set when the text of the current word matches the strong punctuation pattern; the sentence is closed when the word ends. */
        boolean pending = false;

        /** Block-level tags: a created sentence is closed when one of them is opened or closed. */
        List<String> div_tags = ["front","body","div","head","trailer","p","ab","sp","speaker","list","item"];

        /** Inline tags: they may open a sentence and do not close the current one, unless their rend attribute contains rendMulti. */
        List<String> flow_tags = ["seg","foreign","hi","title","name","supplied","subst","add","damage", "unclear", "corr", "sic"];

        /** Local name of the element after which sentence tagging starts. */
        String startTag = "text";

        /** Value looked up in the rend attribute of a flow tag to have it handled like a block-level tag. */
        String rendMulti = "MULTI_S";

        /** Set to true when a flow tag carrying rendMulti has been met; never reset by this class. */
        boolean multi = false;

        /** True once the start tag has been read; before that, events are copied unchanged. */
        boolean startProcess = false;

        /** Regular expression that the text of a word is matched against to detect the end of a sentence. */
        String strongPunct = TokenizerClasses.punct_strong;

        /** Local name of the element most recently inspected or written. */
        String localname;

        /** When true, state markers are written as XML comments in the output. */
        boolean DEBUG = false;
117

    
118
        /**
         * Opens the given XML file and creates the StAX reader used by process().
         *
         * @param discoursxml the XML file to read
         */
        public TagSentencesXml(File discoursxml)
        {
                this.url = discoursxml.toURI().toURL();

                inputData = url.openStream();
                try {
                        parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
                } catch (Exception e) {
                        // the reader could not be created: release the file handle before failing
                        try {
                                inputData.close();
                        } catch (IOException ignored) {
                                // the original failure is the one worth reporting
                        }
                        throw e;
                }
        }
132

    
133
        /**
         * Opens the result file, creates the UTF-8 StAX writer on it and writes the
         * XML declaration followed by a line break.
         * NOTE(review): the 'output' field must be able to hold a BufferedOutputStream,
         * otherwise the assignment below fails and this method returns false.
         *
         * @param outfile the file to create
         * @return true if the writer is ready, false if any step failed (the message
         *         of the failure is printed on standard output)
         */
        private boolean createOutput(File outfile){
                FileOutputStream fileStream = null;
                try {
                        fileStream = new FileOutputStream(outfile);
                        output = new BufferedOutputStream(fileStream)
                        writer = factory.createXMLStreamWriter(output, "UTF-8");//create a new file
                        writer.writeStartDocument("UTF-8", "1.0");
                        writer.writeCharacters("\n");
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        // do not leave the file handle open when the writer could not be set up
                        if (fileStream != null) {
                                try {
                                        fileStream.close();
                                } catch (IOException ignored) {
                                        // nothing more can be done, the failure is already reported
                                }
                        }
                        return false;
                }
        }
151

    
152
        /** Number used for the id ("s_" + number) of the next created sentence. */
        int s_id = 1;

        /**
         * Reads the whole source document and writes it, with the added sentences,
         * to the given file. Events met before the start tag are copied unchanged.
         * The reader, the writer and their streams are closed before returning,
         * whether the processing succeeded or not.
         *
         * @param outfile the result file
         * @return true if the document was written and every resource was closed
         *         without error, false otherwise
         */
        public boolean process(File outfile)
        {
                s_id = 1;
                if (!createOutput(outfile))
                        return false;

                boolean success = false;
                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (!startProcess) {
                                        writeEvent(event);
                                        // detect the start tag: the following events are sentence-tagged
                                        if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == startTag)
                                                startProcess = true;
                                } else if (IN) {
                                        processIn(event);
                                } else {
                                        processOut(event);
                                }
                        }

                        // closes any element still open (e.g. a created sentence never ended)
                        writer.writeEndDocument();
                        success = true;
                } catch (Exception ex) {
                        System.err.println(parser.getLocation().toString());
                        ex.printStackTrace();
                } finally {
                        // every resource is closed on its own: a failing close() must not leak the others.
                        // The writer goes first so that its content reaches the stream before it is closed.
                        def closers = [
                                { writer?.close() },
                                { output?.close() },
                                { parser?.close() },
                                { inputData?.close() }
                        ];
                        for (def closer : closers) {
                                try {
                                        closer();
                                } catch (Exception closeError) {
                                        closeError.printStackTrace();
                                        success = false;
                                }
                        }
                }

                return success;
        }
204

    
205
        /**
         * Copies the content of a sentence that already exists in the source, up to
         * and including its closing tag, without creating any sentence. The start
         * tag of the sentence must have been written before calling this method.
         * The sentence may contain other elements, including nested sentences.
         * Sets IN to false when the sentence is closed.
         */
        public void processContentExistingSentence() // we're IN
        {
                // number of elements opened inside the sentence and not closed yet;
                // a counter is enough and does not depend on List.pop(), which removes
                // the last element before Groovy 2.5 and the first one since
                int depth = 0;
                if(DEBUG) writer.writeComment("SENT+");
                assert(IN == true);
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName();
                                        depth++;
                                        writeEvent(event);
                                        break;

                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName();
                                        writeEvent(event);

                                        if (localname == "s" && depth == 0) // it's the end of the existing sentence
                                        {
                                                IN = false; // we're now OUT
                                                if(DEBUG) writer.writeComment("+SENT");
                                                return;
                                        }
                                        depth--;
                                        break;

                                case XMLStreamConstants.CHARACTERS:
                                        writeEvent(event);
                                        break;
                                case XMLStreamConstants.COMMENT:
                                        writer.writeComment(parser.getText());
                                        break;
                        }
                }
        }
247

    
248
        /**
         * Copies the content of a word up to and including its closing tag; the
         * start tag of the word must have been written before calling this method.
         * When a text node of the word matches the strong punctuation pattern, the
         * current sentence is closed right after the word and IN is set to false.
         */
        public void processWordContent() // we're IN
        {
                if(DEBUG) writer.writeComment("WORD");
                assert(IN == true);
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.START_ELEMENT) {
                                localname = parser.getLocalName();
                                if (localname == "w")
                                        println "Error: found a word in a word: "+parser.getLocation();
                                writeEvent(event);
                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                localname = parser.getLocalName();
                                boolean endOfWord = (localname == "w");
                                writeEvent(event);
                                if (endOfWord) {
                                        if (pending) { // a strong punctuation was found in the word
                                                IN = false;
                                                pending = false; // ready for the next sentence
                                                writer.writeEndElement(); // close the sentence
                                        }
                                        return; // the word is completely written
                                }
                        } else if (event == XMLStreamConstants.CHARACTERS) {
                                // a strong punctuation announces the end of the sentence
                                if (parser.getText().matches(strongPunct))
                                        pending = true;
                                writeEvent(event);
                        } else if (event == XMLStreamConstants.COMMENT) {
                                writer.writeComment(parser.getText());
                        }
                        event = parser.next();
                }
        }
297

    
298
        /**
         * Handles an event met outside of any sentence.
         * An existing "s" element is copied as is; a "w" element, or a flow tag
         * whose rend attribute does not contain rendMulti, opens a new sentence;
         * every other event is copied unchanged.
         *
         * @param event the current StAX event
         */
        public void processOut(int event)
        {
                if (DEBUG) writer.writeComment("OUT");

                if (event == XMLStreamConstants.START_ELEMENT) {
                        localname = parser.getLocalName();

                        if (localname == "s") { // sentence already present in the source
                                IN = true;
                                writeEvent(event);
                                processContentExistingSentence();
                                return;
                        }

                        if (localname == "w") { // a word outside of a sentence: open one
                                IN = true;
                                writer.writeStartElement("s");
                                writer.writeAttribute("id", "s_"+s_id++)
                                writeEvent(event);
                                processWordContent();
                                return;
                        }

                        if (flow_tags.contains(localname)) {
                                if (getRendAttribute().contains(rendMulti)) {
                                        multi = true; // handled like a block-level tag: no sentence opened
                                } else {
                                        IN = true;
                                        writer.writeStartElement("s");
                                        writer.writeAttribute("id", "s_"+s_id++)
                                }
                        }
                        // block-level tags and unknown tags are simply copied
                        writeEvent(event);
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                        localname = parser.getLocalName();
                        if (localname == "s")
                                System.out.println("Error: found a closing s but we are OUT");
                        writeEvent(event);
                } else if (event == XMLStreamConstants.CHARACTERS) {
                        writeEvent(event);
                } else if (event == XMLStreamConstants.COMMENT) {
                        writer.writeComment(parser.getText());
                }
        }
353

    
354
        /**
         * Looks up the attribute whose local name is "rend" on the current element.
         *
         * @return the value of the first such attribute, or the empty string when
         *         the element has none
         */
        public String getRendAttribute()
        {
                // compare with null explicitly: index 0 is a valid position but is false for Groovy
                def position = (0..<parser.getAttributeCount()).find { parser.getAttributeLocalName(it) == "rend" };
                if (position == null)
                        return "";
                return parser.getAttributeValue(position);
        }
361

    
362
        /**
         * Handles an event met inside a sentence created by this class.
         * The sentence is closed when an existing "s" starts, when a block-level tag
         * is opened or closed, or when a flow tag whose rend attribute contains
         * rendMulti is opened; words are delegated to processWordContent().
         *
         * @param event the current StAX event
         */
        public void processIn(int event)
        {
                if(DEBUG) writer.writeComment("IN")

                if (event == XMLStreamConstants.START_ELEMENT) {
                        localname = parser.getLocalName();

                        if (localname == "s") { // an existing sentence starts: ours ends here
                                writer.writeEndElement(); // close our sentence
                                writeEvent(event); // write the start element
                                processContentExistingSentence(); // copy its content
                        } else if (localname == "w") { // the sentence can only be closed at the end of the word
                                writeEvent(event);
                                processWordContent();
                        } else {
                                boolean flowTag = flow_tags.contains(localname);
                                // a flow tag flagged with rendMulti is handled like a block-level tag
                                boolean multiFlowTag = flowTag && getRendAttribute().contains(rendMulti);
                                boolean blockTag = !flowTag && div_tags.contains(localname);

                                if (multiFlowTag || blockTag) {
                                        writer.writeEndElement(); // close s
                                        IN = false; // get out of the sentence
                                }
                                if (multiFlowTag)
                                        multi = true;
                                writeEvent(event);
                        }
                } else if (event == XMLStreamConstants.END_ELEMENT) {
                        localname = parser.getLocalName();

                        if (localname == "s") { // the sentence seems to exist already: just get OUT
                                IN = false;
                        } else if (flow_tags.contains(localname)) { // closing flow tag: the sentence goes on
                                if (multi)
                                        println "end of multi?"
                        } else if (div_tags.contains(localname)) { // closing block-level tag: the sentence ends
                                writer.writeEndElement(); // close s
                                IN = false;
                        }
                        writeEvent(event);
                } else if (event == XMLStreamConstants.CHARACTERS) {
                        writeEvent(event);
                } else if (event == XMLStreamConstants.COMMENT) {
                        writer.writeComment(parser.getText());
                }
        }
432

    
433
        /** Prefix of the element last written by writeEvent(), null when the last event was not an element. */
        String prefix;

        /**
         * Writes the current event of the reader to the output.
         * Start elements are written with their prefix, their namespace
         * declarations and their attributes; end elements, text and comments are
         * copied. Other kinds of events are ignored.
         *
         * @param event the StAX event the reader is positioned on
         */
        private void writeEvent(int event)
        {
                // the prefix is only asked for elements: StAX implementations are free to
                // reject getPrefix() on other events
                boolean elementEvent = (event == XMLStreamConstants.START_ELEMENT || event == XMLStreamConstants.END_ELEMENT);
                prefix = elementEvent ? parser.getPrefix() : null;

                if (event == XMLStreamConstants.START_ELEMENT)
                {
                        localname = parser.getLocalName();
                        if (prefix != null && prefix.length() > 0)
                                writer.writeStartElement(prefix+":"+localname);
                        else
                                writer.writeStartElement(localname);

                        // a namespace-aware reader does not report xmlns declarations as
                        // attributes: copy them explicitly or the prefixes written above
                        // are left unbound in the result
                        for (int i = 0 ; i < parser.getNamespaceCount() ; i++)
                        {
                                String nsPrefix = parser.getNamespacePrefix(i);
                                String nsURI = parser.getNamespaceURI(i) ?: "";
                                if (nsPrefix != null && nsPrefix.length() > 0)
                                        writer.writeNamespace(nsPrefix, nsURI)
                                else
                                        writer.writeDefaultNamespace(nsURI)
                        }

                        String attrprefix;
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++)
                        {
                                attrprefix = parser.getAttributePrefix(i);
                                if (attrprefix != null && attrprefix.length() > 0)
                                        writer.writeAttribute(attrprefix+":"+parser.getAttributeLocalName(i), parser.getAttributeValue(i))
                                else
                                        writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
                        }
                } else if (event == XMLStreamConstants.END_ELEMENT)
                {
                        writer.writeEndElement();
                } else if (event == XMLStreamConstants.CHARACTERS)
                {
                        writer.writeCharacters(parser.getText());
                } else if (event == XMLStreamConstants.COMMENT)
                {
                        writer.writeComment(parser.getText());
                }
        }
473

    
474
        /**
         * Tags the sentences of every file of a directory and validates each result.
         * Stops at the first file that cannot be processed or validated.
         *
         * @param args optional: args[0] is the directory of the files to process,
         *             args[1] the directory receiving the results (default: the "s"
         *             sub-directory of the input directory). Without arguments the
         *             historical hard-coded directories are used.
         */
        public static void main(String[] args)
        {
                boolean hasInputDir = args != null && args.length > 0;
                boolean hasOutputDir = args != null && args.length > 1;

                File dir = new File (hasInputDir ? args[0] : "/home/mdecorde/TXM/corpora/alceste/tokenized")
                File outdir = hasOutputDir ? new File (args[1]) : new File (dir, "s")

                File[] content = dir.listFiles();//scan directory split
                if (content == null) { // listFiles() returns null when dir is missing or is not a directory
                        println "cannot list directory: "+dir
                        return;
                }
                // sub-directories (the output directory is one of them by default) are not XML files
                def files = content.findAll { it.isFile() };

                if (!outdir.isDirectory() && !outdir.mkdirs()) {
                        println "cannot create directory: "+outdir
                        return;
                }

                println("Sentencing "+files.size()+" files")
                for (File infile : files) {
                        print ".";
                        File result = new File(outdir, infile.getName())
                        TagSentencesXml sentencer = new TagSentencesXml(infile);
                        if (!sentencer.process(result)) {
                                println "error with: "+infile
                                return;
                        }

                        //validation
                        if (!ValidateXml.test(result)) {
                                println "validation failed with: "+infile
                                return;
                        }
                }
                println "done"

        }
505
}