Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / TagSentences / TagSentencesXml.groovy @ 1000

History | View | Annotate | Download (13.8 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
22 321 mdecorde
//
23 321 mdecorde
// This file is part of the TXM platform.
24 321 mdecorde
//
25 321 mdecorde
// The TXM platform is free software: you can redistribute it and/or modify
26 321 mdecorde
// it under the terms of the GNU General Public License as published by
27 321 mdecorde
// the Free Software Foundation, either version 3 of the License, or
28 321 mdecorde
// (at your option) any later version.
29 321 mdecorde
//
30 321 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
31 321 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32 321 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 321 mdecorde
// GNU General Public License for more details.
34 321 mdecorde
//
35 321 mdecorde
// You should have received a copy of the GNU General Public License
36 321 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37 321 mdecorde
//
38 321 mdecorde
//
39 321 mdecorde
//
40 321 mdecorde
// $LastChangedDate:$
41 321 mdecorde
// $LastChangedRevision:$
42 321 mdecorde
// $LastChangedBy:$
43 321 mdecorde
//
44 1000 mdecorde
package org.txm.scripts.filters.TagSentences;
45 321 mdecorde
46 321 mdecorde
import javax.xml.parsers.DocumentBuilder;
47 321 mdecorde
import javax.xml.parsers.ParserConfigurationException;
48 321 mdecorde
import javax.xml.transform.OutputKeys;
49 321 mdecorde
import javax.xml.transform.Result;
50 321 mdecorde
import javax.xml.transform.Source;
51 321 mdecorde
import javax.xml.transform.Transformer;
52 321 mdecorde
import javax.xml.transform.TransformerFactory;
53 321 mdecorde
import javax.xml.transform.dom.DOMSource;
54 321 mdecorde
import javax.xml.transform.stream.StreamResult;
55 321 mdecorde
56 1000 mdecorde
import org.txm.importer.ValidateXml;
57 321 mdecorde
import org.txm.tokenizer.TokenizerClasses;
58 321 mdecorde
import org.txm.utils.AsciiUtils;
59 321 mdecorde
import org.w3c.dom.Document;
60 321 mdecorde
import org.w3c.dom.Element;
61 321 mdecorde
import org.w3c.dom.NodeList;
62 321 mdecorde
import org.xml.sax.SAXException;
63 321 mdecorde
64 321 mdecorde
import java.io.File;
65 321 mdecorde
import java.io.FileInputStream;
66 321 mdecorde
import java.io.InputStreamReader;
67 321 mdecorde
68 321 mdecorde
import javax.xml.stream.*;
69 321 mdecorde
import java.net.URL;
70 321 mdecorde
71 321 mdecorde
// TODO: Auto-generated Javadoc
72 321 mdecorde
/**
 * Copies a tokenized XML file to a new file and wraps runs of words in
 * sentence elements ({@code <s id="s_N">}).
 *
 * The input is read with a StAX reader and written back event by event.
 * Events are copied unchanged until the {@link #startTag} element has been
 * seen. After that a two-state machine (flag {@link #IN}) decides where
 * generated sentences open and close:
 * <ul>
 * <li>outside a sentence, a {@code <w>} or a flow tag whose {@code rend}
 * attribute does not contain {@link #rendMulti} opens a new sentence;</li>
 * <li>inside a sentence, a div tag, an already existing {@code <s>}, a flow
 * tag marked with {@link #rendMulti}, or the end of a word containing strong
 * punctuation closes the current sentence.</li>
 * </ul>
 *
 * An instance is single use: the reader is created by the constructor and
 * consumed by {@link #process(File)}.
 */
class TagSentencesXml {

        /** URL of the input file. */
        private def url;

        /** Stream opened on {@link #url}; closed when {@link #process(File)} ends. */
        private def inputData;

        /** StAX reader over {@link #inputData}. */
        private XMLStreamReader parser;

        /** Factory used to create {@link #writer}. */
        XMLOutputFactory factory = XMLOutputFactory.newInstance();

        /**
         * Stream the result is written to. Typed as OutputStream because
         * createOutput() stores a BufferedOutputStream here; a FileOutputStream
         * typed field would make that assignment fail with a cast error.
         */
        OutputStream output;

        /** StAX writer over {@link #output}. */
        XMLStreamWriter writer;

        /** True while the current position is inside a sentence. */
        boolean IN = false;

        /** Not used by this class; kept because it is part of the public properties. */
        boolean WORD = false;

        /** Set when strong punctuation was read inside the current word. */
        boolean pending = false;

        /** Tags that cannot be contained in a sentence: they close the current one. */
        List<String> div_tags = ["front","body","div","head","trailer","p","ab","sp","speaker","list","item"];

        /** Tags that may appear inside a sentence. */
        List<String> flow_tags = ["seg","foreign","hi","title","name","supplied","subst","add","damage", "unclear", "corr", "sic"];

        /** Local name of the element after which sentence tagging starts. */
        String startTag = "text";

        /** Marker looked up in the "rend" attribute of flow tags. */
        String rendMulti = "MULTI_S";

        /** Set once a flow tag marked with {@link #rendMulti} has been met; never reset. */
        boolean multi = false;

        /** True once the {@link #startTag} element has been opened. */
        boolean startProcess = false;

        /** Regular expression a word text must match to end a sentence. */
        String strongPunct = TokenizerClasses.punct_strong;

        /** Local name of the last element examined. */
        String localname;

        /** When true, state markers are written to the output as XML comments. */
        boolean DEBUG = false;

        /** Number used for the id of the next generated sentence. */
        int s_id = 1;

        /** Prefix of the last start or end element written, null for other events. */
        String prefix;

        /**
         * Opens the input file and creates the StAX reader.
         *
         * @param discoursxml the tokenized XML file to read
         */
        public TagSentencesXml(File discoursxml)
        {
                this.url = discoursxml.toURI().toURL();
                //corr_tags.addAll(corr_tags_no_seg);

                inputData = url.openStream();
                try {
                        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
                        parser = inputFactory.createXMLStreamReader(inputData);
                } catch (Exception e) {
                        // do not leave the file open when the reader cannot be created
                        closeQuietly(inputData);
                        throw e;
                }
        }

        /**
         * Closes a resource and reports, instead of propagating, any failure.
         *
         * @param resource any object with a close() method, may be null
         */
        private static void closeQuietly(def resource)
        {
                if (resource == null)
                        return;
                try {
                        resource.close();
                } catch (Exception e) {
                        System.err.println("Failed to close "+resource.getClass().getName()+": "+e.getLocalizedMessage());
                }
        }

        /**
         * Opens the output file and writes the XML declaration.
         *
         * @param outfile the file to create
         * @return true, if successful
         */
        private boolean createOutput(File outfile){
                try {
                        output = new BufferedOutputStream(new FileOutputStream(outfile))
                        writer = factory.createXMLStreamWriter(output, "UTF-8");//create a new file
                        writer.writeStartDocument("UTF-8", "1.0");
                        writer.writeCharacters("\n");
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        // release the file if it was opened before the failure
                        closeQuietly(output);
                        output = null;
                        writer = null;
                        return false;
                }
        }

        /**
         * Reads the whole input and writes the sentence-tagged result.
         *
         * @param outfile the file to write
         * @return true, if successful; false if the output could not be created
         * or if an error occurred while reading or writing
         */
        public boolean process(File outfile)
        {
                s_id = 1;
                if (!createOutput(outfile))
                        return false;

                boolean success = false;
                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (!startProcess) {
                                        writeEvent(event);
                                        // detect start tag: tagging begins with the next event
                                        if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == startTag)
                                                startProcess = true;
                                } else if (IN) {
                                        processIn(event);
                                } else {
                                        processOut(event);
                                }
                        }

                        // writes the end tag of any element still open, so the result
                        // stays well-formed even if a generated sentence shifted the closings
                        writer.writeEndDocument();
                        // flush inside the try so that a write failure makes process() return false
                        writer.flush();
                        output.flush();
                        success = true;
                } catch (Exception ex) {
                        System.err.println(parser.getLocation().toString());
                        ex.printStackTrace();
                } finally {
                        // each resource is closed on its own: one failure must not skip the others.
                        // Closing the StAX reader and writer does not close the underlying streams.
                        closeQuietly(writer);
                        closeQuietly(output);
                        closeQuietly(parser);
                        closeQuietly(inputData);
                }

                return success;
        }

        /**
         * Copies the content of an already existing sentence until its end tag.
         * The start tag must have been written before calling this method.
         * The sentence may contain other sentences. Only element, text and
         * comment events are copied.
         */
        public void processContentExistingSentence() // we're IN
        {
                // number of elements opened since the start of the existing sentence
                int depth = 0;
                if(DEBUG) writer.writeComment("SENT+");
                assert(IN == true);
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName();
                                        depth++;
                                        writeEvent(event);
                                        break;

                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName();
                                        writeEvent(event);

                                        if (localname == "s" && depth == 0) {
                                                // it's the end of the existing sentence
                                                IN = false; // we're now OUT
                                                return;
                                        }
                                        depth--;
                                        break;

                                case XMLStreamConstants.CHARACTERS:
                                        writeEvent(event);
                                        break;
                                case XMLStreamConstants.COMMENT:
                                        writer.writeComment(parser.getText());
                                        break;
                        }
                }
                if(DEBUG) writer.writeComment("+SENT");
        }

        /**
         * Copies the content of a word until its end tag. The start tag must
         * have been written before calling this method. If a text of the word
         * matches {@link #strongPunct}, the current sentence is closed right
         * after the word and the state becomes OUT.
         */
        public void processWordContent() // we're IN
        {
                if(DEBUG) writer.writeComment("WORD");
                assert(IN == true);
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName();
                                        if(localname == "w")
                                                println "Error: found a word in a word: "+parser.getLocation();
                                        writeEvent(event);
                                        break;

                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName();
                                        boolean endOfWord = (localname == "w");
                                        boolean endOfSentence = endOfWord && pending;

                                        if (endOfSentence) {
                                                // we've found a strong punct in the word
                                                IN = false;
                                                pending = false; // reset pending
                                        }
                                        writeEvent(event);
                                        if (endOfSentence)
                                                writer.writeEndElement(); // close the sentence after the word

                                        if (endOfWord)
                                                return; // we've finished writing the word
                                        break;

                                case XMLStreamConstants.CHARACTERS: // detect end of sentence
                                        String txt = parser.getText();
                                        if (txt.matches(strongPunct)) {
                                                pending = true;
                                        }
                                        writeEvent(event);
                                        break;
                                case XMLStreamConstants.COMMENT:
                                        writer.writeComment(parser.getText());
                                        break;
                        }
                }
        }

        /**
         * Writes the start tag of a generated sentence and numbers it.
         */
        private void openSentence()
        {
                writer.writeStartElement("s");
                writer.writeAttribute("id", "s_"+s_id++)
        }

        /**
         * Processes an event read outside of any sentence.
         *
         * @param event the StAX event type
         */
        public void processOut(int event)
        {
                if (DEBUG) writer.writeComment("OUT");
                switch (event) {
                        case XMLStreamConstants.START_ELEMENT:
                                localname = parser.getLocalName();
                                if (localname == "s") {
                                        // existing sentence: copy it as is
                                        IN = true;
                                        writeEvent(event);
                                        processContentExistingSentence();
                                } else if (localname == "w") {
                                        // a word outside a sentence starts a new sentence
                                        IN = true;
                                        openSentence();
                                        writeEvent(event);
                                        processWordContent();
                                } else if (flow_tags.contains(localname)) {
                                        if (getRendAttribute().contains(rendMulti)) {
                                                // marked flow tag: no sentence is opened around it
                                                multi = true;
                                        } else {
                                                IN = true;
                                                openSentence();
                                        }
                                        writeEvent(event);
                                } else {
                                        // div tags and unknown tags are copied, nothing else to do
                                        writeEvent(event);
                                }
                                break;

                        case XMLStreamConstants.END_ELEMENT:
                                localname = parser.getLocalName();
                                if (localname == "s") {
                                        System.out.println("Error: found a closing s but we are OUT");
                                }
                                writeEvent(event);
                                break;

                        case XMLStreamConstants.CHARACTERS:
                                writeEvent(event);
                                break;
                        case XMLStreamConstants.COMMENT:
                                writer.writeComment(parser.getText());
                                break
                }
        }

        /**
         * Reads the "rend" attribute of the current start element.
         *
         * @return the value of the first attribute whose local name is "rend",
         * or an empty string if there is none
         */
        public String getRendAttribute()
        {
                for (int i = 0 ; i < parser.getAttributeCount() ; i++)
                        if (parser.getAttributeLocalName(i) == "rend")
                                return parser.getAttributeValue(i);
                return "";
        }

        /**
         * Processes an event read inside a generated sentence.
         *
         * @param event the StAX event type
         */
        public void processIn(int event)
        {
                if(DEBUG) writer.writeComment("IN")
                switch (event) {
                        case XMLStreamConstants.START_ELEMENT:
                                localname = parser.getLocalName();
                                if (localname == "s") // found opening s, we close ours and stay IN
                                {
                                        writer.writeEndElement(); // close our sentence
                                        writeEvent(event); // write the start element
                                        processContentExistingSentence(); // process its content
                                } else if (localname == "w") // can't close s from here, see processWordContent
                                {
                                        writeEvent(event);
                                        processWordContent();
                                } else if (flow_tags.contains(localname)) // flow tag = we stay in the sentence
                                {
                                        if (getRendAttribute().contains(rendMulti)) // but this one must be seen as a div tag
                                        {
                                                writer.writeEndElement(); // close s
                                                IN = false; // get out of the sentence
                                                multi = true;
                                        }
                                        writeEvent(event);
                                } else if (div_tags.contains(localname)) // div tag = we must get out of the sentence
                                {
                                        writer.writeEndElement(); // close s
                                        IN = false; // get out of the sentence
                                        writeEvent(event);
                                } else {
                                        writeEvent(event);
                                }
                                break;
                        case XMLStreamConstants.END_ELEMENT:
                                localname = parser.getLocalName();
                                if (localname == "s") // the sentence seems to already exist, we write the event and get OUT
                                {
                                        IN = false;
                                        writeEvent(event);
                                } else if (flow_tags.contains(localname)) // closing flow tag = we stay in the sentence
                                {
                                        if (multi)
                                                println "end of multi?"
                                        writeEvent(event);
                                } else if (div_tags.contains(localname)) // closing div tag = we close the current sentence
                                {
                                        writer.writeEndElement(); // close s
                                        IN = false;
                                        writeEvent(event);
                                } else {
                                        writeEvent(event);
                                }
                                break;

                        case XMLStreamConstants.CHARACTERS:
                                writeEvent(event);
                                break;
                        case XMLStreamConstants.COMMENT:
                                writer.writeComment(parser.getText());
                                break
                }
        }

        /**
         * Builds the name to write for an element or attribute.
         *
         * @param namePrefix the prefix, may be null or empty
         * @param name the local name
         * @return "prefix:name" when there is a prefix, the local name otherwise
         */
        private static String qualifiedName(String namePrefix, String name)
        {
                if (namePrefix != null && namePrefix.length() > 0)
                        return namePrefix+":"+name;
                return name;
        }

        /**
         * Writes the current event of the reader to the output. Start elements
         * (with their namespace declarations and attributes), end elements,
         * texts and comments are written; other events are ignored.
         *
         * @param event the StAX event type
         */
        private void writeEvent(int event)
        {
                // a prefix only exists on element events; it is not requested for the
                // other events, where some StAX implementations may refuse the call
                boolean elementEvent = (event == XMLStreamConstants.START_ELEMENT || event == XMLStreamConstants.END_ELEMENT);
                prefix = elementEvent ? parser.getPrefix() : null;

                switch (event) {
                        case XMLStreamConstants.START_ELEMENT:
                                localname = parser.getLocalName();
                                writer.writeStartElement(qualifiedName(prefix, localname));

                                // namespace declarations are not part of the attributes reported by
                                // the reader: they must be copied explicitly or they are lost
                                for (int i = 0 ; i < parser.getNamespaceCount() ; i++)
                                {
                                        String nsPrefix = parser.getNamespacePrefix(i);
                                        String nsUri = parser.getNamespaceURI(i);
                                        if (nsUri == null)
                                                nsUri = "";
                                        if (nsPrefix == null || nsPrefix.length() == 0)
                                                writer.writeDefaultNamespace(nsUri);
                                        else
                                                writer.writeNamespace(nsPrefix, nsUri);
                                }

                                for (int i = 0 ; i < parser.getAttributeCount() ; i++)
                                {
                                        String attrname = qualifiedName(parser.getAttributePrefix(i), parser.getAttributeLocalName(i));
                                        writer.writeAttribute(attrname, parser.getAttributeValue(i))
                                }
                                break;

                        case XMLStreamConstants.END_ELEMENT:
                                writer.writeEndElement();
                                break;

                        case XMLStreamConstants.CHARACTERS:
                                writer.writeCharacters(parser.getText());
                                break;

                        case XMLStreamConstants.COMMENT:
                                writer.writeComment(parser.getText());
                                break;
                }
        }

        /**
         * Tags every file of a directory and validates each result. Stops at
         * the first file that cannot be processed or validated.
         *
         * @param args optional: args[0] is the input directory and args[1] the
         * output directory. Without arguments the original development paths
         * are used; with only args[0] the output goes to its "s" sub-directory.
         */
        public static void main(String[] args)
        {
                File dir = new File (args.length > 0 ? args[0] : "/home/mdecorde/TXM/corpora/alceste/tokenized")
                File outdir = args.length > 1 ? new File (args[1]) : new File (dir, "s")

                File[] listed = dir.listFiles();//scan directory split
                if (listed == null) {
                        // listFiles() returns null when dir is not a readable directory
                        println "cannot list directory: "+dir
                        return;
                }
                if (!outdir.isDirectory() && !outdir.mkdirs()) {
                        println "cannot create output directory: "+outdir
                        return;
                }

                // sub-directories (the default output directory is one of them) are not input files
                def files = listed.findAll { it.isFile() }

                println("Sentencing "+files.size()+" files")
                for (File infile : files) {
                        print ".";
                        File result = new File(outdir, infile.getName())
                        TagSentencesXml sentencer = new TagSentencesXml(infile);
                        if (!sentencer.process(result)) {
                                println "error with: "+infile
                                return;
                        }

                        //validation
                        if (!ValidateXml.test(result)) {
                                println "validation failed with: "+infile
                                return;
                        }
                }
                println "done"

        }
}