Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / TagSentences / TagSentences.groovy @ 1000

History | View | Annotate | Download (12.6 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 321 mdecorde
// $LastChangedDate:$
25 321 mdecorde
// $LastChangedRevision:$
26 321 mdecorde
// $LastChangedBy:$
27 321 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.filters.TagSentences;
29 321 mdecorde
30 321 mdecorde
import java.util.regex.*
31 321 mdecorde
32 1000 mdecorde
import org.txm.importer.scripts.filters.*
33 321 mdecorde
import org.txm.tokenizer.*
34 321 mdecorde
35 986 mdecorde
import org.txm.scripts.filters.CutHeader.*
36 986 mdecorde
import org.txm.scripts.filters.FusionHeader.*
37 321 mdecorde
38 321 mdecorde
// TODO: Auto-generated Javadoc
39 321 mdecorde
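/**
 * Line filter that wraps runs of {@code <w>} word lines in numbered
 * {@code <s n="...">} sentence elements. It is a two-state machine
 * ("out" of / "in" a sentence): a sentence is opened on the first word or
 * correction tag met outside a sentence and closed after a strong
 * punctuation mark, or when a division, quotation or correction tag forces
 * it. Existing {@code <s>} tags are renumbered.
 */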
42 321 mdecorde
class TagSentences extends Filter {
43 321 mdecorde
44 321 mdecorde
        /** Declared but not referenced by the code of this class. */
        int counter

        /** Matcher produced by the most recent pattern test in filter(). */
        def m

        /** Declared but not referenced by the code of this class. */
        def segment

        /** Current state: "out" (outside a sentence) or "in" (inside one). */
        def linetype = "out"

        /** Stack of the names of the correction tags currently open inside a sentence. */
        def open_corr_tags = new LinkedList<String>()

        /** Declared but never assigned after initialisation in this class. */
        def LASTopen_corr_tags = ""

        /** Stack of the names of the open correction tags flagged "multi_s", handled like division tags. */
        def open_div_tags = new LinkedList<String>()

        /** Name of the tag most recently pushed on open_div_tags. */
        def LASTopen_div_tags = ""

        /** Declared but not referenced by the code of this class. */
        def corr = "no"

        /** Declared but not referenced by the code of this class. */
        def corr_name

        /** Declared but not referenced by the code of this class. */
        def corr_tag

        /** "yes" once a strong punctuation has been seen and the closing </s> has not been written yet. */
        def pending = "no"

        /** Number of sentences opened so far; used as the value of the n attribute. */
        def scounter = 0

        /** Text captured before the tag by the last affect12/affect13 call. */
        def _before

        /** Text captured after the tag by the last affect12/affect13 call. */
        def _after

        /** Division tag names, interpolated as a regex alternative; defaults to TokenizerClasses.div_tags. */
        def div_tags = TokenizerClasses.div_tags

        /** Quotation tag names, interpolated as a regex alternative; defaults to TokenizerClasses.q_tags. */
        def q_tags = TokenizerClasses.q_tags

        /** Correction tag names used by the reg_corr_tags_no_seg* patterns; defaults to TokenizerClasses.corr_tags_no_seg. */
        def corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg

        /** Defaults to TokenizerClasses.corr_tags; not used by any pattern of this class. */
        def corr_tags = TokenizerClasses.corr_tags

        /** Regex fragment for strong punctuation, taken from TokenizerClasses.punct_strong. */
        def strongPunct = TokenizerClasses.punct_strong
102 321 mdecorde
103 321 mdecorde
        /**
         * Overrides the tag classes used to recognise division, quotation and
         * correction tags, then rebuilds the patterns derived from them.
         *
         * The reg_* pattern fields interpolate the tag classes when they are
         * created, so they must be rebuilt here; otherwise they would keep
         * matching the default classes while the inline patterns of filter()
         * use the overridden ones.
         *
         * @param args object exposing get(key), presumably a Map; the keys read
         *             are div_tags, q_tags, corr_tags_no_seg and corr_tags. A key
         *             that is absent (null) leaves the current value untouched.
         * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
         */
        void SetUsedParam(Object args) {
                try {
                        def value
                        // Only override what the caller actually supplied: a null would be
                        // interpolated as the literal text "null" in the patterns.
                        if ((value = args.get("div_tags")) != null) div_tags = value
                        if ((value = args.get("q_tags")) != null) q_tags = value
                        if ((value = args.get("corr_tags_no_seg")) != null) corr_tags_no_seg = value
                        if ((value = args.get("corr_tags")) != null) corr_tags = value

                        // Rebuild every pattern that embeds one of the tag classes above.
                        reg_corr_tags_no_seg = /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/
                        reg_corr_tags_no_seg_alone = "<($corr_tags_no_seg)>"
                        reg_corr_tags_no_seg2 = /^(.*)<\/($corr_tags_no_seg)>(.*)$/
                        reg_block_tags = /^(.*)<\/($div_tags|$q_tags)>(.*)$/
                        reg_block_tag_alone = "<($div_tags|$q_tags)>"
                }
                catch(Exception e) {
                        // Typically reached when args is null or has no get(key) method.
                        System.err.println(e);
                        System.err.println("TagginSentences needs 1 Map with args  :\n div_tags, q_tags, extraword_tags, " +
                                        "corr_tags_no_seg, corr_tags, word_tags, intraword_tags")
                }
        }
119 321 mdecorde
120 321 mdecorde
        /**
         * Start-of-processing hook; this filter has nothing to prepare.
         *
         * @return always false (the value the empty body already produced
         *         implicitly). NOTE(review): how Filter interprets this result
         *         is not visible here - confirm against the caller.
         * @see org.txm.importer.filters.Filter#before()
         */
        boolean before() {
                // System.out.println("begin sentences");
                return false
        }
126 321 mdecorde
127 321 mdecorde
        /**
         * End-of-processing hook; intentionally empty for this filter.
         *
         * @see org.txm.importer.filters.Filter#after()
         */
        void after() {
                // System.out.println("end sentences");
        }
133 321 mdecorde
134 479 mdecorde
        /** Pattern that finds any non-whitespace character. */
        def reg_empty_test = /\S/

        /**
         * Checks that the text captured before and after the tag of the current
         * line (_before and _after) holds nothing but whitespace, and prints an
         * error naming the line otherwise.
         *
         * @return true if both captures are null, empty or whitespace-only;
         *         false if either contains a non-whitespace character
         */
        boolean testBeforeAfter() {
                // Find semantics (=~) are required here: a full match (==~) against /\S/
                // only succeeds on a one-character string, so stray text was almost
                // never detected. A null capture (optional group) counts as empty.
                boolean strayBefore = _before != null && (_before =~ reg_empty_test).find()
                boolean strayAfter = _after != null && (_after =~ reg_empty_test).find()
                if (strayBefore || strayAfter) {
                        print "TagSentences : ERROR: $line";
                        return false
                }
                return true
        }
148 321 mdecorde
149 321 mdecorde
        /**
         * Stores groups 1 and 3 of the first match of m as the text found before
         * and after the tag, then runs testBeforeAfter() on them.
         *
         * @param m matcher of a three-group pattern whose groups 1 and 3 surround the tag
         */
        void affect13(def m) {
                def firstMatch = m[0]
                _before = firstMatch[1]
                _after = firstMatch[3]
                testBeforeAfter()
        }
159 321 mdecorde
160 321 mdecorde
        /**
         * Stores groups 1 and 2 of the first match of m as the text found before
         * and after the tag, then runs testBeforeAfter() on them.
         *
         * @param m matcher of a two-group pattern whose groups 1 and 2 surround the tag
         */
        void affect12(def m) {
                def firstMatch = m[0]
                _before = firstMatch[1]
                _after = firstMatch[2]
                testBeforeAfter()
        }
170 321 mdecorde
171 321 mdecorde
        /** Exclusive lower bound of the sentence-counter window in which debug traces are printed. */
        int MINPRINT = 0

        /** Exclusive upper bound of that window; with both bounds at 0 no trace is ever printed. */
        int MAXPRINT = 0

        // The patterns below are built once, when the instance is created, from
        // the values the tag-class fields hold at that moment.

        /** A line made of a single XML comment. */
        def reg_comment = /\A\s*<!--.*-->\s*\Z/
        /** An opening <s> tag, with or without attributes (groups: before, attributes, after). */
        def reg_out_of_sentence = /^(.*)<s( [^>]*)?>(.*)$/
        /** Any opening <s> tag; used to rewrite it with a fresh n attribute. */
        def reg_sentence_with_no_n_attribute = /<s( [^>]*)?>/
        /** A closing </s> tag (groups: before, after). */
        def reg_end_of_sentence = /^(.*)<\/s>(.*)$/
        /** A <w> element whose first attribute is type="pon" (groups: before, after). */
        def reg_punct = /^(.*)<w type="pon"[^>]*>.*<\/w>(.*)$/
        /** A <w> element whose content matches strongPunct (groups: before, after). */
        def reg_strong_punct = /^(.*)<w [^>]*>$strongPunct<\/w>(.*)/
        /** A <w> element carrying at least one attribute (groups: before, after). */
        def reg_word = /^(.*)<w .*<\/w>(.*)/
        /** A line holding only an opening correction tag (groups: whole tag, name, attributes). */
        def reg_corr_tags_no_seg = /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/
        /** An attribute-less opening correction tag, for a whole-line String.matches() test. */
        def reg_corr_tags_no_seg_alone = "<($corr_tags_no_seg)>"
        /** A closing correction tag (groups: before, name, after). */
        def reg_corr_tags_no_seg2 = /^(.*)<\/($corr_tags_no_seg)>(.*)$/
        /** A closing division or quotation tag (groups: before, name, after). */
        def reg_block_tags = /^(.*)<\/($div_tags|$q_tags)>(.*)$/
        /**
         * An attribute-less division or quotation tag, for a whole-line String.matches() test.
         * NOTE(review): written without "/" although filter() uses it next to the closing-tag
         * pattern - confirm whether "</...>" was intended.
         */
        def reg_block_tag_alone = "<($div_tags|$q_tags)>"
189 479 mdecorde
190 321 mdecorde
        /**
         * Processes the current line (inherited variable line) and writes the
         * result to output, inserting or renumbering sentence tags as needed.
         *
         * XML comment lines are copied unchanged. Any other line is handed to
         * the handler of the current state (linetype "out" or "in"); an unknown
         * state is reported on both standard output and standard error.
         *
         * @see org.txm.importer.filters.Filter#filter()
         */
        void filter() {
                if (scounter > MINPRINT && scounter < MAXPRINT)
                        println linetype+"LINE : "+line

                if (line ==~ reg_comment) {
                        output.print(line)
                } else if (linetype == "out") {
                        filterOutsideSentence()
                } else if (linetype == "in") {
                        filterInsideSentence()
                } else {
                        print "Error in sentence tagging : $line"
                        System.err.println("Error in sentence tagging : $line")
                }
        }

        /** Handles one line while no sentence is open (linetype == "out"). */
        private void filterOutsideSentence() {
                if ( (m = line =~ reg_out_of_sentence) ) {
                        // Existing <s> tag: renumber it and enter the sentence.
                        affect13(m)
                        if (pending == "yes") {
                                output.write("</s>\n")
                                pending = "no"
                        }

                        scounter++
                        line = (line =~ reg_sentence_with_no_n_attribute).replaceAll("<s n=\"$scounter\">")
                        output.print(line+"\n")
                        linetype = "in"
                } else if ( (m = line =~ reg_end_of_sentence) ) {
                        // Existing </s> tag: it closes the sentence left pending, if any.
                        affect12(m)
                        if (pending == "yes") {
                                linetype = "out"
                                pending = "no"
                                output.print(line)
                        } else {
                                // The line is reported but not written.
                                System.err.println("Found </s>, but pending = no and linetype= out : "+line+" ; scount $scounter")
                        }
                } else if ( (m = line =~ reg_punct) && pending == "yes") {
                        // Punctuation following a sentence that is not closed yet: keep it
                        // inside that sentence.
                        affect12(m)
                        output.print(line)
                } else if ( (m = line =~ reg_word) ) {
                        // A word: close the pending sentence and open a new one.
                        affect12(m)
                        if (pending == "yes") {
                                output.write("</s>\n")
                                pending = "no"
                        }
                        scounter++
                        output.print("<s n=\"$scounter\">\n"+line)
                        linetype = "in"
                } else if ( (m = line =~ reg_corr_tags_no_seg)
                                || line.matches(reg_corr_tags_no_seg_alone)) {
                        // Opening correction tag.
                        if (pending == "yes") {
                                output.write("</s>\n")
                                pending = "no"
                        }
                        def tag_name = m[0][2]
                        def tag_end = m[0][3]

                        if (tag_end != null && tag_end.contains("multi_s")) {
                                // A multi-sentence correction is handled like a division tag.
                                open_div_tags.push(tag_name)
                                LASTopen_div_tags = tag_name
                        } else {
                                open_corr_tags.push(tag_name)

                                scounter++
                                output.print("<s n=\"$scounter\">\n") // the sentence starts before the tag
                                linetype = "in"
                        }
                        if (scounter > MINPRINT && scounter < MAXPRINT) {
                                println "stacks $open_corr_tags ; $open_div_tags"
                        }
                        output.print(line)
                } else if ( (m = line =~ reg_corr_tags_no_seg2) ) {
                        // Closing correction tag.
                        affect13(m)
                        if (scounter > MINPRINT && scounter < MAXPRINT)
                                println "closing corr tag "+line

                        if (open_corr_tags.size() == 0) {
                                // No plain correction is open: the tag closes a multi_s one.
                                if (open_div_tags.size() > 0) {
                                        open_div_tags.pop()
                                } else {
                                        // Unmatched closing tag: pop() would throw on the empty
                                        // stack, so report it and let the line through.
                                        System.err.println("TagSentences : closing correction tag without opening tag : "+line)
                                }
                                if (open_div_tags.size() == 0) {
                                        if (pending == "yes") {
                                                output.print("</s>\n"+line) // like a division tag
                                                pending = "no"
                                        } else {
                                                output.print(line)
                                        }
                                } else {
                                        output.print(line)
                                }
                        } else {
                                open_corr_tags.pop()

                                if (pending == "yes") {
                                        if (open_corr_tags.size() == 0) {
                                                output.print("</s>\n")
                                                output.print(line)
                                                linetype = "out"
                                                pending = "no"
                                        } else {
                                                output.print(line)
                                        }
                                } else {
                                        output.print(line)
                                }
                        }

                        if (scounter > MINPRINT && scounter < MAXPRINT) {
                                println "stacks "+open_corr_tags+" ; "+open_div_tags
                        }
                } else if ( (m = line =~ reg_block_tags) ||
                                line.matches(reg_block_tag_alone)) {
                        // Closing division or quotation tag: close the pending sentence first.
                        if (pending == "yes") {
                                output.print("</s>\n$line")
                                pending = "no"
                        } else {
                                output.print(line)
                        }
                } else if ( (m = line =~ /^(.*)<($div_tags|$q_tags)>(.*)$/) ||
                                (m = line =~ /^(.*)<($div_tags|$q_tags)( [^>]*)?>(.*)$/) ||
                                line.matches("<($div_tags|$q_tags)( [^>]*)?>")) {
                        // Opening division or quotation tag.
                        if (pending == "yes") {
                                output.print("</s>\n")
                                pending = "no"
                        }

                        output.print("$line")
                } else {
                        output.print(line)
                }
        }

        /** Handles one line while a sentence is open (linetype == "in"). */
        private void filterInsideSentence() {
                if ( (m = line =~ reg_strong_punct) ) {
                        // Strong punctuation ends the sentence. The closing tag is deferred
                        // (pending) so that following punctuation can still join it, and the
                        // state only changes when no correction tag is open.
                        affect12(m)

                        if (open_corr_tags.size() == 0) {
                                linetype = "out"
                        }
                        pending = "yes"
                        output.print(line)
                } else if ( (m = line =~ reg_word) ) {
                        // A word with attributes: plain sentence content.
                        affect12(m)
                        output.print(line)
                } else if ( (m = line =~ /^(.*)<w( [^>]*)?>.*<\/w>(.*)/) ) {
                        // A word without attributes. This pattern has three groups, so the
                        // surrounding text is in groups 1 and 3.
                        affect13(m)
                        output.print(line)
                } else if ( (m = line =~ reg_corr_tags_no_seg)
                                || line.matches(reg_corr_tags_no_seg_alone)) {
                        // Opening correction tag.
                        if (scounter > MINPRINT && scounter < MAXPRINT)
                                System.out.println("open corr "+line)
                        def tag_name = m[0][2]
                        def tag_end = m[0][3]

                        if (tag_end != null && tag_end.contains("multi_s")) {
                                // A multi-sentence correction is handled like a division tag:
                                // it ends the current sentence.
                                open_div_tags.push(tag_name)
                                LASTopen_div_tags = tag_name
                                output.print("</s>\n")
                                linetype = "out"
                        } else {
                                open_corr_tags.push(tag_name)
                        }
                        if (scounter > MINPRINT && scounter < MAXPRINT) {
                                println "stacks "+open_corr_tags+" ; "+open_div_tags
                        }
                        output.print(line)
                } else if ( (m = line =~ reg_corr_tags_no_seg2) ) {
                        // Closing correction tag.
                        if (scounter > MINPRINT && scounter < MAXPRINT)
                                System.out.println("closing corr "+line)
                        affect13(m)

                        if (open_corr_tags.size() == 0) {
                                // No plain correction is open: the tag closes a multi_s one.
                                // "> 0" (was "> 1"): with exactly one open, the closing tag used
                                // to be reported as an error and dropped from the output.
                                if (open_div_tags.size() > 0) {
                                        output.print("</s>\n"+line) // like a division tag
                                        linetype = "out"
                                        open_div_tags.pop()
                                } else {
                                        // NOTE(review): the line is not written in this case - confirm intended.
                                        println("error : found a non classified corr : "+line)
                                }
                        } else {
                                open_corr_tags.pop()
                                output.print(line)
                                if (pending == "yes" && open_corr_tags.size() == 0) {
                                        // That was the last open correction: close the deferred sentence.
                                        output.print("</s>\n")
                                        linetype = "out"
                                        pending = "no"
                                }
                        }
                        if (scounter > MINPRINT && scounter < MAXPRINT) {
                                println "stacks "+open_corr_tags+" ; "+open_div_tags
                        }
                } else if ( (m = line =~ reg_end_of_sentence) ) {
                        // Existing </s> tag: leave the sentence. This branch used to test the
                        // opening-<s> pattern, so an explicit </s> never ended the sentence.
                        affect12(m)

                        linetype = "out"
                        output.print(line)
                } else if ( (m = line =~ reg_block_tags) ||
                                line.matches("</($div_tags|$q_tags)>")) {
                        // Closing division or quotation tag: close the sentence before it.
                        linetype = "out"
                        output.print("</s>\n$line")
                } else if ( (m = line =~ /^(.*)<($div_tags|$q_tags)>(.*)$/) ||
                                (m = line =~ /^(.*)<($div_tags|$q_tags)( [^>]*)?>(.*)$/) ||
                                line.matches("<($div_tags|$q_tags)( [^>]*)?>")) {
                        // Opening division or quotation tag: close the sentence, write the
                        // tag, then reopen a sentence after it.
                        // NOTE(review): the reopened sentence reuses the current number - confirm intended.
                        output.print("</s>\n")
                        output.print("$line")
                        output.print("<s n=\""+scounter+"\">\n")
                } else {
                        if (scounter > MINPRINT && scounter < MAXPRINT)
                                println "ELSE "+line

                        output.print(line)
                }
        }
478 321 mdecorde
}