// Source: root / tmp / org.txm.groovy.core / src / groovy / filters / TagSentences / TagSentences.groovy @ 479

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
// 
// 
// 
// $LastChangedDate:$
// $LastChangedRevision:$
// $LastChangedBy:$ 
//
package filters.TagSentences;

import java.util.regex.*

import org.txm.importer.filters.*
import org.txm.tokenizer.*

import filters.CutHeader.*
import filters.FusionHeader.*

/**
 * Line-oriented filter that wraps runs of {@code <w>} elements in numbered
 * {@code <s n="...">} sentence elements.
 *
 * The filter is a two-state machine driven by {@link #filter()}:
 * {@code linetype == "out"} means no sentence is currently open,
 * {@code linetype == "in"} means a sentence is open. The {@code pending} flag
 * ("yes"/"no") records that a strong punctuation mark has been seen and that
 * the closing {@code </s>} still has to be written.
 *
 * {@code line} and {@code output} are not declared in this class; they are
 * presumably inherited from {@code Filter} (current input line and output
 * writer) -- confirm against the base class.
 */
class TagSentences extends Filter {

    /** The counter. Not read or written anywhere in this class. */
    int counter;

    /** Matcher of the most recent regex test performed by {@link #filter()}. */
    def m;

    /** The segment. Not read or written anywhere in this class. */
    def segment;

    /** Current state: "out" (no sentence open) or "in" (inside a sentence). */
    def linetype = "out";

    /** Stack of the names of the currently open correction tags. */
    def open_corr_tags = new LinkedList<String>();

    /** Not read or written anywhere in this class. */
    def LASTopen_corr_tags = "";

    /** Stack of the names of open correction tags that are handled like division tags ("multi_s"). */
    def open_div_tags = new LinkedList<String>();

    /** Name of the last tag pushed on {@link #open_div_tags}. */
    def LASTopen_div_tags = "";

    /** The corr. Not read or written anywhere in this class. */
    def corr = "no";

    /** The corr_name. Not read or written anywhere in this class. */
    def corr_name;

    /** The corr_tag. Not read or written anywhere in this class. */
    def corr_tag;

    /** "yes" when a closing {@code </s>} still has to be written, "no" otherwise. */
    def pending = "no";

    /** Number of the last sentence opened; used for the {@code n} attribute. */
    def scounter = 0;

    /** Text captured before the tag matched on the current line. */
    def _before;

    /** Text captured after the tag matched on the current line. */
    def _after;

    /** Regex alternation of division tag names. */
    def div_tags = TokenizerClasses.div_tags;

    /** Regex alternation of quotation tag names. */
    def q_tags = TokenizerClasses.q_tags;

    /** Regex alternation of the correction tag names tested by the corr-tag patterns below. */
    def corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;

    /** The corr_tags. Only assigned in this class, never read. */
    def corr_tags = TokenizerClasses.corr_tags;

    /** Regex fragment for strong punctuation, interpolated into {@link #reg_strong_punct}. */
    def strongPunct = TokenizerClasses.punct_strong;

    /**
     * Overrides the tag lists with the values found in {@code args}.
     *
     * Only keys that are present (non-null) replace the defaults. The regex
     * fields that interpolate the tag lists are rebuilt afterwards: GString
     * placeholders are evaluated when the string is created, so without the
     * rebuild those patterns would keep matching the default tag lists.
     *
     * @param args expected to offer {@code get(String)} (a Map) with the keys
     *        {@code div_tags}, {@code q_tags}, {@code corr_tags_no_seg} and
     *        {@code corr_tags}; any failure is reported on stderr
     * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
     */
    void SetUsedParam(Object args) {
        try {
            def newDivTags = args.get("div_tags");
            def newQTags = args.get("q_tags");
            def newCorrTagsNoSeg = args.get("corr_tags_no_seg");
            def newCorrTags = args.get("corr_tags");

            if (newDivTags != null) div_tags = newDivTags;
            if (newQTags != null) q_tags = newQTags;
            if (newCorrTagsNoSeg != null) corr_tags_no_seg = newCorrTagsNoSeg;
            if (newCorrTags != null) corr_tags = newCorrTags;
        }
        catch(Exception e) {
            System.err.println(e);
            System.err.println("TagSentences needs 1 Map with args  :\n div_tags, q_tags, " +
                    "corr_tags_no_seg, corr_tags")
        }
        rebuildTagPatterns();
    }

    /**
     * Called before the lines are filtered; does nothing.
     *
     * NOTE(review): there is no return statement although the declared type is
     * boolean, so the result is Groovy's default for an empty body (expected to
     * be false) -- confirm how Filter uses this value before changing it.
     *
     * @see org.txm.importer.filters.Filter#before()
     */
    boolean before() {
        // System.out.println("begin sentences");
    }

    /**
     * Called after the lines are filtered; does nothing.
     *
     * @see org.txm.importer.filters.Filter#after()
     */
    void after() {
        // System.out.println("end sentences");
    }

    /** Matches any single non-whitespace character. */
    def reg_empty_test = /\S/

    /**
     * Checks that nothing but whitespace surrounds the tag matched on the
     * current line, using the values stored in {@code _before} and {@code _after}.
     *
     * @return false (after printing an error with the current line) if either
     *         side contains a non-whitespace character, true otherwise
     */
    boolean testBeforeAfter() {
        if (hasVisibleText(_before) || hasVisibleText(_after)) {
            print "TagSentences : ERROR: $line";
            return false;
        }
        return true;
    }

    /** Tells whether text is non-null and contains at least one non-whitespace character. */
    private boolean hasVisibleText(def text) {
        // '=~' + find() searches inside the text; '==~' would require the whole
        // text to be exactly one non-whitespace character.
        return text != null && (text =~ reg_empty_test).find();
    }

    /**
     * Stores groups 1 and 3 of the first match as the text before/after the
     * tag, then checks them with {@link #testBeforeAfter()}.
     *
     * @param m a matcher that has matched a pattern with at least three groups
     */
    void affect13(def m) {
        _before = m[0][1];
        _after = m[0][3];
        testBeforeAfter();
    }

    /**
     * Stores groups 1 and 2 of the first match as the text before/after the
     * tag, then checks them with {@link #testBeforeAfter()}.
     *
     * @param m a matcher that has matched a pattern with at least two groups
     */
    void affect12(def m) {
        _before = m[0][1];
        _after = m[0][2];
        testBeforeAfter();
    }

    /** Debug traces are printed only while MINPRINT < scounter < MAXPRINT. */
    int MINPRINT = 0

    /** Upper bound (exclusive) of the sentence numbers that are traced. */
    int MAXPRINT = 0

    def reg_comment = /\A\s*<!--.*-->\s*\Z/
    def reg_out_of_sentence = /^(.*)<s( [^>]*)?>(.*)$/
    def reg_sentence_with_no_n_attribute = /<s( [^>]*)?>/
    def reg_end_of_sentence = /^(.*)<\/s>(.*)$/
    def reg_punct = /^(.*)<w type="pon"[^>]*>.*<\/w>(.*)$/
    def reg_strong_punct = /^(.*)<w [^>]*>$strongPunct<\/w>(.*)/
    def reg_word = /^(.*)<w .*<\/w>(.*)/
    // The five patterns below interpolate the tag lists; keep them in sync
    // with rebuildTagPatterns().
    def reg_corr_tags_no_seg = /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/
    def reg_corr_tags_no_seg_alone = "<($corr_tags_no_seg)>"
    def reg_corr_tags_no_seg2 = /^(.*)<\/($corr_tags_no_seg)>(.*)$/
    def reg_block_tags = /^(.*)<\/($div_tags|$q_tags)>(.*)$/
    def reg_block_tag_alone = "<($div_tags|$q_tags)>"

    /** Rebuilds the patterns that embed the tag lists, from the current field values. */
    private void rebuildTagPatterns() {
        reg_corr_tags_no_seg = /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/;
        reg_corr_tags_no_seg_alone = "<($corr_tags_no_seg)>";
        reg_corr_tags_no_seg2 = /^(.*)<\/($corr_tags_no_seg)>(.*)$/;
        reg_block_tags = /^(.*)<\/($div_tags|$q_tags)>(.*)$/;
        reg_block_tag_alone = "<($div_tags|$q_tags)>";
    }

    /** Tells whether debug traces are enabled for the current sentence number. */
    private boolean tracing() {
        return scounter > MINPRINT && scounter < MAXPRINT;
    }

    /**
     * Processes the current {@code line}: comment lines are copied unchanged,
     * any other line is handled according to the current state
     * ({@code linetype}), which may write sentence tags around it.
     *
     * @see org.txm.importer.filters.Filter#filter()
     */
    void filter() {
        if (tracing())
            println linetype+"LINE : "+line;

        if (line ==~ reg_comment) {
            output.print(line);
        }
        else if (linetype == "out") {
            filterOutsideSentence();
        }
        else if (linetype == "in") {
            filterInsideSentence();
        }
        else {
            print "Error in sentence tagging : $line";
            System.err.println("Error in sentence tagging : $line");
        }
    }

    /** Handles the current line when no sentence is open (linetype == "out"). */
    private void filterOutsideSentence() {
        if ( (m = line =~ reg_out_of_sentence) ) {
            // explicit sentence opening tag: close any pending sentence,
            // renumber the tag and enter the sentence
            affect13(m);
            if (pending == "yes") {
                output.write("</s>\n")
                pending = "no";
            }

            scounter++;
            line = (line =~ reg_sentence_with_no_n_attribute).replaceAll("<s n=\"$scounter\">");
            output.print(line+"\n");
            linetype = "in";
        }
        else if ( (m = line =~ reg_end_of_sentence) ) {
            // explicit sentence closing tag
            affect12(m);
            if (pending == "yes") {
                linetype = "out";
                pending = "no";
                output.print(line);
            }
            else {
                // no sentence to close: the tag is reported and not written
                System.err.println("Found </s>, but pending = no and linetype= out : "+line+" ; scount $scounter");
            }
        }
        else if ( (m = line =~ reg_punct) && pending == "yes") {
            // punctuation while the previous sentence is not closed yet:
            // it stays attached to that sentence
            affect12(m);
            output.print(line);
        }
        else if ( (m = line =~ reg_word) ) {
            // a word: close the pending sentence and open a new one
            affect12(m);
            if (pending == "yes") {
                output.write("</s>\n")
                pending = "no";
            }
            scounter++;
            output.print("<s n=\"$scounter\">\n"+line);
            linetype = "in";
        }
        else if ( (m = line =~ reg_corr_tags_no_seg)
                || line.matches(reg_corr_tags_no_seg_alone)) {
            // opening correction tag
            if (pending == "yes") {
                output.write("</s>\n")
                pending = "no";
            }
            def tag_name = m[0][2];
            def tag_end = m[0][3];

            if (tag_end != null && tag_end.contains("multi_s")) {
                // the correction spans several sentences: handle it like a division tag
                open_div_tags.push(tag_name);
                LASTopen_div_tags = tag_name;
            } else {
                open_corr_tags.push(tag_name);

                scounter++;
                output.print("<s n=\"$scounter\">\n"); // open a <s>
                linetype = "in";
            }
            if (tracing()) {
                println "stacks $open_corr_tags ; $open_div_tags"
            }
            output.print(line);
        }
        else if ( (m = line =~ reg_corr_tags_no_seg2) ) {
            // closing correction tag
            affect13(m);
            if (tracing())
                println "closing corr tag "+line

            if (open_corr_tags.size() == 0) {
                // no plain correction tag is open: the tag closes a division-like correction
                if (open_div_tags.isEmpty()) {
                    // LinkedList.pop() throws on an empty list; report the
                    // unbalanced tag instead of aborting the filter
                    System.err.println("Found closing corr tag with no open corr or div tag : "+line);
                } else {
                    open_div_tags.pop();
                }
                if (open_div_tags.size() == 0) {
                    if (pending == "yes") {
                        output.print("</s>\n"+line); // like a div tag
                        pending = "no";
                    } else {
                        output.print(line); // like a div tag
                    }
                } else {
                    output.print(line); // like a div tag
                }
            } else {
                open_corr_tags.pop();

                if (pending == "yes") {
                    if (open_corr_tags.size() == 0) {
                        output.print("</s>\n");
                        output.print(line);
                        linetype = "out";
                        pending = "no";
                    } else {
                        output.print(line);
                    }
                } else {
                    output.print(line);
                }
            }

            if (tracing()) {
                println "stacks "+open_corr_tags+" ; "+open_div_tags
            }
        }
        else if ( (m = line =~ reg_block_tags) ||
                line.matches(reg_block_tag_alone)) {
            // closing division or quotation tag (or a bare opening one)
            if (pending == "yes") {
                output.print("</s>\n$line");
                pending = "no";
            } else {
                output.print(line);
            }
        }
        else if ( (m = line =~ /^(.*)<($div_tags|$q_tags)>(.*)$/) ||
                (m = line =~ /^(.*)<($div_tags|$q_tags)( [^>]*)?>(.*)$/) ||
                line.matches("<($div_tags|$q_tags)( [^>]*)?>")) {
            // opening division or quotation tag
            if (pending == "yes") {
                output.print("</s>\n");
                pending = "no";
            }

            output.print("$line");
        }
        else {
            output.print(line);
        }
    }

    /** Handles the current line when a sentence is open (linetype == "in"). */
    private void filterInsideSentence() {
        if ( (m = line =~ reg_strong_punct) ) {
            // strong punctuation ends the sentence; the closing tag is deferred
            affect12(m);
            pending = "yes"
            if (open_corr_tags.size() == 0) {
                // no correction in progress: leave the sentence right away
                linetype = "out";
            }
            // otherwise the sentence is closed after the open corrections
            output.print(line);
        }
        else if ( (m = line =~ reg_word) ) {
            // a word with attributes
            affect12(m);
            output.print(line);
        }
        else if ( (m = line =~ /^(.*)<w( [^>]*)?>.*<\/w>(.*)/) ) {
            // a word without attributes; the trailing text is group 3 here
            affect13(m);
            output.print(line);
        }
        else if ( (m = line =~ reg_corr_tags_no_seg)
                || line.matches(reg_corr_tags_no_seg_alone)) {
            // opening correction tag
            if (tracing())
                System.out.println("open corr "+line);
            def tag_name = m[0][2];
            def tag_end = m[0][3];

            if (tag_end != null && tag_end.contains("multi_s")) {
                // the correction spans several sentences: handle it like a division tag
                open_div_tags.push(tag_name);
                LASTopen_div_tags = tag_name;
                output.print("</s>\n");
                linetype="out"
            }
            else {
                open_corr_tags.push(tag_name);
            }
            if (tracing()) {
                println "stacks "+open_corr_tags+" ; "+open_div_tags
            }
            output.print(line);
        }
        else if ( (m = line =~ reg_corr_tags_no_seg2) ) {
            // closing correction tag
            if (tracing())
                System.out.println("closing corr "+line);
            affect13(m);

            if (open_corr_tags.size() == 0) {
                // no plain correction tag is open
                // NOTE(review): the test is "> 1", and in the else branch the
                // line is reported but never written to the output -- confirm
                // whether "> 0" and an output.print(line) were intended.
                if (open_div_tags.size() > 1) {
                    output.print("</s>\n"+line); // like a div tag
                    linetype = "out";
                    open_div_tags.pop();
                } else {
                    println("error : found a non classified corr : "+line);
                }
            } else {
                open_corr_tags.pop();
                output.print(line);
                if (pending == "yes" && open_corr_tags.size() == 0) {
                    // it was the last open correction: close the deferred sentence
                    output.print("</s>\n");
                    linetype = "out";
                    pending = "no";
                }
            }
            if (tracing()) {
                println "stacks "+open_corr_tags+" ; "+open_div_tags
            }
        }
        else if ( (m = line =~ reg_end_of_sentence) ) {
            // explicit sentence closing tag: the sentence is closed by the
            // line itself, so nothing remains pending
            affect12(m);

            linetype = "out";
            pending = "no";
            output.print(line);
        }
        else if ( (m = line =~ reg_block_tags) ||
                line.matches("</($div_tags|$q_tags)>")) {
            // closing division or quotation tag: close the sentence first
            linetype = "out";
            output.print("</s>\n$line");
        }
        else if ( (m = line =~ /^(.*)<($div_tags|$q_tags)>(.*)$/) ||
                (m = line =~ /^(.*)<($div_tags|$q_tags)( [^>]*)?>(.*)$/) ||
                line.matches("<($div_tags|$q_tags)( [^>]*)?>")) {
            // opening division or quotation tag: close the sentence, write
            // the tag, then reopen a sentence
            // NOTE(review): scounter is not incremented, so the reopened
            // sentence carries the same n as the one just closed -- confirm.
            output.print("</s>\n")
            output.print("$line");
            output.print("<s n=\""+scounter+"\">\n"); // open a <s>
        }
        else {
            if (tracing())
                println "ELSE "+line

            output.print(line);
        }
    }
}