Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / filters / TagSentences / TagSentences.groovy @ 187

History | View | Annotate | Download (12.2 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
// 
// 
// 
// $LastChangedDate:$
// $LastChangedRevision:$
// $LastChangedBy:$ 
//
28
package filters.TagSentences;
29

    
30
import java.util.regex.*
31

    
32
import org.txm.importer.filters.*
33
import org.txm.tokenizer.*
34

    
35
import filters.CutHeader.*
36
import filters.FusionHeader.*
37

    
38
// TODO: Auto-generated Javadoc
39
/**
 * Line-oriented filter that wraps words in numbered {@code <s n="...">} sentence elements.
 *
 * <p>{@link #filter()} is called with the current input line in {@code line} and writes to
 * {@code output} (both come from the {@code Filter} superclass). The filter is a two-state
 * machine driven by {@link #linetype}:</p>
 * <ul>
 *   <li>{@code "out"}: no sentence is open. A {@code <w ...>} line or an opening correction
 *       tag opens a new sentence and switches to {@code "in"}.</li>
 *   <li>{@code "in"}: a sentence is open. A strong punctuation word marks the sentence as
 *       {@link #pending} (to be closed) and, when no correction tag is open, switches back
 *       to {@code "out"}. The closing {@code </s>} is only written when the next relevant
 *       line shows up, so that trailing punctuation stays inside the sentence.</li>
 * </ul>
 * <p>XML comment lines are copied unchanged in both states.</p>
 */
class TagSentences extends Filter {

    /** Not read or written by the code of this class. */
    int counter;

    /** Matcher produced by the last regular expression tried on the current line. */
    def m;

    /** Not read or written by the code of this class. */
    def segment;

    /** Current state: {@code "out"} (outside a sentence) or {@code "in"} (inside a sentence). */
    def linetype = "out";

    /** Stack of the names of the currently open correction tags. */
    def open_corr_tags = new LinkedList<String>();

    /** Not read or written by the code of this class. */
    def LASTopen_corr_tags = "";

    /** Stack of the names of open correction tags flagged "multi_s", which are handled like division tags. */
    def open_div_tags = new LinkedList<String>();

    /** Name of the last tag pushed on {@link #open_div_tags}. */
    def LASTopen_div_tags = "";

    /** Not read or written by the code of this class. */
    def corr = "no";

    /** Not read or written by the code of this class. */
    def corr_name;

    /** Not read or written by the code of this class. */
    def corr_tag;

    /** {@code "yes"} when a strong punctuation was seen and the closing {@code </s>} is still to be written. */
    def pending = "no";

    /** Number of the last opened sentence; used as the value of the {@code n} attribute. */
    def scounter = 0;

    /** Text found before the matched element on the current line. */
    def _before;

    /** Text found after the matched element on the current line. */
    def _after;

    /** Regex alternation of division tag names. */
    def div_tags = TokenizerClasses.div_tags;

    /** Regex alternation of quotation tag names. */
    def q_tags = TokenizerClasses.q_tags;

    /** Regex alternation of correction tag names matched by the open/close correction rules. */
    def corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;

    /** Regex alternation of correction tag names; stored but not used by the rules of this class. */
    def corr_tags = TokenizerClasses.corr_tags;

    /** Regex matching the content of a strong punctuation word. */
    def strongPunct = TokenizerClasses.punct_strong;

    /**
     * Overrides the tag-name patterns with the values found in {@code args}.
     *
     * <p>The keys read are {@code div_tags}, {@code q_tags}, {@code corr_tags_no_seg} and
     * {@code corr_tags}. A key that is absent (or mapped to {@code null}) leaves the current
     * value untouched, so that the patterns never end up interpolating {@code null}.</p>
     *
     * @param args a map-like object answering {@code get(String)}
     * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
     */
    void SetUsedParam(Object args) {
        try {
            def value = args.get("div_tags");
            if (value != null) div_tags = value;

            value = args.get("q_tags");
            if (value != null) q_tags = value;

            value = args.get("corr_tags_no_seg");
            if (value != null) corr_tags_no_seg = value;

            value = args.get("corr_tags");
            if (value != null) corr_tags = value;
        }
        catch (Exception e) {
            // best effort: report the problem and keep the values set so far
            System.err.println(e);
            System.err.println("TagSentences needs 1 Map with args  :\n div_tags, q_tags, " +
                    "corr_tags_no_seg, corr_tags")
        }
    }

    /**
     * Hook called before the first line is filtered; does nothing.
     *
     * @see org.txm.importer.filters.Filter#before()
     */
    boolean before() {
        // NOTE(review): no value is returned explicitly although the method is declared
        // boolean - confirm which value the Filter superclass expects here.
        // System.out.println("begin sentences");
    }

    /**
     * Hook called after the last line is filtered; does nothing.
     *
     * @see org.txm.importer.filters.Filter#after()
     */
    void after() {
        // System.out.println("end sentences");
    }

    /**
     * Checks that the matched element stands alone on its line, i.e. that neither
     * {@link #_before} nor {@link #_after} contains a non-blank character.
     * Prints an error with the current line otherwise.
     *
     * @return true when both surrounding texts are null, empty or blank
     */
    boolean testBeforeAfter() {
        // search for a non-blank character anywhere in the text; a full match (==~) would
        // only succeed on a text made of exactly one character
        def hasContent = { text -> text != null && (text.toString() =~ /\S/).find() }
        if (hasContent(_before) || hasContent(_after)) {
            print "TagSentences : ERROR: $line";
            return false;
        }
        return true;
    }

    /**
     * Stores groups 1 and 3 of the first match as the before/after texts and checks them.
     *
     * @param m a matcher that has matched and defines at least 3 groups
     */
    void affect13(def m) {
        _before = m[0][1];
        _after = m[0][3];
        testBeforeAfter();
    }

    /**
     * Stores groups 1 and 2 of the first match as the before/after texts and checks them.
     *
     * @param m a matcher that has matched and defines at least 2 groups
     */
    void affect12(def m) {
        _before = m[0][1];
        _after = m[0][2];
        testBeforeAfter();
    }

    /** Debug traces are printed while the sentence counter is strictly above this value. */
    int MINPRINT = 0

    /** Debug traces are printed while the sentence counter is strictly below this value. */
    int MAXPRINT = 0

    /**
     * Processes the current line: copies XML comments unchanged, otherwise applies the
     * rules of the current state.
     *
     * @see org.txm.importer.filters.Filter#filter()
     */
    void filter() {
        if (tracing()) {
            println(linetype + "LINE : " + line);
        }

        if (line ==~ /\A\s*<!--.*-->\s*\Z/) {
            output.print(line);
        }
        else if (linetype ==~ /out/) {
            filterOutsideSentence();
        }
        else if (linetype ==~ /in/) {
            filterInsideSentence();
        }
        else {
            print "Error in sentence tagging : $line";
            System.err.println("Error in sentence tagging : $line");
        }
    }

    /** Tells whether the sentence counter lies in the (MINPRINT, MAXPRINT) debug window. */
    private boolean tracing() {
        return scounter > MINPRINT && scounter < MAXPRINT;
    }

    /** Writes the delayed closing sentence tag, if one is pending. */
    private void closePendingSentence() {
        if (pending == "yes") {
            output.write("</s>\n")
            pending = "no";
        }
    }

    /** Applies the rules of the "out" state (no sentence is currently open). */
    private void filterOutsideSentence() {
        if ((m = line =~ /^(.*)<s( [^>]*)?>(.*)$/)) {
            // explicit sentence tag: renumber it and enter the sentence
            affect13(m);
            closePendingSentence();

            scounter++;
            line = (line =~ /<s( [^>]*)?>/).replaceAll("<s n=\"" + scounter + "\">");
            output.print(line + "\n");
            linetype = "in";
        }
        else if ((m = line =~ /^(.*)<\/s>(.*)$/)) {
            // explicit closing sentence tag: only kept when a sentence is waiting to be closed
            affect12(m);
            if (pending == "yes") {
                pending = "no";
                output.print(line);
            }
            else {
                System.err.println("Found </s>, but pending = no and linetype= out : " + line + " ; scount $scounter");
            }
        }
        else if ((m = line =~ /^(.*)<w type="pon"[^>]*>.*<\/w>(.*)$/) && pending ==~ /yes/) {
            // punctuation while the previous sentence is not closed yet: keep it in that sentence
            affect12(m);
            output.print(line);
        }
        else if ((m = line =~ /^(.*)<w .*<\/w>(.*)/)) {
            // a word: close the previous sentence if needed and open a new one
            affect12(m);
            closePendingSentence();
            scounter++;
            output.print("<s n=\"" + scounter + "\">\n" + line);
            linetype = "in";
        }
        else if ((m = line =~ /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/)) {
            // opening correction tag
            closePendingSentence();
            def tag_name = m[0][2];
            def tag_end = m[0][3];

            if (tag_end != null && tag_end.contains("multi_s")) {
                // the correction spans several sentences: handle it like a division tag
                open_div_tags.push(tag_name);
                LASTopen_div_tags = tag_name;
            } else {
                open_corr_tags.push(tag_name);

                scounter++;
                output.print("<s n=\"" + scounter + "\">\n"); // the correction starts a sentence
                linetype = "in";
            }
            if (tracing()) {
                println("stacks " + open_corr_tags + " ; " + open_div_tags);
            }
            output.print(line);
        }
        else if ((m = line =~ /^(.*)<\/($corr_tags_no_seg)>(.*)$/)) {
            // closing correction tag
            affect13(m);
            if (tracing()) {
                println("closing corr tag " + line);
            }

            if (open_corr_tags.size() == 0) {
                // no simple correction is open: the tag closes a division-like correction
                if (open_div_tags.isEmpty()) {
                    // nothing to pop: report it instead of failing on the empty stack
                    System.err.println("TagSentences : closing correction tag without opening tag : " + line);
                } else {
                    open_div_tags.pop();
                }

                if (open_div_tags.size() == 0 && pending == "yes") {
                    output.print("</s>\n" + line);
                    pending = "no";
                } else {
                    output.print(line);
                }
            }
            else {
                open_corr_tags.pop();

                if (pending == "yes" && open_corr_tags.size() == 0) {
                    // it was the last open correction: the waiting sentence can be closed
                    output.print("</s>\n");
                    output.print(line);
                    pending = "no";
                } else {
                    output.print(line);
                }
            }
            if (tracing()) {
                println("stacks " + open_corr_tags + " ; " + open_div_tags);
            }
        }
        else if ((m = line =~ /^(.*)<\/($div_tags|$q_tags)>(.*)$/)) {
            // closing division or quotation tag
            if (pending == "yes") {
                output.print("</s>\n$line");
                pending = "no";
            } else {
                output.print(line);
            }
        }
        else if ((m = line =~ /^(.*)<($div_tags|$q_tags)( [^>]*)?>(.*)$/)) {
            // opening division or quotation tag
            if (pending == "yes") {
                output.print("</s>\n");
                pending = "no";
            }
            output.print("$line");
        }
        else {
            output.print(line);
        }
    }

    /** Applies the rules of the "in" state (a sentence is currently open). */
    private void filterInsideSentence() {
        if ((m = line =~ /^(.*)<w [^>]*>$strongPunct<\/w>(.*)/)) {
            // strong punctuation: the sentence is to be closed
            affect12(m);

            pending = "yes"
            if (open_corr_tags.size() == 0) {
                // no correction is open: we leave the sentence right away; otherwise the
                // sentence is closed after the last correction is closed
                linetype = "out";
            }
            output.print(line);
        }
        else if ((m = line =~ /^(.*)<w .*<\/w>(.*)/) || (m = line =~ /^(.*)<w(?: [^>]*)?>.*<\/w>(.*)/)) {
            // a word: stays in the current sentence
            // (the attribute group is non-capturing so that group 2 is the text after the word)
            affect12(m);
            output.print(line);
        }
        else if ((m = line =~ /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/)) {
            // opening correction tag
            if (tracing()) {
                System.out.println("open corr " + line);
            }
            def tag_name = m[0][2];
            def tag_end = m[0][3];

            if (tag_end != null && tag_end.contains("multi_s")) {
                // the correction spans several sentences: handle it like a division tag
                open_div_tags.push(tag_name);
                LASTopen_div_tags = tag_name;
                output.print("</s>\n");
                linetype = "out"
            }
            else {
                open_corr_tags.push(tag_name);
            }
            if (tracing()) {
                println("stacks " + open_corr_tags + " ; " + open_div_tags);
            }
            output.print(line);
        }
        else if ((m = line =~ /^(.*)<\/($corr_tags_no_seg)>(.*)$/)) {
            // closing correction tag
            if (tracing()) {
                System.out.println("closing corr " + line);
            }
            affect13(m);

            if (open_corr_tags.size() == 0) {
                // no simple correction is open
                if (!open_div_tags.isEmpty()) {
                    // closes a division-like correction, including the outermost one
                    output.print("</s>\n" + line);
                    linetype = "out";
                    open_div_tags.pop();
                } else {
                    println("error : found a non classified corr : " + line);
                }
            } else {
                open_corr_tags.pop();
                output.print(line);
                if (pending == "yes" && open_corr_tags.size() == 0) {
                    // it was the last open correction: close the waiting sentence
                    output.print("</s>\n");
                    linetype = "out";
                    pending = "no";
                }
            }
            if (tracing()) {
                println("stacks " + open_corr_tags + " ; " + open_div_tags);
            }
        }
        else if ((m = line =~ /^(.*)<\/s>(.*)$/)) {
            // explicit closing sentence tag
            affect12(m);

            linetype = "out";
            output.print(line);
        }
        else if ((m = line =~ /^(.*)<\/($div_tags|$q_tags)>(.*)$/)) {
            // closing division or quotation tag: the sentence is closed first
            linetype = "out";
            output.print("</s>\n$line");
        }
        else if ((m = line =~ /^(.*)<($div_tags|$q_tags)( [^>]*)?>(.*)$/)) {
            // opening division or quotation tag: the sentence is closed before the tag and
            // reopened after it with the same number
            output.print("</s>\n")
            output.print("$line");
            output.print("<s n=\"" + scounter + "\">\n");
        }
        else {
            if (tracing()) {
                println("ELSE " + line);
            }
            output.print(line);
        }
    }
}