Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / filters / Tokeniser / Tokeniser.groovy @ 479

History | View | Annotate | Download (14 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate:$
// $LastChangedRevision:$
// $LastChangedBy:$
//
package filters.Tokeniser;
29

    
30
import org.txm.importer.filters.*;
31
import org.txm.tokenizer.TokenizerClasses;
32

    
33
/**
 * Line-oriented tokenizer filter.
 *
 * Each call to {@link #filter()} consumes the current input line (the
 * inherited {@code line}) and writes to the inherited {@code output}, one
 * {@code <w>} element per recognised word, number or punctuation mark. Tags
 * are copied through, and notes are wrapped in XML comments. The processing
 * state ({@link #linetype}) is kept between lines so that words, comments,
 * heads and notes may span several input lines.
 */
class Tokeniser extends Filter {

    /**
     * Creates a tokenizer for the given text file. The file name, without its
     * last extension, is used as a prefix of the generated word ids
     * ({@code w_<filename>_<n>}).
     *
     * @param text the file being tokenized; only its name is read
     */
    public Tokeniser(File text)
    {
        this.filename = text.getName();
        int index = filename.lastIndexOf(".");
        // index == 0 (dot file such as ".txt") is deliberately left untouched
        if (index > 0)
            filename = filename.substring(0, index);
    }

    /**
     * Number of debug messages still to be printed. Every debug checkpoint
     * decrements this counter, so a positive value N prints the first N
     * checkpoints reached.
     */
    static public int DEBUG = 0;

    /** Words in which unexpected characters were found; reported by {@link #after()}. */
    HashSet<String> charerrors = new HashSet<String>();

    /*
     * Elements processed by the tokenizer:
     *
     * DEFS:
     *   - w|abbr|num                      (word_tags)
     *   - c|ex|caes|choice|corr|sic|reg   (intraword_tags)
     *   - <[^>]+>                         (tag_all)
     *
     * INPUT                              OUTPUT                        MODE     COMMENTS
     * ---------------------------------------------------------------------------------
     * &apos;                             '                             BEGIN
     * <\/($word_tags)>                   </w>+NL                       word
     * -->                                -->+NL                        comment
     * <\/($head_name)>                   </($head_name)>+NL            head
     * <\/note>                           </note>-->+NL                 note
     * <(head|p) [^>]*lang=['"]fr['"]     ibid                          text     Editor's title
     * <w( [^>]*)?>\s*)($tag_all\s*)      ibid+NL if not </w>           text
     * <note[^>]*>                        NL+<!--$note                  text     Note found
     * <\/note>                           </note>-->+NL                 text
     * $tag_all                           ibid+NL                       text     Line with tags only
     * <!--                               NL+<!--                       text     XML comment found
     * -->                                -->+NL
     * <($word_tags)([^>]*)>              <w type="$tag_name"           text     Tagged words, punctuation
     *                                      $tag_atts>$word</w>+NL               marks, abbreviations and
     *                                                                           numbers
     * $tag_all                           ibid+NL                       text     Separated tags
     * \.[^ .]+\.                         <w type="num">$word</w>+NL    text     Numbers and abbreviations
     *                                                                           surrounded with dots
     * $punct_all|['‘’]                   <w type="pon">$word</w>+NL    text     Punctuation marks
     * ($word_chars)*[\(\[]($word_chars)+[\)\]]($word_chars)+
     *   |($word_chars)+[\(\[]($word_chars)+[\)\]])(\s+|$punct_all)
     *                                    <w >$word</w>                 text     Words with brackets
     * (((<[^>]+>)*($word_chars)+(<[^>]+>)*)+)(\s+|$punct_all)
     *                                    <w >$word</w>                 text     Other word patterns
     *                                                                           without elision
     * (((<[^>]+>)*($word_chars)+(<[^>]+>)*)+)['’]
     *                                    <w >$word</w>                 text     Other word patterns
     *                                                                           with elision
     */

    /** The counter (not used by this class). */
    int counter = 1;

    /** Matcher of the most recently evaluated pattern. */
    def m;

    /** Part of the current line that remains to be processed. */
    def segment;

    /** The note (not assigned by this class). */
    def note;

    /** Last stand-alone tag copied to the output. */
    def tag;

    /** Name of the word tag currently being rewritten as {@code <w>}. */
    def tag_name;

    /** Attributes written on the {@code <w>} element being produced. */
    def tag_atts;

    /** The starttag (not used by this class). */
    def starttag;

    /** The endtag (not used by this class). */
    def endtag;

    /** The word (shadowed by a local variable inside {@link #filter()}). */
    def word;

    /** Input processing state; one of: text, word, comment, head, note. */
    def linetype = "text";

    /** Name of the head|p element currently open in "head" mode. */
    def head_name;

    /** Pattern matching any single tag. */
    def tag_all = "<[^>]+>";

    /** Alternation of tag names that delimit a word. */
    def word_tags = TokenizerClasses.word_tags;

    /** Alternation of tag names allowed inside a word. */
    def intraword_tags = TokenizerClasses.intraword_tags;

    /** Pattern of strong punctuation marks. */
    def punct_strong = TokenizerClasses.punct_strong;

    /** Pattern of all punctuation marks. */
    def punct_all = TokenizerClasses.punct_all;

    /** Pattern of a character allowed in a word. */
    def word_chars = TokenizerClasses.word_chars;

    /** Quote characters; interpolated inside a character class. */
    def punct_quotes = TokenizerClasses.punct_quotes;

    /** Input file name without extension, used in generated word ids. */
    String filename;

    /** Sequence number of the next word id. */
    int wordcount = 1;

    /**
     * Overrides the tokenizer character classes.
     *
     * @param args a map-like object providing the keys word_tags,
     *             intraword_tags, punct_strong, punct_all and word_chars;
     *             any failure while reading it is reported on stderr and
     *             leaves the remaining fields unchanged
     * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
     */
    void SetUsedParam(Object args)
    {
        try {
            word_tags = args.get("word_tags");
            intraword_tags = args.get("intraword_tags");
            punct_strong = args.get("punct_strong");
            punct_all = args.get("punct_all");
            word_chars = args.get("word_chars");
        } catch(Exception e) {
            System.err.println(e)
            System.err.println("tokenizer needs an \"args\" map with the following elements :\n word_tags, intraword_tags, punct_strong, " +
                    "punct_all, word_chars")
        }
    }

    /**
     * Hook called before processing; does nothing.
     *
     * NOTE(review): declared boolean but has no return statement; confirm
     * what value the Filter base class expects here.
     *
     * @see org.txm.importer.filters.Filter#before()
     */
    boolean before() {
        // System.out.println("beginning of tokenizer processing");
    }

    /**
     * Hook called after processing: prints the words in which unexpected
     * characters were found, if any.
     *
     * @see org.txm.importer.filters.Filter#after()
     */
    void after() {
        if (charerrors.size() > 0) {
            System.out.print("Tokenizer unknown word chars in $filename: ");
            println charerrors;
        }
    }

    /**
     * Records {@code word} in {@link #charerrors} when, once its tags are
     * removed, it contains at least one character outside the expected set
     * (digits, Latin letters, apostrophes, brackets, hyphen and en dash).
     *
     * @param word the token as it is written to the output, tags included
     */
    private void recordUnknownChars(String word)
    {
        def word_clean = (word =~ /($tag_all)/).replaceAll("");
        // find semantics: one offending character anywhere is enough
        if (word_clean =~ /[^0-9A-Za-zœÀ-ÿ'’\(\)\[\]–\-]/) {
            charerrors.add(word);
        }
    }

    /**
     * Tokenizes the current input line according to the current
     * {@link #linetype} and writes the result to the output.
     *
     * @see org.txm.importer.filters.Filter#filter()
     */
    void filter()
    {
        String word;
        // A trailing space guarantees that the last word of the line is
        // followed by a separator.
        segment = line+" ";
        segment = segment.replace("&amp;", "&").replace("&apos;", "'").replace("&quot;","'");

        if (DEBUG-- > 0)
            println("Line: "+line);
        // The loop body runs once per line, unless a mode ends in the middle
        // of the line and the remainder must be processed in "text" mode.
        while (true) {
            if (DEBUG-- > 0)
                println " segment: "+segment+"\n linetype: "+linetype;

            if (linetype == "word") {
                if ((m = segment =~ /\A\s*(.*?)<\/($word_tags)>(.*)\Z/)) {
                    segment = m[0][3];
                    linetype = "text";
                    output.print(m[0][1]+"</w>"+lineSeparator);
                    // Anything left after the closing tag is ordinary text
                    if (!(segment ==~ /\A\s*\Z/)) {
                        continue;
                    }
                }
                else {
                    segment = (segment =~ /\A\s*/).replaceAll("");
                    segment = segment.trim();
                    output.print(segment);
                }
            }
            else if (linetype == "comment") {
                if ((m = segment =~ /\A\s*(.*?)-->(.*)\Z/)) {
                    linetype = "text";
                    output.print(m[0][1]+"-->"+lineSeparator);
                    segment = m[0][2];
                    if (!(segment ==~ /^\s*$/)) {
                        continue;
                    }
                }
                else {
                    segment = (segment =~ /\A\s*/).replaceAll("");
                    output.print(segment);
                }
            }
            else if (linetype == "head") {
                if ((m = segment =~ /\A\s*(.*?)<\/($head_name)>\s*\Z/)) {
                    linetype = "text";
                    output.print(m[0][1]+"</$head_name>"+lineSeparator);
                    head_name = "";
                }
                else {
                    output.print(segment);
                }
            }
            else if (linetype == "note") {
                // NOTE(review): text following </note> on the same line is
                // not processed; confirm that the input never contains any.
                if ((m = segment =~ /\A\s*(.*?)<\/note>/)) {
                    linetype = "text";
                    output.print(m[0][1]+"</note>-->"+lineSeparator);
                }
                else {
                    segment = (segment =~ /\A\s*/).replaceAll("");
                    output.print(segment);
                }
            }
            else if (linetype == "text") {
                if ((m = segment =~ /\A\s*<(head|p) [^>]*lang=['"]fr['"]/)) {
                    // Editor's title: copied verbatim until the closing tag
                    if (DEBUG-- > 0)
                        println "  tag head|p: m01"+head_name;
                    head_name = m[0][1];
                    output.print(segment);
                    linetype = "head";
                }
                else if ((m = segment =~ /\A(\s*<w( [^>]*)?>\s*)($tag_all\s*)*\Z/)) {
                    // Trim first: group 1 may start with indentation, and the
                    // id is spliced in right after the leading "<w".
                    String wordtag = m[0][1].trim();
                    // Group 3 is null when no tag follows the opening <w>
                    String tail = (m[0][3] ?: "").trim();
                    if (!wordtag.contains("id=\""))
                        wordtag = "<w id=\"w_"+filename+"_"+wordcount+"\""+ wordtag.substring(2);
                    wordcount++;
                    if (DEBUG-- > 0)
                        println "  tag w: m01"+head_name;
                    segment = tail;
                    if (segment == "</w>") {
                        println("\nEmpty word tag found, adding [???] content!\nsegment: '$wordtag[???]$tail'");
                        output.print(wordtag+"[???]"+tail+lineSeparator);
                    } else {
                        // The word content follows on the next line(s)
                        linetype = "word";
                        output.print(wordtag+tail);
                    }
                }
                else if ((m = segment =~ /\A\s*(<note[^>]*>)(.*)\Z/)) {
                    if (DEBUG-- > 0)
                        println "  tag note: new seg="+m[0][2];
                    segment = m[0][2];
                    def noteTag = m[0][1];
                    // Notes are hidden from later processing inside an XML comment
                    output.print(lineSeparator+"<!--"+noteTag);
                    if ((m = segment =~ /\A(.*?)<\/note>(.*)\Z/)) {
                        // NOTE(review): the remainder stored in segment is not
                        // processed afterwards; confirm this is intended.
                        segment = m[0][2];
                        output.print(m[0][1]+"</note>-->"+lineSeparator);
                    }
                    else {
                        linetype = "note";
                        output.print("$segment ");
                        segment = " ";
                    }
                }
                else if (segment ==~ /\A\s*($tag_all\s*)+\Z/) {
                    // Line with tags only
                    if (DEBUG-- > 0)
                        println "  tag all: ";

                    output.print(segment+lineSeparator);
                }
                else if ((m = segment =~ /\A\s*(.*?)\s*\Z/)) {
                    segment = m[0][1]+" ";
                    // Consume the line token by token until only blanks remain
                    while (!(segment ==~ /\A\s*\Z/)) {
                        if (DEBUG-- > 0)
                            println "  other: new seg="+segment;
                        if ((m = segment =~ /\A\s*<!--(.*)\Z/)) {
                            // XML comment
                            segment = m[0][1]+" ";
                            if (DEBUG-- > 0)
                                println("   comment: new seg="+segment);
                            output.print(lineSeparator+"<!--");
                            if ((m = segment =~ /\A(.*?)-->(.*)\Z/)) {
                                segment = m[0][2];
                                output.print(m[0][1]+"-->"+lineSeparator);
                            }
                            else {
                                linetype = "comment";
                                output.print("$segment ");
                                segment = " ";
                            }
                        }
                        else if ((m = segment =~ /\A\s*<($word_tags)([^>]*)>(.*)\Z/)) {
                            // Tagged words, punctuation marks, abbreviations and numbers
                            if (DEBUG-- > 0)
                                println("   tag word: 1w="+m[0][1]+" 2="+m[0][2]+" 3seg="+m[0][3]);
                            segment = m[0][3];
                            tag_name = m[0][1];
                            tag_atts = m[0][2];
                            output.print("<w");
                            if (!tag_atts.contains("id=\"")) {
                                // NOTE(review): this replaces the original attributes
                                // instead of adding the id to them; confirm intended.
                                tag_atts = " id=\"w_"+filename+"_"+wordcount+"\" ";
                            }
                            wordcount++;
                            if (tag_name ==~ /abbr|num/) {
                                output.print(" type=\"$tag_name\"");
                            }
                            output.print("$tag_atts>");
                            if ((m = segment =~ /\A(.*?)<\/($tag_name)>(.*)\Z/)) {
                                word = m[0][1];
                                segment = m[0][3];
                                word = (word =~ /^\s*(.*)\s*$/).replaceAll("\$1");
                                output.print("$word</w>"+lineSeparator);
                            }
                            else {
                                // The closing tag is on a following line
                                linetype = "word";
                                output.print(segment);
                                segment = " ";
                            }
                        }
                        else if ((m = segment =~ /\A\s*($tag_all)\s+(.*)\Z/)) {
                            // Separated tags
                            if (DEBUG-- > 0)
                                println "   Separated tags: 1w="+m[0][1]+" 2seg="+m[0][2];
                            segment = m[0][2];
                            tag = m[0][1];
                            output.print("$tag"+lineSeparator);
                        }
                        else if ((m = segment =~ /\A\s*([.·][^ .·]+[.·])(.*)\Z/)) {
                            // Numbers and abbreviations surrounded with dots
                            if (DEBUG-- > 0)
                                println "   Numbers and abbreviations: 1w="+m[0][1]+" 2seg="+m[0][2];
                            segment = m[0][2];
                            word = m[0][1];
                            word = (word =~ /^\s*(.*)\s*$/).replaceAll("\$1");
                            output.print("<w id=\"w_"+filename+"_$wordcount\" type=\"num\">$word</w>"+lineSeparator);
                            wordcount++;
                        }
                        else if ((m = segment =~ /\A\s*($punct_all|[$punct_quotes])(.*)\Z/)) {
                            // Punctuation marks
                            if (DEBUG-- > 0)
                                println "   Punctuation marks: 1w="+m[0][1]+" 2seg="+m[0][2]
                            segment = m[0][2];
                            word = m[0][1];
                            word = (word =~ /^\s*(.*)\s*$/).replaceAll("\$1");
                            word = word.replace("<", "&lt;");
                            output.print("<w id=\"w_"+filename+"_$wordcount\" type=\"pon\">"+word+"</w>"+lineSeparator);
                            wordcount++;
                        }
                        else if ((m = segment =~ /\A\s*(($word_chars)*[{\(\[]($word_chars)+[\)\]}]($word_chars)+|($word_chars)+[{\(\[]($word_chars)+[\)\]}])(\s+|$punct_all)(.*?)\Z/)) {
                            // Words with brackets
                            if (DEBUG-- > 0)
                                println "   Words with brackets: 1w="+m[0][1]+" 2="+m[0][2]+" 3="+m[0][3]+" 4="+m[0][4]+" 5="+m[0][5]+" 6="+m[0][6]+" 7seg="+m[0][7]+" 8seg="+m[0][8];
                            // The separator (group 7) is pushed back for the next iteration
                            segment = m[0][7]+m[0][8];
                            word = m[0][1];
                            word = (word =~ /^\s*(.*)\s*$/).replaceAll("\$1");
                            recordUnknownChars(word);
                            output.print("<w id=\"w_"+filename+"_$wordcount\">"+word+"</w>"+lineSeparator);
                            wordcount++;
                        }
                        else if ((m = segment =~ /\A\s*(((<[^>]+>)*($word_chars+)(<[^>]+>)*)+)(\s+|$punct_all)(.*?)\Z/)) {
                            // Other word patterns without elision; the separator
                            // (group 6) is pushed back for the next iteration
                            segment = m[0][6]+m[0][7];
                            word = m[0][1];

                            if (DEBUG-- > 0) {
                                println "Other word without elision: 1w="+m[0][1]+" 2="+m[0][2]+" 3="+m[0][3]+" 4="+m[0][4]+" 5="+m[0][5]+" 6seg="+m[0][6]+" 7seg="+m[0][7];
                            }
                            recordUnknownChars(word);
                            String towrite = "<w id=\"w_"+filename+"_$wordcount\">"+word.toString()+"</w>"+lineSeparator;
                            wordcount++;
                            output.print(towrite);
                        }
                        else if ((m = segment =~ /\A\s*(((<[^>]+>)*($word_chars)+(<[^>]+>)*)+)[$punct_quotes](.*?)\Z/)) {
                            // Other word patterns with elision
                            if (DEBUG-- > 0)
                                println "   Other word patterns with elision: 1w="+m[0][1]+" 2="+m[0][2]+" 3="+m[0][3]+" 4="+m[0][4]+" 5="+m[0][5]+" 6seg="+m[0][6];
                            segment = m[0][6];
                            // The matched quote is normalised to a straight apostrophe
                            word = m[0][1]+"'";
                            recordUnknownChars(word);
                            output.print("<w id=\"w_"+filename+"_$wordcount\">"+word+"</w>"+lineSeparator);
                            wordcount++;
                        }
                        else {
                            System.err.println("   Unknown word pattern $segment");
                            output.print("<note>Error! Unknown word pattern: $segment</note>"+lineSeparator);
                            segment = " ";
                        }
                    }
                }
                else {
                    // Only reachable when the segment cannot be matched as a
                    // single line. NOTE(review): this terminates the whole JVM.
                    println(" line (?) : $segment \n");
                    System.exit(-1)
                }
            }
            else {
                // Unknown mode: copy the input through unchanged
                output.print(segment);
            }
            break;
        }
        // End
    }
}