Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / tokenizer / TokenizerClasses.groovy @ 625

History | View | Annotate | Download (24.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-06 09:11:32 +0200 (jeu. 06 avril 2017) $
25
// $LastChangedRevision: 3425 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.tokenizer
29

    
30
import java.io.FileWriter;
31
import java.util.regex.Pattern;
32

    
33
import org.txm.utils.xml.DomUtils;
34
import org.w3c.dom.Document;
35
import org.w3c.dom.Element
36
import org.w3c.dom.Node;
37
import org.w3c.dom.NodeList;
38

    
39
import javax.xml.stream.*;
40
import java.net.URL;
41

    
42
// TODO: Auto-generated Javadoc
43
/**
44
 * The Class TokenizerClasses.
45
 */
46
class TokenizerClasses {
47

    
48
        //// ROOTS ? ////
49
        /** When true, loadFromNode prints every parameter it reads. */
        public static boolean debug = false;

        /** Pattern text for a tag: '<', one ASCII letter, one or more characters other than '>', then '>'. */
        public static String tag_all = "<[A-Za-z][^>]+>";

        /** The Weblex enclitics. The author tags this list "FRO" (presumably Old French; confirm). */
        public static String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO

        /** Second enclitics list, tagged "FR" by the author (presumably modern French; confirm). */
        public static String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR

        /**
         * The TT enclitics ("TT" presumably stands for TreeTagger; confirm).
         * The suffixes _en, _fr, _it and _gl look like language codes; none of these five
         * fields is touched by reset(), toDom(), dump(), isValid() or loadFromNode().
         */
        public static String FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
        public static String PClitic_fr = '[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]';
        public static String FClitic_fr = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là';
        public static String PClitic_it = '[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]';
        public static String FClitic_gl = '-la|-las|-lo|-los|-nos';

        /** The div_tags: alternation of element names; combined into extraword_tags. */
        public static String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";

        /** The q_tags: alternation of element names; combined into extraword_tags. */
        public static String q_tags = "q|quote|said|item|stage|cit|label|heraldry";

        /** The extraword1_tags: alternation of element names; combined into extraword_tags. */
        public static String extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";

        /** The corr_tags_no_seg: alternation of element names; combined with seg_tags into corr_tags. */
        public static String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";

        /** The word_tags. */
        public static String word_tags = "w";

        /** The intraword_tags. */
        public static String intraword_tags = "c|ex|caesura";

        /** The punct_quotes: quote characters excluded from word_chars (the last character appears twice). */
        public static String punct_quotes = "'‘’’";

        /** The punct_strong1: characters placed inside a character class when punct_strong is built. */
        public static String punct_strong1 = ".!?";

        /** The punct_strong2: multi-character alternatives appended to punct_strong. */
        public static String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";

        /** The punct_paren_open1. */
        public static String punct_paren_open1 = "«";

        /** The punct_paren_open2. */
        public static String punct_paren_open2 = "``|\\(|\\[|\\{";

        /** The punct_paren_close1. */
        public static String punct_paren_close1 = "»";

        /** The punct_paren_close2. */
        public static String punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";

        /** The punct_weak: characters placed inside a character class by punct_all and word_chars. */
        public static String punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";

        /** Pattern text for an entity reference: '&', one or more characters other than ';', then ';'. */
        public static String entity = "&[^;]+;";

        /** The seg_tags: combined with corr_tags_no_seg into corr_tags. */
        public static String seg_tags = "seg";
109

    
110
        //// COMBINAISONS ////
111

    
112
        /** The corr_tags: corr_tags_no_seg and seg_tags joined into one alternation. */
        public static String corr_tags = "${corr_tags_no_seg}|${seg_tags}";

        /** The extraword_tags: div_tags, q_tags and extraword1_tags joined into one alternation. */
        public static String extraword_tags = "${div_tags}|${q_tags}|${extraword1_tags}";

        /**
         * The punct_strong: one or more punct_strong1 characters, or a punct_strong2 alternative.
         * NOTE(review): recombine() rebuilds this value WITHOUT the '+' quantifier, so the
         * two forms differ once recombine(), reset() or loadFromNode() has run; confirm
         * which form is intended.
         */
        public static String punct_strong = "[${punct_strong1}]+|${punct_strong2}";

        /** The punct_paren_open: punct_paren_open1 or punct_paren_open2. */
        public static String punct_paren_open = "${punct_paren_open1}|${punct_paren_open2}";

        /** The punct_paren_close: punct_paren_close1 or punct_paren_close2. */
        public static String punct_paren_close = "${punct_paren_close1}|${punct_paren_close2}";

        /** The punct_paren: punct_paren_open or punct_paren_close. */
        public static String punct_paren = "${punct_paren_open}|${punct_paren_close}";

        /** The punct_all: punct_strong, punct_paren, or a single punct_weak character. */
        public static String punct_all = "${punct_strong}|${punct_paren}|[${punct_weak}]";

        /**
         * The word_chars: any character that is not a space, a quote, a punct_strong1
         * character, punct_paren_open1/close1 or a punct_weak character; or an entity.
         */
        public static String word_chars = "[^ ${punct_quotes}${punct_strong1}${punct_paren_open1}${punct_paren_close1}${punct_weak}]|${entity}";
134

    
135
        /**
         * Default list of TTest rules. loadFromNode() inserts the tests declared in an
         * import file at the head of this list, or replaces the list entirely when the
         * tokenizer element carries onlyThoseTests="true".
         * The string argument ("pon", "w") and the integer arguments are passed straight
         * to the TTest constructors; their meaning is defined by TTest, which is not
         * part of this file (the integers presumably select regex groups; confirm).
         */
        public static def tests = [
                // hyphen followed by one of the listed enclitic forms
                new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
                // trailing hyphen
                new TTest(/\A(.*)(-)()\Z/, "pon"),
                //new TTest(/\A(.*)(Bande dessinée|eau de vie|machine à écrire|Moyen Âge|petit pois|poule d'eau|Avoir l'air|se rendre compte|faire semblant|prendre froid|s'en aller|Comme il faut|bon marché|bon enfant|en retard|en colère|à la mode|de bonne humeur|hors de propos|de travers|en vacances|de standing|à l'abandon|sans défense|pieds nus|Tout à fait|d'ores et déjà|de temps en temps|tout de suite|de gré ou de force|ne pas|ne jamais|ne plus|sans doute|on ne peut plus|Quelque chose|ce dernier|n'importe quoi|N'importe quel|beaucoup de|plus d'un|peu de|un tas de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
                // "aujourd'hui" / "Aujourd'hui" at the start of the input
                new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),

                // BFM ONLY (disabled)
                //new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"),

                // TXM REFMAN ONLY (disabled)
                //new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
                //new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
                //new TTest(/\A(.*)([0-9][0-9]:[0-9][0-9]:[0-9][0-9])(.*)\Z/, "time", 1,2,3),
                //new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+[a-z]+))(.*)\Z/, "version", 1,2,4),
                //new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+)(\.[0-9]+)+)(\.[0-9]+[a-z]+)?(.*)\Z/, "version", 1,2,6),
                //new TTest(/\A()([a-z]++:\/\/)(.*)\Z/, "protocol", 1,2,3),
                //new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
                //new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),

                // optional leading punctuation/symbols, then letters ending with an apostrophe
                new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
                // three consecutive dots
                new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
                // a single punctuation or symbol character
                new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
        ];
159
        //tag_all = "<[^>]+>";
160
        //enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
161
        //encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
162
        //div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
163
        //q_tags = "q|quote|item|stage";
164
        //extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
165
        //corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
166
        //corr_tags = "$corr_tags_no_seg|seg";
167
        //word_tags = "w|abbr|num";
168
        //intraword_tags = "c|ex";
169

    
170
        /** One or more characters from the Unicode separator (Z) or "other" (C) categories. */
        public static String whitespaces = "[\\p{Z}\\p{C}]+";

        /** A single apostrophe-like character. Marked obsolete by the original author. */
        public static String regElision = "['‘’]"; // obsolete

        /** A single character from the Ps, Pe, Pi, Pf, Po or S Unicode categories. */
        public static String regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
173

    
174
        //punct_strong1 = ".!?";
175
        //punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
176
        //punct_strong = "[$punct_strong1]|$punct_strong2";
177
        //punct_paren_open1 = "«";
178
        //punct_paren_open2 = "<<|``|\\(|\\[|\\{";
179
        //punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
180
        //punct_paren_close1 = "»";
181
        //punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
182
        //punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
183
        //punct_paren = "$punct_paren_open|$punct_paren_close";
184
        //punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
185
        //punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
186
        //word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
187

    
188
        /**
         * Restores every raw tokenizer parameter to the value it is declared with in
         * this class, then rebuilds the derived patterns by calling recombine().
         *
         * The values below are kept identical to the field initialisers. Earlier this
         * method restored an older set of defaults (a looser tag_all, div_tags without
         * "opener|dateline", q_tags without "said|label|heraldry", punct_weak without
         * "%|#"), so a reset silently changed the tokenizer behaviour.
         *
         * Not restored here: debug, the *Clitic_* fields and the tests list.
         * NOTE(review): recombine() builds punct_strong without the '+' quantifier used
         * by the punct_strong field initialiser, so that one derived value still differs
         * from its declared default after a reset; confirm which form is intended.
         *
         * @return whatever recombine() returns (callers are not expected to use it)
         */
        public static reset() {
                tag_all = "<[A-Za-z][^>]+>";
                enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
                encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
                div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
                q_tags = "q|quote|said|item|stage|cit|label|heraldry";
                extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
                corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
                word_tags = "w";
                intraword_tags = "c|ex|caesura";
                punct_quotes = "'‘’’";
                punct_strong1 = ".!?";
                punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
                punct_paren_open1 = "«";
                punct_paren_open2 = "``|\\(|\\[|\\{";
                punct_paren_close1 = "»";
                punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
                punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
                entity = "&[^;]+;";
                seg_tags = "seg";

                whitespaces = "[\\p{Z}\\p{C}]+";
                regElision = "['‘’]";
                regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";

                // derived patterns must be rebuilt from the restored raw values
                recombine();
        }
215

    
216
        /**
         * Rebuilds every derived pattern (corr_tags, extraword_tags, punct_strong,
         * punct_paren_open, punct_paren_close, punct_paren, punct_all, word_chars)
         * from the current values of the raw parameters.
         *
         * A raw parameter that is null ends up as the text "null" inside the derived
         * pattern, exactly as string interpolation would render it.
         *
         * NOTE(review): punct_strong is built here as "[...]|..." whereas the field
         * initialiser uses "[...]+|..."; confirm which form is intended.
         */
        public static recombine() {
                final String OR = "|";

                corr_tags = [corr_tags_no_seg, seg_tags].join(OR);
                extraword_tags = [div_tags, q_tags, extraword1_tags].join(OR);
                punct_strong = ["[" + punct_strong1 + "]", punct_strong2].join(OR);

                // opening and closing alternatives must exist before punct_paren is assembled
                punct_paren_open = [punct_paren_open1, punct_paren_open2].join(OR);
                punct_paren_close = [punct_paren_close1, punct_paren_close2].join(OR);
                punct_paren = [punct_paren_open, punct_paren_close].join(OR);

                punct_all = [punct_strong, punct_paren, "[" + punct_weak + "]"].join(OR);
                word_chars = "[^ ${punct_quotes}${punct_strong1}${punct_paren_open1}${punct_paren_close1}${punct_weak}]|${entity}";
        }
226

    
227
        /**
         * Serialises the current tokenizer settings into a DOM tree.
         *
         * Creates a "tokenizer" element with onlyThoseTests="false", appends it to
         * parent, and adds one "param" child per exported setting. Each param carries
         * the setting name in its "key" attribute and the value as text content.
         *
         * Fix: the seventh parameter used to be a second copy of "enclitics" (copy-paste
         * slip), so corr_tags_no_seg was never exported. It is now written under its
         * own key.
         *
         * NOTE(review): loadFromNode() reads the parameter name from a "name" attribute
         * while this method writes "key"; confirm which attribute readers expect.
         * NOTE(review): extraword1_tags, punct_quotes, entity and seg_tags are not
         * exported, and the tests list is not written either; confirm whether that is
         * intended.
         *
         * @param doc the document used to create the new elements
         * @param parent the node that receives the "tokenizer" element
         * @return the last "param" element appended (result of the final appendChild call)
         */
        public static toDom(Document doc, Node parent) {
                Element tokenizer = doc.createElement("tokenizer");
                tokenizer.setAttribute("onlyThoseTests", "false");
                parent.appendChild(tokenizer);

                // [key, value] pairs, in the order they are written to the document
                def exported = [
                        ["tag_all", tag_all],
                        ["enclitics", enclitics],
                        ["encliticsFR", encliticsFR],
                        ["div_tags", div_tags],
                        ["q_tags", q_tags],
                        ["extraword_tags", extraword_tags],
                        ["corr_tags_no_seg", corr_tags_no_seg],
                        ["corr_tags", corr_tags],
                        ["word_tags", word_tags],
                        ["intraword_tags", intraword_tags],
                        ["punct_strong1", punct_strong1],
                        ["punct_strong2", punct_strong2],
                        ["punct_strong", punct_strong],
                        ["punct_paren_open1", punct_paren_open1],
                        ["punct_paren_open2", punct_paren_open2],
                        ["punct_paren_open", punct_paren_open],
                        ["punct_paren_close1", punct_paren_close1],
                        ["punct_paren_close2", punct_paren_close2],
                        ["punct_paren_close", punct_paren_close],
                        ["punct_paren", punct_paren],
                        ["punct_weak", punct_weak],
                        ["punct_all", punct_all],
                        ["word_chars", word_chars],
                        ["whitespaces", whitespaces],
                        ["regElision", regElision],
                        ["regPunct", regPunct]
                ];

                def lastAppended = null;
                for (def entry : exported) {
                        Element p = doc.createElement("param");
                        p.setAttribute("key", (String) entry[0]);
                        p.setTextContent((String) entry[1]);
                        lastAppended = tokenizer.appendChild(p);
                }
                // keep the historical return value: the node returned by the last appendChild
                return lastAppended;
        }
371

    
372
        /**
         * Prints the current tokenizer configuration to standard output in three
         * sections: "BRUT" (raw parameters), "COMBINED" (derived patterns plus
         * whitespaces, regElision and regPunct) and "TESTS" (one line per test).
         */
        public static void dump() {
                // prints a section title followed by one "name = value" line per entry
                def printSection = { String title, Map settings ->
                        println title
                        for (def setting : settings.entrySet()) {
                                println "${setting.key} = ${setting.value}"
                        }
                }

                printSection("BRUT", [
                        tag_all: tag_all,
                        enclitics: enclitics,
                        encliticsFR: encliticsFR,
                        div_tags: div_tags,
                        q_tags: q_tags,
                        extraword1_tags: extraword1_tags,
                        corr_tags_no_seg: corr_tags_no_seg,
                        word_tags: word_tags,
                        intraword_tags: intraword_tags,
                        punct_quotes: punct_quotes,
                        punct_strong1: punct_strong1,
                        punct_strong2: punct_strong2,
                        punct_paren_open1: punct_paren_open1,
                        punct_paren_open2: punct_paren_open2,
                        punct_paren_close1: punct_paren_close1,
                        punct_paren_close2: punct_paren_close2,
                        punct_weak: punct_weak,
                        entity: entity,
                        seg_tags: seg_tags
                ])

                printSection("COMBINED", [
                        corr_tags: corr_tags,
                        extraword_tags: extraword_tags,
                        punct_strong: punct_strong,
                        punct_paren_open: punct_paren_open,
                        punct_paren_close: punct_paren_close,
                        punct_paren: punct_paren,
                        punct_all: punct_all,
                        word_chars: word_chars,
                        whitespaces: whitespaces,
                        regElision: regElision,
                        regPunct: regPunct
                ])

                println "TESTS"
                for (def rule : tests) {
                        println " $rule"
                }
        }
412

    
413
        /**
         * Checks that every tokenizer parameter and every test regex compiles with
         * java.util.regex.Pattern.
         *
         * Checks run in a fixed order: raw parameters, derived patterns, then
         * regPunct / regElision / whitespaces, then the tests. The first failure is
         * printed to standard output and ends the check. punct_strong, regPunct,
         * regElision and whitespaces are skipped when null; any other null value
         * makes Pattern.compile throw and is therefore reported as a failure.
         *
         * @return true when everything compiles, false at the first failure
         */
        public static boolean isValid() {
                // compiles one pattern; on failure prints the label followed by the exception
                def compiles = { regex, label ->
                        try {
                                Pattern.compile(regex)
                                return true
                        } catch (Exception e) {
                                println label + e
                                return false
                        }
                }

                // raw parameters: failures are reported as "name=value: exception"
                def rawParams = [
                        tag_all: tag_all,
                        enclitics: enclitics,
                        encliticsFR: encliticsFR,
                        div_tags: div_tags,
                        q_tags: q_tags,
                        extraword1_tags: extraword1_tags,
                        corr_tags_no_seg: corr_tags_no_seg,
                        word_tags: word_tags,
                        intraword_tags: intraword_tags,
                        punct_quotes: punct_quotes,
                        punct_strong1: punct_strong1,
                        punct_strong2: punct_strong2,
                        punct_paren_open1: punct_paren_open1,
                        punct_paren_open2: punct_paren_open2,
                        punct_paren_close1: punct_paren_close1,
                        punct_paren_close2: punct_paren_close2,
                        punct_weak: punct_weak,
                        entity: entity,
                        seg_tags: seg_tags
                ]
                for (def raw : rawParams.entrySet()) {
                        if (!compiles(raw.value, "${raw.key}=${raw.value}: ")) return false
                }

                // derived patterns: failures are reported as "name: exception"
                def derived = [
                        corr_tags: corr_tags,
                        extraword_tags: extraword_tags,
                        punct_strong: punct_strong,
                        punct_paren_open: punct_paren_open,
                        punct_paren_close: punct_paren_close,
                        punct_paren: punct_paren,
                        punct_all: punct_all,
                        word_chars: word_chars
                ]
                for (def combined : derived.entrySet()) {
                        // punct_strong is the only derived pattern that may be null
                        if (combined.key == "punct_strong" && combined.value == null) continue
                        if (!compiles(combined.value, "${combined.key}: ")) return false
                }

                // optional parameters: only checked when set
                def optional = [
                        regPunct: regPunct,
                        regElision: regElision,
                        whitespaces: whitespaces
                ]
                for (def opt : optional.entrySet()) {
                        if (opt.value == null) continue
                        if (!compiles(opt.value, "${opt.key}=${opt.value}: ")) return false
                }

                for (TTest rule : tests) {
                        try {
                                Pattern.compile(rule.regex)
                        } catch (Exception e) {
                                println rule.regex+": "+e
                                return false
                        }
                }
                return true
        }
514

    
515
        /**
         * Configures the tokenizer from a "tokenizer" DOM element.
         *
         * Every descendant "param" element sets one raw parameter. The parameter name
         * is read from the "name" attribute, falling back to the "key" attribute (the
         * one toDom() writes) when "name" is absent. The value is read from the "value"
         * attribute, falling back to the element text; an empty value becomes null.
         * The derived patterns are then rebuilt with recombine(). A "punct_strong"
         * param overrides the derived punct_strong (marked temporary by the author).
         *
         * Every descendant "test" element is turned into a TTest (regex = text content,
         * plus the "type", "before", "hit" and "after" attributes) and inserted, in
         * document order, at the head of the tests list. When the tokenizer element has
         * onlyThoseTests="true" the existing tests are discarded first.
         *
         * If the resulting configuration is invalid (isValid() fails, or a test has a
         * missing or non-numeric "before"/"hit"/"after" attribute), the parameters are
         * reset() to their defaults and the tests list is restored to what it was
         * before this call.
         *
         * @param tokenizerElement the "tokenizer" element to read
         * @return true when the configuration was applied, false when it was rejected
         */
        public static boolean loadFromNode(Element tokenizerElement) {
                // snapshot, so a rejected configuration cannot leave its tests behind
                def previousTests = (tests == null) ? null : new ArrayList(tests);

                //load params
                String tmp_strong_punct = null;
                NodeList params = tokenizerElement.getElementsByTagName("param");
                for (int i = 0 ; i < params.getLength() ; i++) {
                        Element param = (Element) params.item(i);

                        // getAttribute returns "" for a missing attribute
                        String key = param.getAttribute("name");
                        if (key == null || key.length() == 0) key = param.getAttribute("key");

                        String value = param.getAttribute("value");
                        if (value == null || value.length() == 0) value = param.getTextContent();
                        if (value != null && value.length() == 0) value = null;

                        if (debug) println (" Tokenizer parametrized with $key=$value.")

                        switch (key) {
                                case "tag_all": tag_all = value; break;
                                case "enclitics": enclitics = value; break;
                                case "encliticsFR": encliticsFR = value; break;
                                case "div_tags": div_tags = value; break;
                                case "q_tags": q_tags = value; break;
                                case "extraword1_tags": extraword1_tags = value; break;
                                case "corr_tags_no_seg": corr_tags_no_seg = value; break;
                                case "word_tags": word_tags = value; break;
                                case "intraword_tags": intraword_tags = value; break;
                                case "punct_quotes": punct_quotes = value; break;
                                case "punct_strong1": punct_strong1 = value; break;
                                case "punct_strong2": punct_strong2 = value; break;
                                case "punct_paren_open1": punct_paren_open1 = value; break;
                                case "punct_paren_open2": punct_paren_open2 = value; break;
                                case "punct_paren_close1": punct_paren_close1 = value; break;
                                case "punct_paren_close2": punct_paren_close2 = value; break;
                                case "punct_weak": punct_weak = value; break;
                                case "entity": entity = value; break;
                                case "seg_tags": seg_tags = value; break;
                                case "regPunct": regPunct = value; break;
                                case "regElision": regElision = value; break;
                                case "whitespaces": whitespaces = value; break;
                                case "punct_strong": tmp_strong_punct = value; break; // this is temporary
                                default:
                                        println "MISSING TOKENIZER KEY: "+key
                        }
                }

                //recombine
                recombine();
                if (tmp_strong_punct != null) punct_strong = tmp_strong_punct;  // this is temporary

                String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests");
                if ("true".equals(shouldResetTests)) {
                        println "Warning: tokenizer only using import parameters tests"
                        tests = []
                }

                // build all imported tests first, so a malformed one inserts nothing
                boolean testsOk = true;
                def imported = [];
                NodeList testsList = tokenizerElement.getElementsByTagName("test");
                try {
                        for (int i = 0 ; i < testsList.getLength() ; i++) {
                                Element test = (Element) testsList.item(i);
                                imported.add(new TTest(test.getTextContent(), test.getAttribute("type"),
                                                Integer.parseInt(test.getAttribute("before")),
                                                Integer.parseInt(test.getAttribute("hit")),
                                                Integer.parseInt(test.getAttribute("after"))));
                        }
                } catch (NumberFormatException e) {
                        println "Invalid tokenizer test (before/hit/after must be integers): "+e
                        testsOk = false;
                }

                // same effect as inserting test i at index i: imported tests come first, in order
                if (testsOk && !imported.isEmpty()) tests.addAll(0, imported);

                if (testsOk && isValid()) {
                        return true;
                }

                // rejected: back to default parameters (reset() also recombines) and previous tests
                reset();
                tests = previousTests;
                return false;
        }
608

    
609
        /**
         * Manual check: loads an import.xml file, locates the first
         * corpora/corpus/tokenizer element and prints whether loadFromNode() accepts it.
         *
         * @param args optional; args[0] is the path of the import.xml file to read.
         *             Without arguments the original developer path is used.
         */
        public static void main(String[] args) {
                String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/discours/import.xml";

                Document doc = DomUtils.load(new File(path));

                // item(0) returns null when no such element exists, so walk down step by step
                Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
                Node corpus = (corpora == null) ? null : ((Element)corpora).getElementsByTagName("corpus").item(0);
                Node tokenizer = (corpus == null) ? null : ((Element)corpus).getElementsByTagName("tokenizer").item(0);

                if (tokenizer == null) {
                        println "No corpora/corpus/tokenizer element found in "+path
                        return;
                }
                println "OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer)
        }
622
}