45 |
45 |
import org.w3c.dom.NodeList;
|
46 |
46 |
import org.xml.sax.SAXException;
|
47 |
47 |
|
|
48 |
import cern.colt.Arrays;
|
|
49 |
|
48 |
50 |
/**
|
49 |
51 |
* Holder for the regular-expression fragments that define the tokenizer rules.
|
50 |
52 |
*/
|
51 |
53 |
public class TokenizerClasses {
|
52 |
|
|
|
54 |
|
53 |
55 |
/** Language identifier of this configuration; the String constructor stores its argument here when that argument is non-null. */
public String lang;
|
54 |
|
|
|
56 |
|
55 |
57 |
public TokenizerClasses(String lang) {
|
56 |
58 |
if (lang != null) {
|
57 |
59 |
this.lang = lang;
|
... | ... | |
61 |
63 |
}
|
62 |
64 |
reset();
|
63 |
65 |
}
|
64 |
|
|
|
66 |
|
65 |
67 |
/**
 * Creates a tokenizer configuration based on the JVM default locale.
 *
 * Delegates to {@link #TokenizerClasses(String)} with the value returned by
 * {@code Locale.getDefault().getCountry()}.
 *
 * NOTE(review): getCountry() returns the locale's country/region code, while the
 * receiving parameter and field are named {@code lang}; confirm whether
 * getLanguage() was intended before relying on the stored value.
 */
public TokenizerClasses() {
|
66 |
68 |
this(Locale.getDefault().getCountry());
|
67 |
69 |
}
|
68 |
|
|
|
70 |
|
69 |
71 |
/**
|
70 |
72 |
* Helper
|
71 |
73 |
*
|
... | ... | |
78 |
80 |
tc.lang = lang;
|
79 |
81 |
return tc;
|
80 |
82 |
}
|
81 |
|
|
|
83 |
|
82 |
84 |
//// ROOTS ? ////
|
83 |
85 |
/** When true, diagnostic messages are printed to standard output (for example by reset() and while parameters are loaded). */
public boolean debug = false;
|
84 |
|
|
|
86 |
|
85 |
87 |
/** The tag_all. */
|
86 |
88 |
public String tag_all = null;
|
87 |
|
|
|
89 |
|
88 |
90 |
/** The Weblex enclitics. */
|
89 |
91 |
public String enclitics = null;
|
90 |
|
|
|
92 |
|
91 |
93 |
/** Regex alternation of enclitic word forms; reset() assigns the default, where it is marked "FR". */
public String encliticsFR = null;
|
92 |
|
|
|
94 |
|
93 |
95 |
/** The TT enclitics. */
|
94 |
96 |
public String FClitic_en = null;
|
95 |
|
|
|
97 |
|
96 |
98 |
/** Regex for elided forms ending in an apostrophe variant (the default matches e.g. {@code qu'} and {@code jusqu'}); default assigned in reset(). Presumably the French proclitics, per the name - confirm. */
public String PClitic_fr = null;
|
97 |
|
|
|
99 |
|
98 |
100 |
/** Regex alternation of hyphen-prefixed suffix forms (the default matches e.g. {@code -t-il} and {@code -vous}); default assigned in reset(). Presumably the French final clitics, per the name - confirm. */
public String FClitic_fr = null;
|
99 |
|
|
|
101 |
|
100 |
102 |
/** Regex for elided forms ending in an apostrophe variant (the default matches e.g. {@code dell'} and {@code un'}); default assigned in reset(). Presumably the Italian proclitics, per the _it suffix - confirm. */
public String PClitic_it = null;
|
101 |
|
|
|
103 |
|
102 |
104 |
/** Regex alternation of hyphen-prefixed suffix forms; reset() sets it to {@code -la|-las|-lo|-los|-nos}. The _gl suffix presumably denotes Galician - confirm. */
public String FClitic_gl = null;
|
103 |
|
|
|
105 |
|
104 |
106 |
/** The div_tags. */
|
105 |
107 |
public String div_tags = null;
|
106 |
|
|
|
108 |
|
107 |
109 |
/** The q_tags. */
|
108 |
110 |
public String q_tags = null;
|
109 |
|
|
|
111 |
|
110 |
112 |
/** The extraword1_tags. */
|
111 |
113 |
public String extraword1_tags = null;
|
112 |
|
|
|
114 |
|
113 |
115 |
/** The corr_tags_no_seg. */
|
114 |
116 |
public String corr_tags_no_seg = null;
|
115 |
|
|
|
117 |
|
116 |
118 |
/** The word_tags. */
|
117 |
119 |
public String word_tags = null;
|
118 |
|
|
|
120 |
|
119 |
121 |
/** the element to create when a word is created */
|
120 |
122 |
public String word_element_to_create = null;
|
121 |
|
|
|
123 |
|
122 |
124 |
/** The intraword_tags. */
|
123 |
125 |
public String intraword_tags = null;
|
124 |
|
|
|
126 |
|
125 |
127 |
/** The punct_quotes. */
|
126 |
128 |
public String punct_quotes = null;
|
127 |
|
|
|
129 |
|
128 |
130 |
/** The punct_strong1. */
|
129 |
131 |
public String punct_strong1 = null;
|
130 |
|
|
|
132 |
|
131 |
133 |
/** The punct_strong2. */
|
132 |
134 |
public String punct_strong2 = null;
|
133 |
|
|
|
135 |
|
134 |
136 |
/** The punct_paren_open1. */
|
135 |
137 |
public String punct_paren_open1 = null;
|
136 |
|
|
|
138 |
|
137 |
139 |
/** The punct_paren_open2. */
|
138 |
140 |
public String punct_paren_open2 = null;
|
139 |
|
|
|
141 |
|
140 |
142 |
/** The punct_paren_close1. */
|
141 |
143 |
public String punct_paren_close1 = null;
|
142 |
|
|
|
144 |
|
143 |
145 |
/** The punct_paren_close2. */
|
144 |
146 |
public String punct_paren_close2 = null;
|
145 |
|
|
|
147 |
|
146 |
148 |
/** The punct_weak. */
|
147 |
149 |
public String punct_weak = null;
|
148 |
|
|
|
150 |
|
149 |
151 |
/** Regex for an entity reference; reset() sets it to {@code &[^;]+;} and recombine() appends it as an alternative in word_chars. */
public String entity = null;
|
150 |
|
|
|
152 |
|
151 |
153 |
/** Tag-name alternation; reset() sets it to {@code seg}, and recombine() joins it to corr_tags_no_seg with "|" to build corr_tags when corr_tags is null. */
public String seg_tags = null;
|
152 |
|
|
|
154 |
|
153 |
155 |
//// COMBINATIONS ////
|
154 |
|
|
|
156 |
|
155 |
157 |
/** The corr_tags. */
|
156 |
158 |
public String corr_tags = null;
|
157 |
|
|
|
159 |
|
158 |
160 |
/** The extraword_tags. */
|
159 |
161 |
public String extraword_tags = null;
|
160 |
|
|
|
162 |
|
161 |
163 |
/** The punct_strong. */
|
162 |
164 |
public String punct_strong = null;
|
163 |
|
|
|
165 |
|
164 |
166 |
/** The punct_paren_open. */
|
165 |
167 |
public String punct_paren_open = null;
|
166 |
|
|
|
168 |
|
167 |
169 |
/** The punct_paren_close. */
|
168 |
170 |
public String punct_paren_close = null;
|
169 |
|
|
|
171 |
|
170 |
172 |
/** The punct_paren. */
|
171 |
173 |
public String punct_paren = null;
|
172 |
|
|
|
174 |
|
173 |
175 |
/** The punct_all. */
|
174 |
176 |
public String punct_all = null;
|
175 |
|
|
|
177 |
|
176 |
178 |
/** The word_chars. */
|
177 |
179 |
public String word_chars = null;
|
178 |
|
|
|
180 |
|
179 |
181 |
public TTest[] tests = {
|
180 |
182 |
// new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
|
181 |
183 |
// new TTest(/\A(.*)(-)()\Z/, "pon"),
|
... | ... | |
185 |
187 |
// de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma
|
186 |
188 |
// barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
|
187 |
189 |
// new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),
|
188 |
|
|
|
190 |
|
189 |
191 |
// BFM ONLY
|
190 |
192 |
// new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"),
|
191 |
|
|
|
193 |
|
192 |
194 |
// TXM REFMAN ONLY
|
193 |
195 |
// new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
|
194 |
196 |
// new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
|
... | ... | |
198 |
200 |
// new TTest("^(.*)(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)(.*)$", "url", 1, 2, 5),
|
199 |
201 |
// new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
|
200 |
202 |
// new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),
|
201 |
|
|
|
203 |
|
202 |
204 |
// new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
|
203 |
205 |
// new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
|
204 |
206 |
// new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
|
205 |
207 |
};
|
206 |
|
|
|
208 |
|
207 |
209 |
/** Regex for runs of separator/control characters; reset() sets it to {@code [\p{Z}\p{C}]+}. */
public String whitespaces = null;
|
208 |
|
|
|
210 |
|
209 |
211 |
/** Character class of apostrophe variants; reset() sets it to {@code ['‘’]}. */
public String regElision = null;
|
210 |
|
|
|
212 |
|
211 |
213 |
/** Character class of punctuation and symbol characters; reset() sets it to the Unicode categories Ps, Pe, Pi, Pf, Po and S. */
public String regPunct = null;
|
212 |
|
|
|
214 |
|
213 |
215 |
// punct_strong1 = ".!?";
|
214 |
216 |
// punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
|
215 |
217 |
// punct_strong = "[$punct_strong1]|$punct_strong2";
|
... | ... | |
223 |
225 |
// punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
|
224 |
226 |
// punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
|
225 |
227 |
// word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
|
226 |
|
|
|
228 |
|
227 |
229 |
public void reset() {
|
228 |
|
tag_all = "<[A-Za-z][^>]+>";
|
229 |
230 |
|
|
231 |
if (debug) System.out.println("Reset TC");
|
|
232 |
tag_all = "<[A-Za-z][^>]+>";
|
|
233 |
|
230 |
234 |
div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
|
231 |
235 |
q_tags = "q|quote|said|item|stage|cit|label|heraldry";
|
232 |
236 |
extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
|
... | ... | |
244 |
248 |
punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
|
245 |
249 |
entity = "&[^;]+;";
|
246 |
250 |
seg_tags = "seg";
|
247 |
|
|
|
251 |
|
248 |
252 |
enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
|
249 |
253 |
encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
|
250 |
|
|
|
254 |
|
251 |
255 |
/** The TT enclitics. */
|
252 |
256 |
FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
|
253 |
257 |
PClitic_fr = "[dcjlmnstyDCJLNMSTY][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
|
254 |
258 |
FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
|
255 |
259 |
PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
|
256 |
260 |
FClitic_gl = "-la|-las|-lo|-los|-nos";
|
257 |
|
|
|
261 |
|
258 |
262 |
whitespaces = "[\\p{Z}\\p{C}]+";
|
259 |
263 |
regElision = "['‘’]";
|
260 |
264 |
regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
|
261 |
|
|
|
265 |
|
262 |
266 |
recombine();
|
263 |
267 |
}
|
264 |
|
|
|
268 |
|
265 |
269 |
public void recombine() {
|
266 |
|
|
|
270 |
|
267 |
271 |
if (corr_tags == null) {
|
268 |
272 |
corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + "";
|
269 |
273 |
}
|
... | ... | |
289 |
293 |
word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + "";
|
290 |
294 |
}
|
291 |
295 |
}
|
292 |
|
|
|
296 |
|
293 |
297 |
/**
|
294 |
298 |
* To dom.
|
295 |
299 |
*
|
... | ... | |
299 |
303 |
*/
|
300 |
304 |
@Deprecated
|
301 |
305 |
public void toDom(Document doc, Node parent) {
|
|
306 |
|
302 |
307 |
Element tokenizer = doc.createElement("tokenizer");
|
303 |
308 |
tokenizer.setAttribute("onlyThoseTests", "false");
|
304 |
309 |
parent.appendChild(tokenizer);
|
305 |
|
|
|
310 |
|
306 |
311 |
// String tag_all = "<[^>]+>";
|
307 |
312 |
Element p = doc.createElement("param");
|
308 |
313 |
p.setAttribute("key", "tag_all");
|
... | ... | |
353 |
358 |
p.setAttribute("key", "intraword_tags");
|
354 |
359 |
p.setTextContent(intraword_tags);
|
355 |
360 |
tokenizer.appendChild(p);
|
356 |
|
|
|
361 |
|
357 |
362 |
// String punct_strong1 = ".!?";
|
358 |
363 |
p = doc.createElement("param");
|
359 |
364 |
p.setAttribute("key", "punct_strong1");
|
... | ... | |
435 |
440 |
p.setTextContent(regPunct);
|
436 |
441 |
tokenizer.appendChild(p);
|
437 |
442 |
}
|
438 |
|
|
|
443 |
|
439 |
444 |
/**
|
440 |
445 |
* Dump.
|
441 |
446 |
*/
|
... | ... | |
473 |
478 |
System.out.println("regElision = " + regElision + "");
|
474 |
479 |
System.out.println("regPunct = " + regPunct + "");
|
475 |
480 |
System.out.println("TESTS:");
|
476 |
|
|
|
481 |
|
477 |
482 |
for (TTest test : tests) {
|
478 |
483 |
System.out.println(" " + test + "");
|
479 |
484 |
}
|
480 |
485 |
}
|
481 |
|
|
|
486 |
|
482 |
487 |
public boolean isValid() {
|
483 |
488 |
try {
|
484 |
489 |
Pattern.compile(tag_all);
|
... | ... | |
487 |
492 |
System.out.println("tag_all=" + tag_all + ": " + e);
|
488 |
493 |
return false;
|
489 |
494 |
}
|
490 |
|
|
|
495 |
|
491 |
496 |
try {
|
492 |
497 |
Pattern.compile(enclitics);
|
493 |
498 |
}
|
... | ... | |
495 |
500 |
System.out.println("enclitics=" + enclitics + ": " + e);
|
496 |
501 |
return false;
|
497 |
502 |
}
|
498 |
|
|
|
503 |
|
499 |
504 |
try {
|
500 |
505 |
Pattern.compile(encliticsFR);
|
501 |
506 |
}
|
... | ... | |
503 |
508 |
System.out.println("encliticsFR=" + encliticsFR + ": " + e);
|
504 |
509 |
return false;
|
505 |
510 |
}
|
506 |
|
|
|
511 |
|
507 |
512 |
try {
|
508 |
513 |
Pattern.compile(div_tags);
|
509 |
514 |
}
|
... | ... | |
511 |
516 |
System.out.println("div_tags=" + div_tags + ": " + e);
|
512 |
517 |
return false;
|
513 |
518 |
}
|
514 |
|
|
|
519 |
|
515 |
520 |
try {
|
516 |
521 |
Pattern.compile(q_tags);
|
517 |
522 |
}
|
... | ... | |
519 |
524 |
System.out.println("q_tags=" + q_tags + ": " + e);
|
520 |
525 |
return false;
|
521 |
526 |
}
|
522 |
|
|
|
527 |
|
523 |
528 |
try {
|
524 |
529 |
Pattern.compile(extraword1_tags);
|
525 |
530 |
}
|
... | ... | |
527 |
532 |
System.out.println("extraword1_tags=" + extraword1_tags + ": " + e);
|
528 |
533 |
return false;
|
529 |
534 |
}
|
530 |
|
|
|
535 |
|
531 |
536 |
try {
|
532 |
537 |
Pattern.compile(corr_tags_no_seg);
|
533 |
538 |
}
|
... | ... | |
535 |
540 |
System.out.println("corr_tags_no_seg=" + corr_tags_no_seg + ": " + e);
|
536 |
541 |
return false;
|
537 |
542 |
}
|
538 |
|
|
|
543 |
|
539 |
544 |
try {
|
540 |
545 |
Pattern.compile(word_tags);
|
541 |
546 |
}
|
... | ... | |
543 |
548 |
System.out.println("word_tags=" + word_tags + ": " + e);
|
544 |
549 |
return false;
|
545 |
550 |
}
|
546 |
|
|
|
551 |
|
547 |
552 |
try {
|
548 |
553 |
Pattern.compile(intraword_tags);
|
549 |
554 |
}
|
... | ... | |
551 |
556 |
System.out.println("intraword_tags=" + intraword_tags + ": " + e);
|
552 |
557 |
return false;
|
553 |
558 |
}
|
554 |
|
|
|
559 |
|
555 |
560 |
try {
|
556 |
561 |
Pattern.compile(punct_quotes);
|
557 |
562 |
}
|
... | ... | |
559 |
564 |
System.out.println("punct_quotes=" + punct_quotes + ": " + e);
|
560 |
565 |
return false;
|
561 |
566 |
}
|
562 |
|
|
|
567 |
|
563 |
568 |
try {
|
564 |
569 |
Pattern.compile(punct_strong1);
|
565 |
570 |
}
|
... | ... | |
567 |
572 |
System.out.println("punct_strong1=" + punct_strong1 + ": " + e);
|
568 |
573 |
return false;
|
569 |
574 |
}
|
570 |
|
|
|
575 |
|
571 |
576 |
try {
|
572 |
577 |
Pattern.compile(punct_strong2);
|
573 |
578 |
}
|
... | ... | |
575 |
580 |
System.out.println("punct_strong2=" + punct_strong2 + ": " + e);
|
576 |
581 |
return false;
|
577 |
582 |
}
|
578 |
|
|
|
583 |
|
579 |
584 |
try {
|
580 |
585 |
Pattern.compile(punct_paren_open1);
|
581 |
586 |
}
|
... | ... | |
583 |
588 |
System.out.println("punct_paren_open1=" + punct_paren_open1 + ": " + e);
|
584 |
589 |
return false;
|
585 |
590 |
}
|
586 |
|
|
|
591 |
|
587 |
592 |
try {
|
588 |
593 |
Pattern.compile(punct_paren_open2);
|
589 |
594 |
}
|
... | ... | |
591 |
596 |
System.out.println("punct_paren_open2=" + punct_paren_open2 + ": " + e);
|
592 |
597 |
return false;
|
593 |
598 |
}
|
594 |
|
|
|
599 |
|
595 |
600 |
try {
|
596 |
601 |
Pattern.compile(punct_paren_close1);
|
597 |
602 |
}
|
... | ... | |
599 |
604 |
System.out.println("punct_paren_close1=" + punct_paren_close1 + ": " + e);
|
600 |
605 |
return false;
|
601 |
606 |
}
|
602 |
|
|
|
607 |
|
603 |
608 |
try {
|
604 |
609 |
Pattern.compile(punct_paren_close2);
|
605 |
610 |
}
|
... | ... | |
607 |
612 |
System.out.println("punct_paren_close2=" + punct_paren_close2 + ": " + e);
|
608 |
613 |
return false;
|
609 |
614 |
}
|
610 |
|
|
|
615 |
|
611 |
616 |
try {
|
612 |
617 |
Pattern.compile(punct_weak);
|
613 |
618 |
}
|
... | ... | |
615 |
620 |
System.out.println("punct_weak=" + punct_weak + ": " + e);
|
616 |
621 |
return false;
|
617 |
622 |
}
|
618 |
|
|
|
623 |
|
619 |
624 |
try {
|
620 |
625 |
Pattern.compile(entity);
|
621 |
626 |
}
|
... | ... | |
623 |
628 |
System.out.println("entity=" + entity + ": " + e);
|
624 |
629 |
return false;
|
625 |
630 |
}
|
626 |
|
|
|
631 |
|
627 |
632 |
try {
|
628 |
633 |
Pattern.compile(seg_tags);
|
629 |
634 |
}
|
... | ... | |
631 |
636 |
System.out.println("seg_tags=" + seg_tags + ": " + e);
|
632 |
637 |
return false;
|
633 |
638 |
}
|
634 |
|
|
|
639 |
|
635 |
640 |
try {
|
636 |
641 |
Pattern.compile(corr_tags);
|
637 |
642 |
}
|
... | ... | |
639 |
644 |
System.out.println("corr_tags: " + e);
|
640 |
645 |
return false;
|
641 |
646 |
}
|
642 |
|
|
|
647 |
|
643 |
648 |
try {
|
644 |
649 |
Pattern.compile(extraword_tags);
|
645 |
650 |
}
|
... | ... | |
647 |
652 |
System.out.println("extraword_tags: " + e);
|
648 |
653 |
return false;
|
649 |
654 |
}
|
650 |
|
|
|
655 |
|
651 |
656 |
if (punct_strong != null)
|
652 |
657 |
try {
|
653 |
|
Pattern.compile(punct_strong);
|
|
658 |
Pattern.compile(punct_strong);
|
654 |
659 |
}
|
655 |
|
catch (Exception e) {
|
|
660 |
catch (Exception e) {
|
656 |
661 |
System.out.println("punct_strong: " + e);
|
657 |
662 |
return false;
|
658 |
|
}
|
659 |
|
|
|
663 |
}
|
|
664 |
|
660 |
665 |
try {
|
661 |
666 |
Pattern.compile(punct_paren_open);
|
662 |
667 |
}
|
... | ... | |
664 |
669 |
System.out.println("punct_paren_open: " + e);
|
665 |
670 |
return false;
|
666 |
671 |
}
|
667 |
|
|
|
672 |
|
668 |
673 |
try {
|
669 |
674 |
Pattern.compile(punct_paren_close);
|
670 |
675 |
}
|
... | ... | |
672 |
677 |
System.out.println("punct_paren_close: " + e);
|
673 |
678 |
return false;
|
674 |
679 |
}
|
675 |
|
|
|
680 |
|
676 |
681 |
try {
|
677 |
682 |
Pattern.compile(punct_paren);
|
678 |
683 |
}
|
... | ... | |
680 |
685 |
System.out.println("punct_paren: " + e);
|
681 |
686 |
return false;
|
682 |
687 |
}
|
683 |
|
|
|
688 |
|
684 |
689 |
try {
|
685 |
690 |
Pattern.compile(punct_all);
|
686 |
691 |
}
|
... | ... | |
688 |
693 |
System.out.println("punct_all: " + e);
|
689 |
694 |
return false;
|
690 |
695 |
}
|
691 |
|
|
|
696 |
|
692 |
697 |
try {
|
693 |
698 |
Pattern.compile(word_chars);
|
694 |
699 |
}
|
... | ... | |
696 |
701 |
System.out.println("word_chars: " + e);
|
697 |
702 |
return false;
|
698 |
703 |
}
|
699 |
|
|
|
704 |
|
700 |
705 |
if (regPunct != null)
|
701 |
706 |
try {
|
702 |
|
Pattern.compile(regPunct);
|
|
707 |
Pattern.compile(regPunct);
|
703 |
708 |
}
|
704 |
|
catch (Exception e) {
|
|
709 |
catch (Exception e) {
|
705 |
710 |
System.out.println("regPunct=" + regPunct + ": " + e);
|
706 |
711 |
return false;
|
707 |
|
}
|
708 |
|
|
|
712 |
}
|
|
713 |
|
709 |
714 |
if (regElision != null)
|
710 |
715 |
try {
|
711 |
|
Pattern.compile(regElision);
|
|
716 |
Pattern.compile(regElision);
|
712 |
717 |
}
|
713 |
|
catch (Exception e) {
|
|
718 |
catch (Exception e) {
|
714 |
719 |
System.out.println("regElision=" + regElision + ": " + e);
|
715 |
720 |
return false;
|
716 |
|
}
|
717 |
|
|
|
721 |
}
|
|
722 |
|
718 |
723 |
if (whitespaces != null)
|
719 |
724 |
try {
|
720 |
|
Pattern.compile(whitespaces);
|
|
725 |
Pattern.compile(whitespaces);
|
721 |
726 |
}
|
722 |
|
catch (Exception e) {
|
|
727 |
catch (Exception e) {
|
723 |
728 |
System.out.println("whitespaces=" + whitespaces + ": " + e);
|
724 |
729 |
return false;
|
725 |
|
}
|
726 |
|
|
|
730 |
}
|
|
731 |
|
727 |
732 |
for (TTest test : tests) {
|
728 |
733 |
try {
|
729 |
734 |
Pattern.compile(test.getRegex());
|
... | ... | |
735 |
740 |
}
|
736 |
741 |
return true;
|
737 |
742 |
}
|
738 |
|
|
|
743 |
|
739 |
744 |
/**
|
740 |
745 |
* Old way of loading the parameters, used in TXM 0.7.9 and in the corpus 0.7.9 -> 0.8.0 restoration.
|
741 |
746 |
*
|
... | ... | |
753 |
758 |
String value = param.getAttribute("value");
|
754 |
759 |
if (value == null || value.length() == 0) value = param.getTextContent();
|
755 |
760 |
if (value.length() == 0) value = null;
|
756 |
|
|
|
761 |
|
757 |
762 |
if (debug) System.out.println(" Tokenizer parametrized with " + key + "=" + value + "");
|
758 |
|
|
|
763 |
|
759 |
764 |
if (key.equals("tag_all"))
|
760 |
765 |
tag_all = value;
|
761 |
766 |
else if (key.equals("enclitics"))
|
... | ... | |
808 |
813 |
// recombine
|
809 |
814 |
recombine();
|
810 |
815 |
if (tmp_strong_punct != null) punct_strong = tmp_strong_punct; // this is temporary
|
811 |
|
|
|
816 |
|
812 |
817 |
String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests");
|
813 |
818 |
if ("true".equals(shouldResetTests)) {
|
814 |
819 |
System.out.println("Warning: tokenizer only using import parameters tests");
|
815 |
820 |
// tests = new ArrayList<>();
|
816 |
821 |
}
|
817 |
|
|
|
822 |
|
818 |
823 |
NodeList testsList = tokenizerElement.getElementsByTagName("test");
|
819 |
824 |
if (testsList.getLength() > 0) {
|
820 |
825 |
// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
|
... | ... | |
828 |
833 |
}
|
829 |
834 |
// System.out.println("Tests: "+tests);
|
830 |
835 |
}
|
831 |
|
|
|
836 |
|
832 |
837 |
if (isValid())
|
833 |
838 |
return true;
|
834 |
839 |
else {
|
... | ... | |
837 |
842 |
return false;
|
838 |
843 |
}
|
839 |
844 |
}
|
840 |
|
|
|
845 |
|
841 |
846 |
/**
|
842 |
847 |
* The main method.
|
843 |
848 |
*
|
... | ... | |
847 |
852 |
* @throws ParserConfigurationException
|
848 |
853 |
*/
|
849 |
854 |
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
// Currently a no-op: the manual check below (load an import.xml, locate its
// tokenizer element and print the result of loadFromNode) is commented out.
|
850 |
|
|
|
855 |
|
851 |
856 |
// Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
|
852 |
857 |
// Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
|
853 |
858 |
// Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
|
854 |
859 |
// Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
|
855 |
860 |
// System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
|
856 |
861 |
}
|
857 |
|
|
|
862 |
|
858 |
863 |
public boolean loadFromProject(ProjectScope projectScope) {
|
859 |
864 |
// load params
|
860 |
865 |
IEclipsePreferences params = projectScope.getNode("Tokenizer");
|
861 |
|
|
862 |
866 |
try {
|
863 |
|
|
|
867 |
if (debug) System.out.println(Arrays.toString(params.keys()));
|
|
868 |
if (debug) System.out.println(Arrays.toString(params.childrenNames()));
|
|
869 |
} catch (BackingStoreException e1) {
|
|
870 |
// TODO Auto-generated catch block
|
|
871 |
e1.printStackTrace();
|
|
872 |
}
|
|
873 |
try {
|
|
874 |
|
864 |
875 |
Field[] fields = TokenizerClasses.class.getFields();
|
865 |
876 |
for (Field field : fields) {
|
866 |
877 |
int m = field.getModifiers();
|
867 |
878 |
if (!Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) {
|
868 |
879 |
String name = field.getName();
|
869 |
|
String value = params.get(name, null);
|
|
880 |
String defaultValue = null;
|
|
881 |
try {
|
|
882 |
if (field.get(this) != null) {
|
|
883 |
defaultValue = field.get(this).toString();
|
|
884 |
}
|
|
885 |
} catch (Exception e) {
|
|
886 |
e.printStackTrace();
|
|
887 |
}
|
|
888 |
String value = params.get(name, defaultValue);
|
870 |
889 |
if (value != null) {
|
871 |
890 |
try {
|
872 |
891 |
if (debug) System.out.println(" Tokenizer parametrized with " + name + "=" + value);
|
... | ... | |
878 |
897 |
}
|
879 |
898 |
}
|
880 |
899 |
}
|
881 |
|
// // old way, now using Reflection
|
882 |
|
|
883 |
|
// String[] keys = params.keys();
|
884 |
|
// for (String key : keys) {
|
885 |
|
// // Element param = (Element) params.item(i);
|
886 |
|
// // String key = param.getAttribute("name");
|
887 |
|
// String value = params.get(key, "");
|
888 |
|
// // if (value == null || value.length() == 0) value = param.getTextContent();
|
889 |
|
// if (value.length() == 0) value = null;
|
890 |
|
//
|
891 |
|
// if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
|
892 |
|
//
|
893 |
|
// if (key.equals("tag_all"))
|
894 |
|
// tag_all = value;
|
895 |
|
// else if (key.equals("enclitics"))
|
896 |
|
// enclitics = value;
|
897 |
|
// else if (key.equals("encliticsFR"))
|
898 |
|
// encliticsFR = value;
|
899 |
|
// else if (key.equals("div_tags"))
|
900 |
|
// div_tags = value;
|
901 |
|
// else if (key.equals("q_tags"))
|
902 |
|
// q_tags = value;
|
903 |
|
// else if (key.equals("extraword1_tags"))
|
904 |
|
// extraword1_tags = value;
|
905 |
|
// else if (key.equals("corr_tags_no_seg"))
|
906 |
|
// corr_tags_no_seg = value;
|
907 |
|
// else if (key.equals("word_tags")) {
|
908 |
|
// word_tags = value;
|
909 |
|
// word_element_to_create = value; // FIXME for now
|
910 |
|
// } else if (key.equals("intraword_tags"))
|
911 |
|
// intraword_tags = value;
|
912 |
|
// else if (key.equals("punct_quotes"))
|
913 |
|
// punct_quotes = value;
|
914 |
|
// else if (key.equals("punct_strong1"))
|
915 |
|
// punct_strong1 = value;
|
916 |
|
// else if (key.equals("punct_strong2"))
|
917 |
|
// punct_strong2 = value;
|
918 |
|
// else if (key.equals("punct_paren_open1"))
|
919 |
|
// punct_paren_open1 = value;
|
920 |
|
// else if (key.equals("punct_paren_open2"))
|
921 |
|
// punct_paren_open2 = value;
|
922 |
|
// else if (key.equals("punct_paren_close1"))
|
923 |
|
// punct_paren_close1 = value;
|
924 |
|
// else if (key.equals("punct_paren_close2"))
|
925 |
|
// punct_paren_close2 = value;
|
926 |
|
// else if (key.equals("punct_weak"))
|
927 |
|
// punct_weak = value;
|
928 |
|
// else if (key.equals("entity"))
|
929 |
|
// entity = value;
|
930 |
|
// else if (key.equals("seg_tags"))
|
931 |
|
// seg_tags = value;
|
932 |
|
// else if (key.equals("regPunct"))
|
933 |
|
// regPunct = value;
|
934 |
|
// else if (key.equals("regElision"))
|
935 |
|
// regElision = value;
|
936 |
|
// else if (key.equals("whitespaces"))
|
937 |
|
// whitespaces = value;
|
938 |
|
// else if (key.equals("punct_strong")) // this is temporary
|
939 |
|
// tmp_strong_punct = value; // this is temporary
|
940 |
|
//// else
|
941 |
|
//// System.out.println("MISSING TOKENIZER KEY: "+key);
|
942 |
|
// }
|
943 |
|
|
|
900 |
|
944 |
901 |
// recombine
|
945 |
902 |
recombine();
|
946 |
|
|
|
903 |
|
947 |
904 |
if (params.getBoolean("onlyThoseTests", false)) {
|
948 |
905 |
System.out.println("Warning: tokenizer only using import parameters tests");
|
949 |
906 |
// tests = new ArrayList<>();
|
950 |
907 |
}
|
951 |
|
|
|
908 |
|
952 |
909 |
org.osgi.service.prefs.Preferences testsList = params.node("tests");
|
953 |
910 |
String[] tests;
|
954 |
911 |
tests = testsList.childrenNames();
|
955 |
912 |
// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
|
956 |
913 |
for (String testname : tests) {
|
957 |
914 |
org.osgi.service.prefs.Preferences testdef = testsList.node(testname);
|
958 |
|
|
|
915 |
|
959 |
916 |
// Element test = (Element) testsList.item(i);
|
960 |
917 |
TTest t = new TTest(testdef.get("content", null), testdef.get("type", null),
|
961 |
918 |
testdef.getInt("before", 0),
|
... | ... | |
963 |
920 |
testdef.getInt("after", 0));
|
964 |
921 |
}
|
965 |
922 |
// System.out.println("Tests: "+tests);
|
966 |
|
|
|
923 |
|
967 |
924 |
}
|
968 |
925 |
catch (BackingStoreException e) {
|
969 |
926 |
// TODO Auto-generated catch block
|
970 |
927 |
e.printStackTrace();
|
971 |
928 |
}
|
972 |
|
|
|
929 |
|
973 |
930 |
if (isValid()) {
|
974 |
931 |
return true;
|
975 |
932 |
}
|