263 |
263 |
}
|
264 |
264 |
|
265 |
265 |
public void recombine() {
|
266 |
|
corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + "";
|
267 |
|
extraword_tags = "" + div_tags + "|" + q_tags + "|" + extraword1_tags + "";
|
268 |
|
punct_strong = "[" + punct_strong1 + "]|" + punct_strong2 + "";
|
269 |
|
punct_paren_open = "" + punct_paren_open1 + "|" + punct_paren_open2 + "";
|
270 |
|
punct_paren_close = "" + punct_paren_close1 + "|" + punct_paren_close2 + "";
|
271 |
|
punct_paren = "" + punct_paren_open + "|" + punct_paren_close + "";
|
272 |
|
punct_all = "" + punct_strong + "|" + punct_paren + "|[" + punct_weak + "]";
|
273 |
|
word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + "";
|
|
266 |
|
|
267 |
if (corr_tags == null) {
|
|
268 |
corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + "";
|
|
269 |
}
|
|
270 |
if (extraword_tags == null) {
|
|
271 |
extraword_tags = "" + div_tags + "|" + q_tags + "|" + extraword1_tags + "";
|
|
272 |
}
|
|
273 |
if (punct_strong == null) {
|
|
274 |
punct_strong = "[" + punct_strong1 + "]|" + punct_strong2 + "";
|
|
275 |
}
|
|
276 |
if (punct_paren_open == null) {
|
|
277 |
punct_paren_open = "" + punct_paren_open1 + "|" + punct_paren_open2 + "";
|
|
278 |
}
|
|
279 |
if (punct_paren_close == null) {
|
|
280 |
punct_paren_close = "" + punct_paren_close1 + "|" + punct_paren_close2 + "";
|
|
281 |
}
|
|
282 |
if (punct_paren == null) {
|
|
283 |
punct_paren = "" + punct_paren_open + "|" + punct_paren_close + "";
|
|
284 |
}
|
|
285 |
if (punct_all == null) {
|
|
286 |
punct_all = "" + punct_strong + "|" + punct_paren + "|[" + punct_weak + "]";
|
|
287 |
}
|
|
288 |
if (word_chars == null) {
|
|
289 |
word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + "";
|
|
290 |
}
|
274 |
291 |
}
|
275 |
292 |
|
276 |
293 |
/**
|
... | ... | |
455 |
472 |
System.out.println("whitespaces = " + whitespaces + "");
|
456 |
473 |
System.out.println("regElision = " + regElision + "");
|
457 |
474 |
System.out.println("regPunct = " + regPunct + "");
|
458 |
|
System.out.println("TESTS");
|
|
475 |
System.out.println("TESTS:");
|
459 |
476 |
|
460 |
477 |
for (TTest test : tests) {
|
461 |
478 |
System.out.println(" " + test + "");
|
... | ... | |
847 |
864 |
Field[] fields = TokenizerClasses.class.getFields();
|
848 |
865 |
for (Field field : fields) {
|
849 |
866 |
int m = field.getModifiers();
|
850 |
|
if (Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) {
|
|
867 |
if (!Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) {
|
851 |
868 |
String name = field.getName();
|
852 |
869 |
String value = params.get(name, null);
|
853 |
870 |
if (value != null) {
|
854 |
871 |
try {
|
855 |
872 |
if (debug) System.out.println(" Tokenizer parametrized with " + name + "=" + value);
|
856 |
|
field.set(field, value);
|
|
873 |
field.set(this, value);
|
857 |
874 |
}
|
858 |
875 |
catch (Exception e) {
|
859 |
876 |
Log.printStackTrace(e);
|