Revision 2390
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 2390) | ||
---|---|---|
30 | 30 |
import java.io.File; |
31 | 31 |
import java.io.FileWriter; |
32 | 32 |
import java.io.IOException; |
33 |
import java.lang.reflect.Field; |
|
34 |
import java.lang.reflect.Modifier; |
|
33 | 35 |
import java.util.ArrayList; |
34 | 36 |
import java.util.List; |
35 | 37 |
import java.util.prefs.Preferences; |
... | ... | |
49 | 51 |
import javax.xml.stream.*; |
50 | 52 |
import java.net.URL; |
51 | 53 |
|
52 |
// TODO: Auto-generated Javadoc |
|
53 | 54 |
/** |
54 |
* The Class TokenizerClasses.
|
|
55 |
* Holder for the regular expressions used as tokenizer rules.
|
|
55 | 56 |
*/ |
56 | 57 |
public class TokenizerClasses { |
57 | 58 |
|
58 | 59 |
//// ROOTS ? //// |
59 | 60 |
public static boolean debug = false; |
60 | 61 |
/** The tag_all. */ |
61 |
public static String tag_all = "<[A-Za-z][^>]+>";
|
|
62 |
public static String tag_all = null;
|
|
62 | 63 |
|
63 | 64 |
/** The Weblex enclitics. */ |
64 |
public static String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
|
|
65 |
public static String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
|
|
65 |
public static String enclitics = null;
|
|
66 |
public static String encliticsFR = null;
|
|
66 | 67 |
|
67 | 68 |
/** The TT enclitics. */ |
68 |
public static String FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
|
|
69 |
public static String PClitic_fr = "[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
|
|
70 |
public static String FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
|
|
71 |
public static String PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
|
|
72 |
public static String FClitic_gl = "-la|-las|-lo|-los|-nos";
|
|
69 |
public static String FClitic_en = null;
|
|
70 |
public static String PClitic_fr = null;
|
|
71 |
public static String FClitic_fr = null;
|
|
72 |
public static String PClitic_it = null;
|
|
73 |
public static String FClitic_gl = null;
|
|
73 | 74 |
|
74 | 75 |
/** The div_tags. */ |
75 |
public static String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
|
|
76 |
public static String div_tags = null;
|
|
76 | 77 |
|
77 | 78 |
/** The q_tags. */ |
78 |
public static String q_tags = "q|quote|said|item|stage|cit|label|heraldry";
|
|
79 |
public static String q_tags = null;
|
|
79 | 80 |
|
80 | 81 |
/** The extraword_tags. */ |
81 |
public static String extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
|
|
82 |
public static String extraword1_tags = null;
|
|
82 | 83 |
|
83 | 84 |
/** The corr_tags_no_seg. */ |
84 |
public static String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
|
|
85 |
public static String corr_tags_no_seg = null;
|
|
85 | 86 |
|
86 | 87 |
/** The word_tags. */ |
87 |
public static String word_tags = "w";
|
|
88 |
public static String word_tags = null;
|
|
88 | 89 |
/** The element to create when a word is created. */ |
89 |
public static String word_element_to_create = "w";
|
|
90 |
public static String word_element_to_create = null;
|
|
90 | 91 |
|
91 | 92 |
/** The intraword_tags. */ |
92 |
public static String intraword_tags = "c|ex|caesura";
|
|
93 |
public static String intraword_tags = null;
|
|
93 | 94 |
|
94 | 95 |
/** The punct_quotes. */ |
95 |
public static String punct_quotes = "'‘’’";
|
|
96 |
public static String punct_quotes = null;
|
|
96 | 97 |
|
97 | 98 |
/** The punct_strong1. */ |
98 |
public static String punct_strong1 = ".!?";
|
|
99 |
public static String punct_strong1 = null;
|
|
99 | 100 |
|
100 | 101 |
/** The punct_strong2. */ |
101 |
public static String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
|
|
102 |
public static String punct_strong2 = null;
|
|
102 | 103 |
|
103 | 104 |
/** The punct_paren_open1. */ |
104 |
public static String punct_paren_open1 = "«";
|
|
105 |
public static String punct_paren_open1 = null;
|
|
105 | 106 |
|
106 | 107 |
/** The punct_paren_open2. */ |
107 |
public static String punct_paren_open2 = "``|\\(|\\[|\\{";
|
|
108 |
public static String punct_paren_open2 = null;
|
|
108 | 109 |
|
109 | 110 |
/** The punct_paren_close1. */ |
110 |
public static String punct_paren_close1 = "»";
|
|
111 |
public static String punct_paren_close1 = null;
|
|
111 | 112 |
|
112 | 113 |
/** The punct_paren_close2. */ |
113 |
public static String punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
|
|
114 |
public static String punct_paren_close2 = null;
|
|
114 | 115 |
|
115 | 116 |
/** The punct_weak. */ |
116 |
public static String punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
|
|
117 |
public static String punct_weak = null;
|
|
117 | 118 |
|
118 |
public static String entity = "&[^;]+;";
|
|
119 |
public static String entity = null;
|
|
119 | 120 |
|
120 |
public static String seg_tags = "seg";
|
|
121 |
public static String seg_tags = null;
|
|
121 | 122 |
|
122 | 123 |
//// COMBINATIONS //// |
123 | 124 |
|
124 | 125 |
/** The corr_tags. */ |
125 |
public static String corr_tags = ""+corr_tags_no_seg+"|"+seg_tags+"";
|
|
126 |
public static String corr_tags = null;
|
|
126 | 127 |
/** The extraword_tags. */ |
127 |
public static String extraword_tags = ""+div_tags+"|"+q_tags+"|"+extraword1_tags+"";
|
|
128 |
public static String extraword_tags = null;
|
|
128 | 129 |
|
129 | 130 |
/** The punct_strong. */ |
130 |
public static String punct_strong = "["+punct_strong1+"]+|"+punct_strong2+"";
|
|
131 |
public static String punct_strong = null;
|
|
131 | 132 |
|
132 | 133 |
/** The punct_paren_open. */ |
133 |
public static String punct_paren_open = ""+punct_paren_open1+"|"+punct_paren_open2+"";
|
|
134 |
public static String punct_paren_open = null;
|
|
134 | 135 |
|
135 | 136 |
/** The punct_paren_close. */ |
136 |
public static String punct_paren_close = ""+punct_paren_close1+"|"+punct_paren_close2+"";
|
|
137 |
public static String punct_paren_close = null;
|
|
137 | 138 |
|
138 | 139 |
/** The punct_paren. */ |
139 |
public static String punct_paren = ""+punct_paren_open+"|"+punct_paren_close+"";
|
|
140 |
public static String punct_paren = null;
|
|
140 | 141 |
|
141 | 142 |
/** The punct_all. */ |
142 |
public static String punct_all = ""+punct_strong+"|"+punct_paren+"|["+punct_weak+"]+";
|
|
143 |
public static String punct_all = null;
|
|
143 | 144 |
|
144 | 145 |
/** The word_chars. */ |
145 |
public static String word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]+|"+entity+"";
|
|
146 |
public static String word_chars = null;
|
|
146 | 147 |
|
147 | 148 |
public static TTest[] tests = { |
148 | 149 |
// new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/), |
... | ... | |
168 | 169 |
// new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon") |
169 | 170 |
}; |
170 | 171 |
|
171 |
//tag_all = "<[^>]+>"; |
|
172 |
//enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO |
|
173 |
//encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR |
|
174 |
//div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl"; |
|
175 |
//q_tags = "q|quote|item|stage"; |
|
176 |
//extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail"; |
|
177 |
//corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno"; |
|
178 |
//corr_tags = "$corr_tags_no_seg|seg"; |
|
179 |
//word_tags = "w|abbr|num"; |
|
180 |
//intraword_tags = "c|ex"; |
|
172 |
public static String whitespaces = null; |
|
173 |
public static String regElision = null; |
|
174 |
public static String regPunct = null; |
|
175 |
|
|
176 |
static { |
|
177 |
reset(); // Initialize static fields |
|
178 |
} |
|
181 | 179 |
|
182 |
public static String whitespaces = "[\\p{Z}\\p{C}]+"; |
|
183 |
public static String regElision = "['‘’]"; // obsolete |
|
184 |
public static String regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"; |
|
185 |
|
|
186 | 180 |
//punct_strong1 = ".!?"; |
187 | 181 |
//punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|"; |
188 | 182 |
//punct_strong = "[$punct_strong1]|$punct_strong2"; |
... | ... | |
198 | 192 |
//word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;"; |
199 | 193 |
|
200 | 194 |
public static void reset() { |
201 |
tag_all = "<[^>]+>"; |
|
202 |
enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO |
|
203 |
encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR |
|
204 |
div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl"; |
|
205 |
q_tags = "q|quote|item|stage|cit"; |
|
195 |
tag_all = "<[A-Za-z][^>]+>"; |
|
196 |
|
|
197 |
div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline"; |
|
198 |
q_tags = "q|quote|said|item|stage|cit|label|heraldry"; |
|
206 | 199 |
extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail"; |
207 | 200 |
corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus"; |
208 | 201 |
word_tags = "w"; |
202 |
word_element_to_create = "w"; |
|
209 | 203 |
intraword_tags = "c|ex|caesura"; |
210 | 204 |
punct_quotes = "'‘’’"; |
211 | 205 |
punct_strong1 = ".!?"; |
... | ... | |
214 | 208 |
punct_paren_open2 = "``|\\(|\\[|\\{"; |
215 | 209 |
punct_paren_close1 = "»"; |
216 | 210 |
punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}"; |
217 |
punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\-≤≥<>\\—ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
|
|
211 |
punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
|
|
218 | 212 |
entity = "&[^;]+;"; |
219 | 213 |
seg_tags = "seg"; |
214 |
|
|
215 |
enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO |
|
216 |
encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR |
|
217 |
|
|
218 |
/** The TT enclitics. */ |
|
219 |
FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t"; |
|
220 |
PClitic_fr = "[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]"; |
|
221 |
FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là"; |
|
222 |
PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]"; |
|
223 |
FClitic_gl = "-la|-las|-lo|-los|-nos"; |
|
220 | 224 |
|
221 | 225 |
whitespaces = "[\\p{Z}\\p{C}]+"; |
222 | 226 |
regElision = "['‘’]"; |
... | ... | |
233 | 237 |
punct_paren_close = ""+punct_paren_close1+"|"+punct_paren_close2+""; |
234 | 238 |
punct_paren = ""+punct_paren_open+"|"+punct_paren_close+""; |
235 | 239 |
punct_all = ""+punct_strong+"|"+punct_paren+"|["+punct_weak+"]"; |
236 |
word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]|"+entity+""; |
|
240 |
word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]+|"+entity+"";
|
|
237 | 241 |
} |
238 | 242 |
|
239 | 243 |
/** |
... | ... | |
243 | 247 |
* @param parent the parent |
244 | 248 |
* @return the java.lang. object |
245 | 249 |
*/ |
250 |
@Deprecated |
|
246 | 251 |
public static void toDom(Document doc, Node parent) { |
247 | 252 |
Element tokenizer = doc.createElement("tokenizer"); |
248 | 253 |
tokenizer.setAttribute("onlyThoseTests", "false"); |
249 | 254 |
parent.appendChild(tokenizer); |
250 |
|
|
255 |
|
|
251 | 256 |
//String tag_all = "<[^>]+>"; |
252 | 257 |
Element p = doc.createElement("param"); |
253 | 258 |
p.setAttribute("key","tag_all"); |
... | ... | |
526 | 531 |
return true; |
527 | 532 |
} |
528 | 533 |
|
534 |
/** |
|
535 |
* Legacy loader: used by TXM 0.7.9 and when restoring a 0.7.9 corpus into TXM 0.8.0. |
|
536 |
* @param tokenizerElement |
|
537 |
* @return |
|
538 |
*/ |
|
539 |
@Deprecated |
|
529 | 540 |
public static boolean loadFromNode(Element tokenizerElement) { |
530 | 541 |
//load params |
531 | 542 |
String tmp_strong_punct = null; |
... | ... | |
631 | 642 |
*/ |
632 | 643 |
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { |
633 | 644 |
|
634 |
Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml")); |
|
635 |
Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0); |
|
636 |
Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0); |
|
637 |
Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0); |
|
638 |
System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer)); |
|
645 |
// Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
|
|
646 |
// Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
|
|
647 |
// Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
|
|
648 |
// Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
|
|
649 |
// System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
|
|
639 | 650 |
} |
640 | 651 |
|
641 | 652 |
public static boolean loadFromProject(ProjectScope projectScope) { |
642 | 653 |
//load params |
643 |
String tmp_strong_punct = null; |
|
644 | 654 |
IEclipsePreferences params = projectScope.getNode("Tokenizer"); |
655 |
|
|
645 | 656 |
try { |
646 |
String[] keys = params.keys(); |
|
647 |
for (String key : keys) { |
|
648 |
// Element param = (Element) params.item(i); |
|
649 |
// String key = param.getAttribute("name"); |
|
650 |
String value = params.get(key, ""); |
|
651 |
// if (value == null || value.length() == 0) value = param.getTextContent(); |
|
652 |
if (value.length() == 0) value = null; |
|
653 |
|
|
654 |
if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+""); |
|
655 |
|
|
656 |
if (key.equals("tag_all")) |
|
657 |
tag_all = value; |
|
658 |
else if (key.equals("enclitics")) |
|
659 |
enclitics = value; |
|
660 |
else if (key.equals("encliticsFR")) |
|
661 |
encliticsFR = value; |
|
662 |
else if (key.equals("div_tags")) |
|
663 |
div_tags = value; |
|
664 |
else if (key.equals("q_tags")) |
|
665 |
q_tags = value; |
|
666 |
else if (key.equals("extraword1_tags")) |
|
667 |
extraword1_tags = value; |
|
668 |
else if (key.equals("corr_tags_no_seg")) |
|
669 |
corr_tags_no_seg = value; |
|
670 |
else if (key.equals("word_tags")) { |
|
671 |
word_tags = value; |
|
672 |
word_element_to_create = value; // FIXME for now |
|
673 |
} else if (key.equals("intraword_tags")) |
|
674 |
intraword_tags = value; |
|
675 |
else if (key.equals("punct_quotes")) |
|
676 |
punct_quotes = value; |
|
677 |
else if (key.equals("punct_strong1")) |
|
678 |
punct_strong1 = value; |
|
679 |
else if (key.equals("punct_strong2")) |
|
680 |
punct_strong2 = value; |
|
681 |
else if (key.equals("punct_paren_open1")) |
|
682 |
punct_paren_open1 = value; |
|
683 |
else if (key.equals("punct_paren_open2")) |
|
684 |
punct_paren_open2 = value; |
|
685 |
else if (key.equals("punct_paren_close1")) |
|
686 |
punct_paren_close1 = value; |
|
687 |
else if (key.equals("punct_paren_close2")) |
|
688 |
punct_paren_close2 = value; |
|
689 |
else if (key.equals("punct_weak")) |
|
690 |
punct_weak = value; |
|
691 |
else if (key.equals("entity")) |
|
692 |
entity = value; |
|
693 |
else if (key.equals("seg_tags")) |
|
694 |
seg_tags = value; |
|
695 |
else if (key.equals("regPunct")) |
|
696 |
regPunct = value; |
|
697 |
else if (key.equals("regElision")) |
|
698 |
regElision = value; |
|
699 |
else if (key.equals("whitespaces")) |
|
700 |
whitespaces = value; |
|
701 |
else if (key.equals("punct_strong")) // this is temporary |
|
702 |
tmp_strong_punct = value; // this is temporary |
|
703 |
// else |
|
704 |
// System.out.println("MISSING TOKENIZER KEY: "+key); |
|
657 |
|
|
658 |
Field[] fields = TokenizerClasses.class.getFields(); |
|
659 |
for (Field field : fields) { |
|
660 |
int m = field.getModifiers(); |
|
661 |
if (Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) { |
|
662 |
String name = field.getName(); |
|
663 |
String value = params.get(name, null); |
|
664 |
if (value != null) { |
|
665 |
try { |
|
666 |
if (debug) System.out.println(" Tokenizer parametrized with "+name+"="+value); |
|
667 |
field.set(field, value); |
|
668 |
} |
|
669 |
catch (IllegalArgumentException e) { |
|
670 |
// TODO Auto-generated catch block |
|
671 |
e.printStackTrace(); |
|
672 |
} |
|
673 |
catch (IllegalAccessException e) { |
|
674 |
// TODO Auto-generated catch block |
|
675 |
e.printStackTrace(); |
|
676 |
} |
|
677 |
} |
|
678 |
} |
|
705 | 679 |
} |
680 |
// // old way, now using Reflection |
|
681 |
|
|
682 |
// String[] keys = params.keys(); |
|
683 |
// for (String key : keys) { |
|
684 |
// // Element param = (Element) params.item(i); |
|
685 |
// // String key = param.getAttribute("name"); |
|
686 |
// String value = params.get(key, ""); |
|
687 |
// // if (value == null || value.length() == 0) value = param.getTextContent(); |
|
688 |
// if (value.length() == 0) value = null; |
|
689 |
// |
|
690 |
// if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+""); |
|
691 |
// |
|
692 |
// if (key.equals("tag_all")) |
|
693 |
// tag_all = value; |
|
694 |
// else if (key.equals("enclitics")) |
|
695 |
// enclitics = value; |
|
696 |
// else if (key.equals("encliticsFR")) |
|
697 |
// encliticsFR = value; |
|
698 |
// else if (key.equals("div_tags")) |
|
699 |
// div_tags = value; |
|
700 |
// else if (key.equals("q_tags")) |
|
701 |
// q_tags = value; |
|
702 |
// else if (key.equals("extraword1_tags")) |
|
703 |
// extraword1_tags = value; |
|
704 |
// else if (key.equals("corr_tags_no_seg")) |
|
705 |
// corr_tags_no_seg = value; |
|
706 |
// else if (key.equals("word_tags")) { |
|
707 |
// word_tags = value; |
|
708 |
// word_element_to_create = value; // FIXME for now |
|
709 |
// } else if (key.equals("intraword_tags")) |
|
710 |
// intraword_tags = value; |
|
711 |
// else if (key.equals("punct_quotes")) |
|
712 |
// punct_quotes = value; |
|
713 |
// else if (key.equals("punct_strong1")) |
|
714 |
// punct_strong1 = value; |
|
715 |
// else if (key.equals("punct_strong2")) |
|
716 |
// punct_strong2 = value; |
|
717 |
// else if (key.equals("punct_paren_open1")) |
|
718 |
// punct_paren_open1 = value; |
|
719 |
// else if (key.equals("punct_paren_open2")) |
|
720 |
// punct_paren_open2 = value; |
|
721 |
// else if (key.equals("punct_paren_close1")) |
|
722 |
// punct_paren_close1 = value; |
|
723 |
// else if (key.equals("punct_paren_close2")) |
|
724 |
// punct_paren_close2 = value; |
|
725 |
// else if (key.equals("punct_weak")) |
|
726 |
// punct_weak = value; |
|
727 |
// else if (key.equals("entity")) |
|
728 |
// entity = value; |
|
729 |
// else if (key.equals("seg_tags")) |
|
730 |
// seg_tags = value; |
|
731 |
// else if (key.equals("regPunct")) |
|
732 |
// regPunct = value; |
|
733 |
// else if (key.equals("regElision")) |
|
734 |
// regElision = value; |
|
735 |
// else if (key.equals("whitespaces")) |
|
736 |
// whitespaces = value; |
|
737 |
// else if (key.equals("punct_strong")) // this is temporary |
|
738 |
// tmp_strong_punct = value; // this is temporary |
|
739 |
//// else |
|
740 |
//// System.out.println("MISSING TOKENIZER KEY: "+key); |
|
741 |
// } |
|
706 | 742 |
|
707 | 743 |
//recombine |
708 | 744 |
recombine(); |
709 |
if (tmp_strong_punct != null) punct_strong = tmp_strong_punct; // this is temporary |
|
710 | 745 |
|
711 | 746 |
if (params.getBoolean("onlyThoseTests", false)) { |
712 | 747 |
System.out.println("Warning: tokenizer only using import parameters tests"); |
713 | 748 |
//tests = new ArrayList<>(); |
714 | 749 |
} |
715 | 750 |
|
716 |
org.osgi.service.prefs.Preferences testsList = params.node("test"); |
|
751 |
org.osgi.service.prefs.Preferences testsList = params.node("tests");
|
|
717 | 752 |
String[] tests; |
718 | 753 |
tests = testsList.childrenNames(); |
719 | 754 |
//System.out.println("Add "+testsList.getLength()+" tests to the tokenizer" |
... | ... | |
733 | 768 |
e.printStackTrace(); |
734 | 769 |
} |
735 | 770 |
|
736 |
if (isValid()) |
|
771 |
if (isValid()) {
|
|
737 | 772 |
return true; |
738 |
else { |
|
773 |
} else {
|
|
739 | 774 |
reset(); |
740 | 775 |
recombine(); |
741 | 776 |
return false; |
Also available in: Unified diff