Révision 2390
| tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 2390) | ||
|---|---|---|
| 30 | 30 |
import java.io.File; |
| 31 | 31 |
import java.io.FileWriter; |
| 32 | 32 |
import java.io.IOException; |
| 33 |
import java.lang.reflect.Field; |
|
| 34 |
import java.lang.reflect.Modifier; |
|
| 33 | 35 |
import java.util.ArrayList; |
| 34 | 36 |
import java.util.List; |
| 35 | 37 |
import java.util.prefs.Preferences; |
| ... | ... | |
| 49 | 51 |
import javax.xml.stream.*; |
| 50 | 52 |
import java.net.URL; |
| 51 | 53 |
|
| 52 |
// TODO: Auto-generated Javadoc |
|
| 53 | 54 |
/** |
| 54 |
* The Class TokenizerClasses.
|
|
| 55 |
* Holder for the regular expressions used by the tokenizer rules.
|
|
| 55 | 56 |
*/ |
| 56 | 57 |
public class TokenizerClasses {
|
| 57 | 58 |
|
| 58 | 59 |
//// ROOTS ? //// |
| 59 | 60 |
public static boolean debug = false; |
| 60 | 61 |
/** The tag_all. */ |
| 61 |
public static String tag_all = "<[A-Za-z][^>]+>";
|
|
| 62 |
public static String tag_all = null;
|
|
| 62 | 63 |
|
| 63 | 64 |
/** The Weblex enclitics. */ |
| 64 |
public static String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
|
|
| 65 |
public static String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
|
|
| 65 |
public static String enclitics = null;
|
|
| 66 |
public static String encliticsFR = null;
|
|
| 66 | 67 |
|
| 67 | 68 |
/** The TT enclitics. */ |
| 68 |
public static String FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
|
|
| 69 |
public static String PClitic_fr = "[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
|
|
| 70 |
public static String FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
|
|
| 71 |
public static String PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
|
|
| 72 |
public static String FClitic_gl = "-la|-las|-lo|-los|-nos";
|
|
| 69 |
public static String FClitic_en = null;
|
|
| 70 |
public static String PClitic_fr = null;
|
|
| 71 |
public static String FClitic_fr = null;
|
|
| 72 |
public static String PClitic_it = null;
|
|
| 73 |
public static String FClitic_gl = null;
|
|
| 73 | 74 |
|
| 74 | 75 |
/** The div_tags. */ |
| 75 |
public static String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
|
|
| 76 |
public static String div_tags = null;
|
|
| 76 | 77 |
|
| 77 | 78 |
/** The q_tags. */ |
| 78 |
public static String q_tags = "q|quote|said|item|stage|cit|label|heraldry";
|
|
| 79 |
public static String q_tags = null;
|
|
| 79 | 80 |
|
| 80 | 81 |
/** The extraword_tags. */ |
| 81 |
public static String extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
|
|
| 82 |
public static String extraword1_tags = null;
|
|
| 82 | 83 |
|
| 83 | 84 |
/** The corr_tags_no_seg. */ |
| 84 |
public static String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
|
|
| 85 |
public static String corr_tags_no_seg = null;
|
|
| 85 | 86 |
|
| 86 | 87 |
/** The word_tags. */ |
| 87 |
public static String word_tags = "w";
|
|
| 88 |
public static String word_tags = null;
|
|
| 88 | 89 |
/** the element to create when a word is created */ |
| 89 |
public static String word_element_to_create = "w";
|
|
| 90 |
public static String word_element_to_create = null;
|
|
| 90 | 91 |
|
| 91 | 92 |
/** The intraword_tags. */ |
| 92 |
public static String intraword_tags = "c|ex|caesura";
|
|
| 93 |
public static String intraword_tags = null;
|
|
| 93 | 94 |
|
| 94 | 95 |
/** The punct_quotes. */ |
| 95 |
public static String punct_quotes = "'‘’’";
|
|
| 96 |
public static String punct_quotes = null;
|
|
| 96 | 97 |
|
| 97 | 98 |
/** The punct_strong1. */ |
| 98 |
public static String punct_strong1 = ".!?";
|
|
| 99 |
public static String punct_strong1 = null;
|
|
| 99 | 100 |
|
| 100 | 101 |
/** The punct_strong2. */ |
| 101 |
public static String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
|
|
| 102 |
public static String punct_strong2 = null;
|
|
| 102 | 103 |
|
| 103 | 104 |
/** The punct_paren_open1. */ |
| 104 |
public static String punct_paren_open1 = "«";
|
|
| 105 |
public static String punct_paren_open1 = null;
|
|
| 105 | 106 |
|
| 106 | 107 |
/** The punct_paren_open2. */ |
| 107 |
public static String punct_paren_open2 = "``|\\(|\\[|\\{";
|
|
| 108 |
public static String punct_paren_open2 = null;
|
|
| 108 | 109 |
|
| 109 | 110 |
/** The punct_paren_close1. */ |
| 110 |
public static String punct_paren_close1 = "»";
|
|
| 111 |
public static String punct_paren_close1 = null;
|
|
| 111 | 112 |
|
| 112 | 113 |
/** The punct_paren_close2. */ |
| 113 |
public static String punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
|
|
| 114 |
public static String punct_paren_close2 = null;
|
|
| 114 | 115 |
|
| 115 | 116 |
/** The punct_weak. */ |
| 116 |
public static String punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
|
|
| 117 |
public static String punct_weak = null;
|
|
| 117 | 118 |
|
| 118 |
public static String entity = "&[^;]+;";
|
|
| 119 |
public static String entity = null;
|
|
| 119 | 120 |
|
| 120 |
public static String seg_tags = "seg";
|
|
| 121 |
public static String seg_tags = null;
|
|
| 121 | 122 |
|
| 122 | 123 |
//// COMBINAISONS //// |
| 123 | 124 |
|
| 124 | 125 |
/** The corr_tags. */ |
| 125 |
public static String corr_tags = ""+corr_tags_no_seg+"|"+seg_tags+"";
|
|
| 126 |
public static String corr_tags = null;
|
|
| 126 | 127 |
/** The extraword_tags. */ |
| 127 |
public static String extraword_tags = ""+div_tags+"|"+q_tags+"|"+extraword1_tags+"";
|
|
| 128 |
public static String extraword_tags = null;
|
|
| 128 | 129 |
|
| 129 | 130 |
/** The punct_strong. */ |
| 130 |
public static String punct_strong = "["+punct_strong1+"]+|"+punct_strong2+"";
|
|
| 131 |
public static String punct_strong = null;
|
|
| 131 | 132 |
|
| 132 | 133 |
/** The punct_paren_open. */ |
| 133 |
public static String punct_paren_open = ""+punct_paren_open1+"|"+punct_paren_open2+"";
|
|
| 134 |
public static String punct_paren_open = null;
|
|
| 134 | 135 |
|
| 135 | 136 |
/** The punct_paren_close. */ |
| 136 |
public static String punct_paren_close = ""+punct_paren_close1+"|"+punct_paren_close2+"";
|
|
| 137 |
public static String punct_paren_close = null;
|
|
| 137 | 138 |
|
| 138 | 139 |
/** The punct_paren. */ |
| 139 |
public static String punct_paren = ""+punct_paren_open+"|"+punct_paren_close+"";
|
|
| 140 |
public static String punct_paren = null;
|
|
| 140 | 141 |
|
| 141 | 142 |
/** The punct_all. */ |
| 142 |
public static String punct_all = ""+punct_strong+"|"+punct_paren+"|["+punct_weak+"]+";
|
|
| 143 |
public static String punct_all = null;
|
|
| 143 | 144 |
|
| 144 | 145 |
/** The word_chars. */ |
| 145 |
public static String word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]+|"+entity+"";
|
|
| 146 |
public static String word_chars = null;
|
|
| 146 | 147 |
|
| 147 | 148 |
public static TTest[] tests = {
|
| 148 | 149 |
// new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/), |
| ... | ... | |
| 168 | 169 |
// new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
|
| 169 | 170 |
}; |
| 170 | 171 |
|
| 171 |
//tag_all = "<[^>]+>"; |
|
| 172 |
//enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO |
|
| 173 |
//encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR |
|
| 174 |
//div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl"; |
|
| 175 |
//q_tags = "q|quote|item|stage"; |
|
| 176 |
//extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail"; |
|
| 177 |
//corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno"; |
|
| 178 |
//corr_tags = "$corr_tags_no_seg|seg"; |
|
| 179 |
//word_tags = "w|abbr|num"; |
|
| 180 |
//intraword_tags = "c|ex"; |
|
| 172 |
public static String whitespaces = null; |
|
| 173 |
public static String regElision = null; |
|
| 174 |
public static String regPunct = null; |
|
| 175 |
|
|
| 176 |
static {
|
|
| 177 |
reset(); // Initialize static fields |
|
| 178 |
} |
|
| 181 | 179 |
|
| 182 |
public static String whitespaces = "[\\p{Z}\\p{C}]+";
|
|
| 183 |
public static String regElision = "['‘’]"; // obsolete |
|
| 184 |
public static String regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
|
|
| 185 |
|
|
| 186 | 180 |
//punct_strong1 = ".!?"; |
| 187 | 181 |
//punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|"; |
| 188 | 182 |
//punct_strong = "[$punct_strong1]|$punct_strong2"; |
| ... | ... | |
| 198 | 192 |
//word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;"; |
| 199 | 193 |
|
| 200 | 194 |
public static void reset() {
|
| 201 |
tag_all = "<[^>]+>"; |
|
| 202 |
enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO |
|
| 203 |
encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR |
|
| 204 |
div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl"; |
|
| 205 |
q_tags = "q|quote|item|stage|cit"; |
|
| 195 |
tag_all = "<[A-Za-z][^>]+>"; |
|
| 196 |
|
|
| 197 |
div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline"; |
|
| 198 |
q_tags = "q|quote|said|item|stage|cit|label|heraldry"; |
|
| 206 | 199 |
extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail"; |
| 207 | 200 |
corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus"; |
| 208 | 201 |
word_tags = "w"; |
| 202 |
word_element_to_create = "w"; |
|
| 209 | 203 |
intraword_tags = "c|ex|caesura"; |
| 210 | 204 |
punct_quotes = "'‘’’"; |
| 211 | 205 |
punct_strong1 = ".!?"; |
| ... | ... | |
| 214 | 208 |
punct_paren_open2 = "``|\\(|\\[|\\{";
|
| 215 | 209 |
punct_paren_close1 = "»"; |
| 216 | 210 |
punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}"; |
| 217 |
punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\-≤≥<>\\—ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
|
|
| 211 |
punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
|
|
| 218 | 212 |
entity = "&[^;]+;"; |
| 219 | 213 |
seg_tags = "seg"; |
| 214 |
|
|
| 215 |
enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO |
|
| 216 |
encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR |
|
| 217 |
|
|
| 218 |
/** The TT enclitics. */ |
|
| 219 |
FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t"; |
|
| 220 |
PClitic_fr = "[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]"; |
|
| 221 |
FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là"; |
|
| 222 |
PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]"; |
|
| 223 |
FClitic_gl = "-la|-las|-lo|-los|-nos"; |
|
| 220 | 224 |
|
| 221 | 225 |
whitespaces = "[\\p{Z}\\p{C}]+";
|
| 222 | 226 |
regElision = "['‘’]"; |
| ... | ... | |
| 233 | 237 |
punct_paren_close = ""+punct_paren_close1+"|"+punct_paren_close2+""; |
| 234 | 238 |
punct_paren = ""+punct_paren_open+"|"+punct_paren_close+""; |
| 235 | 239 |
punct_all = ""+punct_strong+"|"+punct_paren+"|["+punct_weak+"]"; |
| 236 |
word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]|"+entity+""; |
|
| 240 |
word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]+|"+entity+"";
|
|
| 237 | 241 |
} |
| 238 | 242 |
|
| 239 | 243 |
/** |
| ... | ... | |
| 243 | 247 |
* @param parent the parent |
| 244 | 248 |
* @return the java.lang. object |
| 245 | 249 |
*/ |
| 250 |
@Deprecated |
|
| 246 | 251 |
public static void toDom(Document doc, Node parent) {
|
| 247 | 252 |
Element tokenizer = doc.createElement("tokenizer");
|
| 248 | 253 |
tokenizer.setAttribute("onlyThoseTests", "false");
|
| 249 | 254 |
parent.appendChild(tokenizer); |
| 250 |
|
|
| 255 |
|
|
| 251 | 256 |
//String tag_all = "<[^>]+>"; |
| 252 | 257 |
Element p = doc.createElement("param");
|
| 253 | 258 |
p.setAttribute("key","tag_all");
|
| ... | ... | |
| 526 | 531 |
return true; |
| 527 | 532 |
} |
| 528 | 533 |
|
| 534 |
/** |
|
| 535 |
* Legacy way of loading the tokenizer parameters: used by TXM 0.7.9 and when restoring a 0.7.9 corpus into TXM 0.8.0. |
|
| 536 |
* @param tokenizerElement |
|
| 537 |
* @return |
|
| 538 |
*/ |
|
| 539 |
@Deprecated |
|
| 529 | 540 |
public static boolean loadFromNode(Element tokenizerElement) {
|
| 530 | 541 |
//load params |
| 531 | 542 |
String tmp_strong_punct = null; |
| ... | ... | |
| 631 | 642 |
*/ |
| 632 | 643 |
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
|
| 633 | 644 |
|
| 634 |
Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
|
|
| 635 |
Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
|
|
| 636 |
Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
|
|
| 637 |
Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
|
|
| 638 |
System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
|
|
| 645 |
// Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
|
|
| 646 |
// Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
|
|
| 647 |
// Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
|
|
| 648 |
// Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
|
|
| 649 |
// System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
|
|
| 639 | 650 |
} |
| 640 | 651 |
|
| 641 | 652 |
public static boolean loadFromProject(ProjectScope projectScope) {
|
| 642 | 653 |
//load params |
| 643 |
String tmp_strong_punct = null; |
|
| 644 | 654 |
IEclipsePreferences params = projectScope.getNode("Tokenizer");
|
| 655 |
|
|
| 645 | 656 |
try {
|
| 646 |
String[] keys = params.keys(); |
|
| 647 |
for (String key : keys) {
|
|
| 648 |
// Element param = (Element) params.item(i); |
|
| 649 |
// String key = param.getAttribute("name");
|
|
| 650 |
String value = params.get(key, ""); |
|
| 651 |
// if (value == null || value.length() == 0) value = param.getTextContent(); |
|
| 652 |
if (value.length() == 0) value = null; |
|
| 653 |
|
|
| 654 |
if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
|
|
| 655 |
|
|
| 656 |
if (key.equals("tag_all"))
|
|
| 657 |
tag_all = value; |
|
| 658 |
else if (key.equals("enclitics"))
|
|
| 659 |
enclitics = value; |
|
| 660 |
else if (key.equals("encliticsFR"))
|
|
| 661 |
encliticsFR = value; |
|
| 662 |
else if (key.equals("div_tags"))
|
|
| 663 |
div_tags = value; |
|
| 664 |
else if (key.equals("q_tags"))
|
|
| 665 |
q_tags = value; |
|
| 666 |
else if (key.equals("extraword1_tags"))
|
|
| 667 |
extraword1_tags = value; |
|
| 668 |
else if (key.equals("corr_tags_no_seg"))
|
|
| 669 |
corr_tags_no_seg = value; |
|
| 670 |
else if (key.equals("word_tags")) {
|
|
| 671 |
word_tags = value; |
|
| 672 |
word_element_to_create = value; // FIXME for now |
|
| 673 |
} else if (key.equals("intraword_tags"))
|
|
| 674 |
intraword_tags = value; |
|
| 675 |
else if (key.equals("punct_quotes"))
|
|
| 676 |
punct_quotes = value; |
|
| 677 |
else if (key.equals("punct_strong1"))
|
|
| 678 |
punct_strong1 = value; |
|
| 679 |
else if (key.equals("punct_strong2"))
|
|
| 680 |
punct_strong2 = value; |
|
| 681 |
else if (key.equals("punct_paren_open1"))
|
|
| 682 |
punct_paren_open1 = value; |
|
| 683 |
else if (key.equals("punct_paren_open2"))
|
|
| 684 |
punct_paren_open2 = value; |
|
| 685 |
else if (key.equals("punct_paren_close1"))
|
|
| 686 |
punct_paren_close1 = value; |
|
| 687 |
else if (key.equals("punct_paren_close2"))
|
|
| 688 |
punct_paren_close2 = value; |
|
| 689 |
else if (key.equals("punct_weak"))
|
|
| 690 |
punct_weak = value; |
|
| 691 |
else if (key.equals("entity"))
|
|
| 692 |
entity = value; |
|
| 693 |
else if (key.equals("seg_tags"))
|
|
| 694 |
seg_tags = value; |
|
| 695 |
else if (key.equals("regPunct"))
|
|
| 696 |
regPunct = value; |
|
| 697 |
else if (key.equals("regElision"))
|
|
| 698 |
regElision = value; |
|
| 699 |
else if (key.equals("whitespaces"))
|
|
| 700 |
whitespaces = value; |
|
| 701 |
else if (key.equals("punct_strong")) // this is temporary
|
|
| 702 |
tmp_strong_punct = value; // this is temporary |
|
| 703 |
// else |
|
| 704 |
// System.out.println("MISSING TOKENIZER KEY: "+key);
|
|
| 657 |
|
|
| 658 |
Field[] fields = TokenizerClasses.class.getFields(); |
|
| 659 |
for (Field field : fields) {
|
|
| 660 |
int m = field.getModifiers(); |
|
| 661 |
if (Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) {
|
|
| 662 |
String name = field.getName(); |
|
| 663 |
String value = params.get(name, null); |
|
| 664 |
if (value != null) {
|
|
| 665 |
try {
|
|
| 666 |
if (debug) System.out.println(" Tokenizer parametrized with "+name+"="+value);
|
|
| 667 |
field.set(field, value); |
|
| 668 |
} |
|
| 669 |
catch (IllegalArgumentException e) {
|
|
| 670 |
// TODO Auto-generated catch block |
|
| 671 |
e.printStackTrace(); |
|
| 672 |
} |
|
| 673 |
catch (IllegalAccessException e) {
|
|
| 674 |
// TODO Auto-generated catch block |
|
| 675 |
e.printStackTrace(); |
|
| 676 |
} |
|
| 677 |
} |
|
| 678 |
} |
|
| 705 | 679 |
} |
| 680 |
// // Old key-by-key implementation, now replaced by the reflection-based loop above |
|
| 681 |
|
|
| 682 |
// String[] keys = params.keys(); |
|
| 683 |
// for (String key : keys) {
|
|
| 684 |
// // Element param = (Element) params.item(i); |
|
| 685 |
// // String key = param.getAttribute("name");
|
|
| 686 |
// String value = params.get(key, ""); |
|
| 687 |
// // if (value == null || value.length() == 0) value = param.getTextContent(); |
|
| 688 |
// if (value.length() == 0) value = null; |
|
| 689 |
// |
|
| 690 |
// if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
|
|
| 691 |
// |
|
| 692 |
// if (key.equals("tag_all"))
|
|
| 693 |
// tag_all = value; |
|
| 694 |
// else if (key.equals("enclitics"))
|
|
| 695 |
// enclitics = value; |
|
| 696 |
// else if (key.equals("encliticsFR"))
|
|
| 697 |
// encliticsFR = value; |
|
| 698 |
// else if (key.equals("div_tags"))
|
|
| 699 |
// div_tags = value; |
|
| 700 |
// else if (key.equals("q_tags"))
|
|
| 701 |
// q_tags = value; |
|
| 702 |
// else if (key.equals("extraword1_tags"))
|
|
| 703 |
// extraword1_tags = value; |
|
| 704 |
// else if (key.equals("corr_tags_no_seg"))
|
|
| 705 |
// corr_tags_no_seg = value; |
|
| 706 |
// else if (key.equals("word_tags")) {
|
|
| 707 |
// word_tags = value; |
|
| 708 |
// word_element_to_create = value; // FIXME for now |
|
| 709 |
// } else if (key.equals("intraword_tags"))
|
|
| 710 |
// intraword_tags = value; |
|
| 711 |
// else if (key.equals("punct_quotes"))
|
|
| 712 |
// punct_quotes = value; |
|
| 713 |
// else if (key.equals("punct_strong1"))
|
|
| 714 |
// punct_strong1 = value; |
|
| 715 |
// else if (key.equals("punct_strong2"))
|
|
| 716 |
// punct_strong2 = value; |
|
| 717 |
// else if (key.equals("punct_paren_open1"))
|
|
| 718 |
// punct_paren_open1 = value; |
|
| 719 |
// else if (key.equals("punct_paren_open2"))
|
|
| 720 |
// punct_paren_open2 = value; |
|
| 721 |
// else if (key.equals("punct_paren_close1"))
|
|
| 722 |
// punct_paren_close1 = value; |
|
| 723 |
// else if (key.equals("punct_paren_close2"))
|
|
| 724 |
// punct_paren_close2 = value; |
|
| 725 |
// else if (key.equals("punct_weak"))
|
|
| 726 |
// punct_weak = value; |
|
| 727 |
// else if (key.equals("entity"))
|
|
| 728 |
// entity = value; |
|
| 729 |
// else if (key.equals("seg_tags"))
|
|
| 730 |
// seg_tags = value; |
|
| 731 |
// else if (key.equals("regPunct"))
|
|
| 732 |
// regPunct = value; |
|
| 733 |
// else if (key.equals("regElision"))
|
|
| 734 |
// regElision = value; |
|
| 735 |
// else if (key.equals("whitespaces"))
|
|
| 736 |
// whitespaces = value; |
|
| 737 |
// else if (key.equals("punct_strong")) // this is temporary
|
|
| 738 |
// tmp_strong_punct = value; // this is temporary |
|
| 739 |
//// else |
|
| 740 |
//// System.out.println("MISSING TOKENIZER KEY: "+key);
|
|
| 741 |
// } |
|
| 706 | 742 |
|
| 707 | 743 |
//recombine |
| 708 | 744 |
recombine(); |
| 709 |
if (tmp_strong_punct != null) punct_strong = tmp_strong_punct; // this is temporary |
|
| 710 | 745 |
|
| 711 | 746 |
if (params.getBoolean("onlyThoseTests", false)) {
|
| 712 | 747 |
System.out.println("Warning: tokenizer only using import parameters tests");
|
| 713 | 748 |
//tests = new ArrayList<>(); |
| 714 | 749 |
} |
| 715 | 750 |
|
| 716 |
org.osgi.service.prefs.Preferences testsList = params.node("test");
|
|
| 751 |
org.osgi.service.prefs.Preferences testsList = params.node("tests");
|
|
| 717 | 752 |
String[] tests; |
| 718 | 753 |
tests = testsList.childrenNames(); |
| 719 | 754 |
//System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
|
| ... | ... | |
| 733 | 768 |
e.printStackTrace(); |
| 734 | 769 |
} |
| 735 | 770 |
|
| 736 |
if (isValid()) |
|
| 771 |
if (isValid()) {
|
|
| 737 | 772 |
return true; |
| 738 |
else {
|
|
| 773 |
} else {
|
|
| 739 | 774 |
reset(); |
| 740 | 775 |
recombine(); |
| 741 | 776 |
return false; |
Formats disponibles : Unified diff