| 66 |
66 |
public String lemmaProperty = null;
|
| 67 |
67 |
@Option(name="lexique", usage="Lexicon file", widget="File", required=true, def="lexicon.txt")
|
| 68 |
68 |
public File lexique = null;
|
|
69 |
@Option(name="openclassfile", usage="openclassfile file", widget="File", required=true, def="")
|
|
70 |
public File openclassfile = null;
|
| 69 |
71 |
@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="")
|
| 70 |
72 |
public String options = null;
|
| 71 |
73 |
|
| ... | ... | |
| 85 |
87 |
corpus = (CQPCorpus)first;
|
| 86 |
88 |
if (ParametersDialog.open(this)) {
|
| 87 |
89 |
|
| 88 |
|
train(corpus, model, lexique, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split(" "));
|
|
90 |
train(corpus, model, lexique, openclassfile, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split(" "));
|
| 89 |
91 |
|
| 90 |
92 |
return corpus;
|
| 91 |
93 |
}
|
| ... | ... | |
| 96 |
98 |
return null;
|
| 97 |
99 |
}
|
| 98 |
100 |
|
| 99 |
|
public static void train(final CQPCorpus corpus, final File model, final File lexique, final String[] properties, final String sentenceTag, final String[] options) {
|
|
101 |
public static void train(final CQPCorpus corpus, final File model, final File lexique, final File openclassfile, final String[] properties, final String sentenceTag, final String[] options) {
|
| 100 |
102 |
|
| 101 |
103 |
JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") {
|
| 102 |
104 |
@Override
|
| ... | ... | |
| 106 |
108 |
File lexique2 = lexique;
|
| 107 |
109 |
MainCorpus mainCorpus = corpus.getMainCorpus();
|
| 108 |
110 |
File corpusBinaryDirectory = mainCorpus.getProjectDirectory();
|
| 109 |
|
|
|
111 |
|
| 110 |
112 |
System.out.println("TRAIN : "+corpus+" with "+lexique2+" to create "+model+ " with properties "+Arrays.toString(properties));
|
| 111 |
113 |
|
| 112 |
114 |
if (properties == null || properties.length != 2) {
|
| ... | ... | |
| 121 |
123 |
return Status.CANCEL_STATUS;
|
| 122 |
124 |
}
|
| 123 |
125 |
}
|
| 124 |
|
|
|
126 |
|
| 125 |
127 |
Property pos = corpus.getProperty(properties[0]);
|
| 126 |
128 |
Property lemma = corpus.getProperty(properties[1]);
|
| 127 |
|
|
|
129 |
|
| 128 |
130 |
// Prepare temporary directory
|
| 129 |
131 |
File treetaggerSrcDirectory = new File(mainCorpus.getProjectDirectory(), "treetagger");
|
| 130 |
132 |
DeleteDir.deleteDirectory(treetaggerSrcDirectory);
|
| 131 |
133 |
treetaggerSrcDirectory.mkdirs();
|
| 132 |
|
|
|
134 |
|
| 133 |
135 |
HashMap<String, HashSet<String>> simplified_lexicon = null;
|
| 134 |
136 |
HashMap<String, HashSet<String>> simplified_lexicon_errors = null;
|
| 135 |
137 |
int error_counter = 0;
|
| ... | ... | |
| 160 |
162 |
if (!lex.containsKey(form)) {
|
| 161 |
163 |
ArrayList<String> pairs = new ArrayList<String>();
|
| 162 |
164 |
HashSet<String> posValues = new HashSet<String>();
|
| 163 |
|
|
|
165 |
|
| 164 |
166 |
allPosValues.put(form, posValues);
|
| 165 |
167 |
lex.put(form, pairs);
|
| 166 |
168 |
}
|
| ... | ... | |
| 169 |
171 |
String posValue = values.get(1).get(0);
|
| 170 |
172 |
String lemmaValue = values.get(2).get(0);
|
| 171 |
173 |
if (posValues.contains(posValue)) {
|
| 172 |
|
|
|
174 |
|
| 173 |
175 |
} else {
|
| 174 |
176 |
posValues.add(posValue);
|
| 175 |
177 |
pairs.add(posValue);
|
| ... | ... | |
| 185 |
187 |
for (String v : lex.get(form)) {
|
| 186 |
188 |
if (tab) writer.write("\t"+v);
|
| 187 |
189 |
else writer.write(" "+v);
|
| 188 |
|
|
|
190 |
|
| 189 |
191 |
tab = !tab;
|
| 190 |
192 |
}
|
| 191 |
193 |
writer.write("\n");
|
| ... | ... | |
| 209 |
211 |
}
|
| 210 |
212 |
reader.close();
|
| 211 |
213 |
}
|
| 212 |
|
|
| 213 |
|
|
|
214 |
|
|
215 |
|
| 214 |
216 |
// create TT SRC file from CWB indexes
|
| 215 |
|
|
|
217 |
|
| 216 |
218 |
File ttSrcFile = new File(treetaggerSrcDirectory, mainCorpus.getID()+".tt");
|
| 217 |
219 |
System.out.println("TT SRC file: "+ttSrcFile.getAbsolutePath());
|
| 218 |
220 |
BufferedOutputStream fos = new BufferedOutputStream(new FileOutputStream(ttSrcFile));
|
| ... | ... | |
| 223 |
225 |
for (Match m : corpus.getMatches()) {
|
| 224 |
226 |
for (int i = m.getStart() ; i <= m.getEnd() ; i++) { // end match must be included
|
| 225 |
227 |
positions.add(i);
|
| 226 |
|
|
|
228 |
|
| 227 |
229 |
if (positions.size() >= 1000) { // avoid too big array
|
| 228 |
230 |
int[] positions_array = new int[positions.size()];
|
| 229 |
231 |
int ip = 0;
|
| ... | ... | |
| 236 |
238 |
if (w != null) {
|
| 237 |
239 |
String s = w+"\t"+values[iW];
|
| 238 |
240 |
ps.println(s);
|
| 239 |
|
|
|
241 |
|
| 240 |
242 |
if (simplified_lexicon != null) { // check given lexicon
|
| 241 |
243 |
if (simplified_lexicon.containsKey(w)) {
|
| 242 |
244 |
if (!simplified_lexicon.get(w).contains(values[iW])) {
|
| ... | ... | |
| 277 |
279 |
positions.clear();
|
| 278 |
280 |
}
|
| 279 |
281 |
ps.close();
|
| 280 |
|
|
|
282 |
|
| 281 |
283 |
if (simplified_lexicon_errors != null && simplified_lexicon_errors.size() > 0) {
|
| 282 |
284 |
File error_file = new File(treetaggerSrcDirectory, "errors.txt");
|
| 283 |
285 |
PrintWriter errorwriter = IOUtils.getWriter(error_file);
|
| ... | ... | |
| 300 |
302 |
String line = reader.readLine();
|
| 301 |
303 |
while (line != null) {
|
| 302 |
304 |
String w = line.split("\t", 2)[0];
|
| 303 |
|
|
|
305 |
|
| 304 |
306 |
if (simplified_lexicon_errors.containsKey(w)) {
|
| 305 |
307 |
for (String p : simplified_lexicon_errors.get(w)) {
|
| 306 |
308 |
if (!p.startsWith("#"))
|
| ... | ... | |
| 308 |
310 |
}
|
| 309 |
311 |
simplified_lexicon_errors.remove(w);
|
| 310 |
312 |
}
|
| 311 |
|
|
|
313 |
|
| 312 |
314 |
writer.println(line);
|
| 313 |
315 |
line = reader.readLine();
|
| 314 |
316 |
}
|
| 315 |
|
|
|
317 |
|
| 316 |
318 |
// write missing words
|
| 317 |
319 |
for (String w2 : simplified_lexicon_errors.keySet()) {
|
| 318 |
320 |
writer.print(w2);
|
| ... | ... | |
| 321 |
323 |
}
|
| 322 |
324 |
writer.println("");
|
| 323 |
325 |
}
|
| 324 |
|
|
|
326 |
|
| 325 |
327 |
reader.close();
|
| 326 |
328 |
writer.close();
|
| 327 |
329 |
System.out.println("Adding words to a temporary lexicon: "+lexique3);
|
| 328 |
330 |
lexique2 = lexique3;
|
| 329 |
331 |
}
|
| 330 |
|
|
|
332 |
|
| 331 |
333 |
// Create open class file : contains all pos values
|
| 332 |
|
File openclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
|
| 333 |
|
PrintWriter openClassFileWriter = IOUtils.getWriter(openclassfile);
|
| 334 |
|
|
| 335 |
|
// Lexicon poslexicon = corpus.getLexicon(pos);
|
| 336 |
|
// String[] posValues = poslexicon.getForms();
|
| 337 |
|
// for (int iV = 0 ; iV < posValues.length ; iV++) {
|
| 338 |
|
// if (iV == 0) openClassFileWriter.print(posValues[iV]);
|
| 339 |
|
// else openClassFileWriter.print(" "+posValues[iV]);
|
| 340 |
|
// }
|
| 341 |
|
openClassFileWriter.close();
|
|
334 |
File tmpopenclassfile = openclassfile;
|
|
335 |
if (tmpopenclassfile == null || tmpopenclassfile.getName().length() ==0) {
|
|
336 |
tmpopenclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
|
|
337 |
// In this branch the openclassfile parameter is null or has an empty name, so the
// writer must target the fallback file just assigned to tmpopenclassfile; that is the
// file later checked with exists() and handed to traintreetagger.
PrintWriter openClassFileWriter = IOUtils.getWriter(tmpopenclassfile);
|
| 342 |
338 |
|
|
339 |
// Lexicon poslexicon = corpus.getLexicon(pos);
|
|
340 |
// String[] posValues = poslexicon.getForms();
|
|
341 |
// for (int iV = 0 ; iV < posValues.length ; iV++) {
|
|
342 |
// if (iV == 0) openClassFileWriter.print(posValues[iV]);
|
|
343 |
// else openClassFileWriter.print(" "+posValues[iV]);
|
|
344 |
// }
|
|
345 |
openClassFileWriter.close();
|
|
346 |
}
|
|
347 |
|
| 343 |
348 |
// Call treetagger-train
|
| 344 |
|
if (ttSrcFile.exists() && lexique2.exists() && openclassfile.exists()) {
|
|
349 |
if (ttSrcFile.exists() && lexique2.exists() && tmpopenclassfile.exists()) {
|
| 345 |
350 |
System.out.println("Running treetagger-train...");
|
| 346 |
351 |
String treetaggerBinDirectory = new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH), "bin").getAbsolutePath();
|
| 347 |
352 |
if (!treetaggerBinDirectory.endsWith("/")) treetaggerBinDirectory += "/";
|
| 348 |
353 |
|
| 349 |
354 |
TreeTagger tt = new TreeTagger(treetaggerBinDirectory, options);
|
| 350 |
355 |
tt.settoken();
|
| 351 |
|
|
|
356 |
|
| 352 |
357 |
//tt.setlemma();
|
| 353 |
358 |
tt.setsgml();
|
| 354 |
359 |
tt.setst(sentenceTag);
|
| ... | ... | |
| 358 |
363 |
if (cl > 0) {
|
| 359 |
364 |
tt.setcl(cl);
|
| 360 |
365 |
}
|
| 361 |
|
|
|
366 |
|
| 362 |
367 |
float dtg = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_DTG);
|
| 363 |
368 |
if (dtg > 0) {
|
| 364 |
369 |
tt.setdtg(dtg);
|
| 365 |
370 |
}
|
| 366 |
|
|
|
371 |
|
| 367 |
372 |
float sw = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_SW);
|
| 368 |
373 |
if (sw > 0) {
|
| 369 |
374 |
tt.setsw(sw);
|
| 370 |
375 |
}
|
| 371 |
|
|
|
376 |
|
| 372 |
377 |
float atg = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_ATG);
|
| 373 |
378 |
if (atg > 0) {
|
| 374 |
379 |
tt.setatg(atg);
|
| 375 |
380 |
}
|
| 376 |
|
|
|
381 |
|
| 377 |
382 |
float ecw = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_ECW);
|
| 378 |
383 |
if (ecw > 0) {
|
| 379 |
384 |
tt.setecw(ecw);
|
| 380 |
385 |
}
|
| 381 |
|
|
|
386 |
|
| 382 |
387 |
float lt = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_LT);
|
| 383 |
388 |
if (lt > 0) {
|
| 384 |
389 |
tt.setlt(lt);
|
| 385 |
390 |
}
|
| 386 |
|
|
|
391 |
|
| 387 |
392 |
if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_DEBUG)) {
|
| 388 |
393 |
tt.debug(true);
|
| 389 |
394 |
} else {
|
| 390 |
395 |
tt.setquiet();
|
| 391 |
396 |
}
|
| 392 |
|
|
| 393 |
|
tt.traintreetagger(lexique2.getAbsolutePath(), openclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
|
| 394 |
|
|
|
397 |
|
|
398 |
tt.traintreetagger(lexique2.getAbsolutePath(), tmpopenclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
|
|
399 |
|
| 395 |
400 |
System.out.println("Done: "+model.getAbsolutePath());
|
| 396 |
401 |
} else {
|
| 397 |
402 |
System.out.println("Aborting.");
|
| 398 |
403 |
}
|
| 399 |
|
|
|
404 |
|
| 400 |
405 |
return Status.OK_STATUS;
|
| 401 |
406 |
} catch (Exception e) {
|
| 402 |
407 |
System.out.println("Error while training TT: "+e);
|