66 |
66 |
public String lemmaProperty = null;
|
67 |
67 |
@Option(name="lexique", usage="Lexicon file", widget="File", required=true, def="lexicon.txt")
|
68 |
68 |
public File lexique = null;
|
|
69 |
@Option(name="openclassfile", usage="openclassfile file", widget="File", required=true, def="")
|
|
70 |
public File openclassfile = null;
|
69 |
71 |
@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="")
|
70 |
72 |
public String options = null;
|
71 |
73 |
|
... | ... | |
85 |
87 |
corpus = (CQPCorpus)first;
|
86 |
88 |
if (ParametersDialog.open(this)) {
|
87 |
89 |
|
88 |
|
train(corpus, model, lexique, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split(" "));
|
|
90 |
train(corpus, model, lexique, openclassfile, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split(" "));
|
89 |
91 |
|
90 |
92 |
return corpus;
|
91 |
93 |
}
|
... | ... | |
96 |
98 |
return null;
|
97 |
99 |
}
|
98 |
100 |
|
99 |
|
public static void train(final CQPCorpus corpus, final File model, final File lexique, final String[] properties, final String sentenceTag, final String[] options) {
|
|
101 |
public static void train(final CQPCorpus corpus, final File model, final File lexique, final File openclassfile, final String[] properties, final String sentenceTag, final String[] options) {
|
100 |
102 |
|
101 |
103 |
JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") {
|
102 |
104 |
@Override
|
... | ... | |
106 |
108 |
File lexique2 = lexique;
|
107 |
109 |
MainCorpus mainCorpus = corpus.getMainCorpus();
|
108 |
110 |
File corpusBinaryDirectory = mainCorpus.getProjectDirectory();
|
109 |
|
|
|
111 |
|
110 |
112 |
System.out.println("TRAIN : "+corpus+" with "+lexique2+" to create "+model+ " with properties "+Arrays.toString(properties));
|
111 |
113 |
|
112 |
114 |
if (properties == null || properties.length != 2) {
|
... | ... | |
121 |
123 |
return Status.CANCEL_STATUS;
|
122 |
124 |
}
|
123 |
125 |
}
|
124 |
|
|
|
126 |
|
125 |
127 |
Property pos = corpus.getProperty(properties[0]);
|
126 |
128 |
Property lemma = corpus.getProperty(properties[1]);
|
127 |
|
|
|
129 |
|
128 |
130 |
// Prepare temporary directory
|
129 |
131 |
File treetaggerSrcDirectory = new File(mainCorpus.getProjectDirectory(), "treetagger");
|
130 |
132 |
DeleteDir.deleteDirectory(treetaggerSrcDirectory);
|
131 |
133 |
treetaggerSrcDirectory.mkdirs();
|
132 |
|
|
|
134 |
|
133 |
135 |
HashMap<String, HashSet<String>> simplified_lexicon = null;
|
134 |
136 |
HashMap<String, HashSet<String>> simplified_lexicon_errors = null;
|
135 |
137 |
int error_counter = 0;
|
... | ... | |
160 |
162 |
if (!lex.containsKey(form)) {
|
161 |
163 |
ArrayList<String> pairs = new ArrayList<String>();
|
162 |
164 |
HashSet<String> posValues = new HashSet<String>();
|
163 |
|
|
|
165 |
|
164 |
166 |
allPosValues.put(form, posValues);
|
165 |
167 |
lex.put(form, pairs);
|
166 |
168 |
}
|
... | ... | |
169 |
171 |
String posValue = values.get(1).get(0);
|
170 |
172 |
String lemmaValue = values.get(2).get(0);
|
171 |
173 |
if (posValues.contains(posValue)) {
|
172 |
|
|
|
174 |
|
173 |
175 |
} else {
|
174 |
176 |
posValues.add(posValue);
|
175 |
177 |
pairs.add(posValue);
|
... | ... | |
185 |
187 |
for (String v : lex.get(form)) {
|
186 |
188 |
if (tab) writer.write("\t"+v);
|
187 |
189 |
else writer.write(" "+v);
|
188 |
|
|
|
190 |
|
189 |
191 |
tab = !tab;
|
190 |
192 |
}
|
191 |
193 |
writer.write("\n");
|
... | ... | |
209 |
211 |
}
|
210 |
212 |
reader.close();
|
211 |
213 |
}
|
212 |
|
|
213 |
|
|
|
214 |
|
|
215 |
|
214 |
216 |
// create TT SRC file from CWB indexes
|
215 |
|
|
|
217 |
|
216 |
218 |
File ttSrcFile = new File(treetaggerSrcDirectory, mainCorpus.getID()+".tt");
|
217 |
219 |
System.out.println("TT SRC file: "+ttSrcFile.getAbsolutePath());
|
218 |
220 |
BufferedOutputStream fos = new BufferedOutputStream(new FileOutputStream(ttSrcFile));
|
... | ... | |
223 |
225 |
for (Match m : corpus.getMatches()) {
|
224 |
226 |
for (int i = m.getStart() ; i <= m.getEnd() ; i++) { // end match must be included
|
225 |
227 |
positions.add(i);
|
226 |
|
|
|
228 |
|
227 |
229 |
if (positions.size() >= 1000) { // avoid too big array
|
228 |
230 |
int[] positions_array = new int[positions.size()];
|
229 |
231 |
int ip = 0;
|
... | ... | |
236 |
238 |
if (w != null) {
|
237 |
239 |
String s = w+"\t"+values[iW];
|
238 |
240 |
ps.println(s);
|
239 |
|
|
|
241 |
|
240 |
242 |
if (simplified_lexicon != null) { // check given lexicon
|
241 |
243 |
if (simplified_lexicon.containsKey(w)) {
|
242 |
244 |
if (!simplified_lexicon.get(w).contains(values[iW])) {
|
... | ... | |
277 |
279 |
positions.clear();
|
278 |
280 |
}
|
279 |
281 |
ps.close();
|
280 |
|
|
|
282 |
|
281 |
283 |
if (simplified_lexicon_errors != null && simplified_lexicon_errors.size() > 0) {
|
282 |
284 |
File error_file = new File(treetaggerSrcDirectory, "errors.txt");
|
283 |
285 |
PrintWriter errorwriter = IOUtils.getWriter(error_file);
|
... | ... | |
300 |
302 |
String line = reader.readLine();
|
301 |
303 |
while (line != null) {
|
302 |
304 |
String w = line.split("\t", 2)[0];
|
303 |
|
|
|
305 |
|
304 |
306 |
if (simplified_lexicon_errors.containsKey(w)) {
|
305 |
307 |
for (String p : simplified_lexicon_errors.get(w)) {
|
306 |
308 |
if (!p.startsWith("#"))
|
... | ... | |
308 |
310 |
}
|
309 |
311 |
simplified_lexicon_errors.remove(w);
|
310 |
312 |
}
|
311 |
|
|
|
313 |
|
312 |
314 |
writer.println(line);
|
313 |
315 |
line = reader.readLine();
|
314 |
316 |
}
|
315 |
|
|
|
317 |
|
316 |
318 |
// write missing words
|
317 |
319 |
for (String w2 : simplified_lexicon_errors.keySet()) {
|
318 |
320 |
writer.print(w2);
|
... | ... | |
321 |
323 |
}
|
322 |
324 |
writer.println("");
|
323 |
325 |
}
|
324 |
|
|
|
326 |
|
325 |
327 |
reader.close();
|
326 |
328 |
writer.close();
|
327 |
329 |
System.out.println("Adding words to a temporary lexicon: "+lexique3);
|
328 |
330 |
lexique2 = lexique3;
|
329 |
331 |
}
|
330 |
|
|
|
332 |
|
331 |
333 |
// Create open class file : contains all pos values
|
332 |
|
File openclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
|
333 |
|
PrintWriter openClassFileWriter = IOUtils.getWriter(openclassfile);
|
334 |
|
|
335 |
|
// Lexicon poslexicon = corpus.getLexicon(pos);
|
336 |
|
// String[] posValues = poslexicon.getForms();
|
337 |
|
// for (int iV = 0 ; iV < posValues.length ; iV++) {
|
338 |
|
// if (iV == 0) openClassFileWriter.print(posValues[iV]);
|
339 |
|
// else openClassFileWriter.print(" "+posValues[iV]);
|
340 |
|
// }
|
341 |
|
openClassFileWriter.close();
|
|
334 |
// Resolve the open-class file handed to treetagger-train: use the file supplied
// by the caller when there is one, otherwise fall back to an "openclasses.txt"
// generated in the temporary TreeTagger directory.
File tmpopenclassfile = openclassfile;
if (tmpopenclassfile == null || tmpopenclassfile.getName().length() == 0) {
	tmpopenclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
	// Open the writer on the fallback file, not on the parameter: in this branch
	// the parameter is null or has an empty name, so it cannot be written to.
	PrintWriter openClassFileWriter = IOUtils.getWriter(tmpopenclassfile);

	// NOTE(review): the code that listed the corpus pos values is disabled below,
	// so nothing is written to the fallback file here before it is closed.
	// Lexicon poslexicon = corpus.getLexicon(pos);
	// String[] posValues = poslexicon.getForms();
	// for (int iV = 0 ; iV < posValues.length ; iV++) {
	// if (iV == 0) openClassFileWriter.print(posValues[iV]);
	// else openClassFileWriter.print(" "+posValues[iV]);
	// }
	openClassFileWriter.close();
}
|
|
347 |
|
343 |
348 |
// Call treetagger-train
|
344 |
|
if (ttSrcFile.exists() && lexique2.exists() && openclassfile.exists()) {
|
|
349 |
if (ttSrcFile.exists() && lexique2.exists() && tmpopenclassfile.exists()) {
|
345 |
350 |
System.out.println("Running treetagger-train...");
|
346 |
351 |
String treetaggerBinDirectory = new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH), "bin").getAbsolutePath();
|
347 |
352 |
if (!treetaggerBinDirectory.endsWith("/")) treetaggerBinDirectory += "/";
|
348 |
353 |
|
349 |
354 |
TreeTagger tt = new TreeTagger(treetaggerBinDirectory, options);
|
350 |
355 |
tt.settoken();
|
351 |
|
|
|
356 |
|
352 |
357 |
//tt.setlemma();
|
353 |
358 |
tt.setsgml();
|
354 |
359 |
tt.setst(sentenceTag);
|
... | ... | |
358 |
363 |
if (cl > 0) {
|
359 |
364 |
tt.setcl(cl);
|
360 |
365 |
}
|
361 |
|
|
|
366 |
|
362 |
367 |
float dtg = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_DTG);
|
363 |
368 |
if (dtg > 0) {
|
364 |
369 |
tt.setdtg(dtg);
|
365 |
370 |
}
|
366 |
|
|
|
371 |
|
367 |
372 |
float sw = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_SW);
|
368 |
373 |
if (sw > 0) {
|
369 |
374 |
tt.setsw(sw);
|
370 |
375 |
}
|
371 |
|
|
|
376 |
|
372 |
377 |
float atg = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_ATG);
|
373 |
378 |
if (atg > 0) {
|
374 |
379 |
tt.setatg(atg);
|
375 |
380 |
}
|
376 |
|
|
|
381 |
|
377 |
382 |
float ecw = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_ECW);
|
378 |
383 |
if (ecw > 0) {
|
379 |
384 |
tt.setecw(ecw);
|
380 |
385 |
}
|
381 |
|
|
|
386 |
|
382 |
387 |
float lt = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_LT);
|
383 |
388 |
if (lt > 0) {
|
384 |
389 |
tt.setlt(lt);
|
385 |
390 |
}
|
386 |
|
|
|
391 |
|
387 |
392 |
if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_DEBUG)) {
|
388 |
393 |
tt.debug(true);
|
389 |
394 |
} else {
|
390 |
395 |
tt.setquiet();
|
391 |
396 |
}
|
392 |
|
|
393 |
|
tt.traintreetagger(lexique2.getAbsolutePath(), openclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
|
394 |
|
|
|
397 |
|
|
398 |
tt.traintreetagger(lexique2.getAbsolutePath(), tmpopenclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
|
|
399 |
|
395 |
400 |
System.out.println("Done: "+model.getAbsolutePath());
|
396 |
401 |
} else {
|
397 |
402 |
System.out.println("Aborting.");
|
398 |
403 |
}
|
399 |
|
|
|
404 |
|
400 |
405 |
return Status.OK_STATUS;
|
401 |
406 |
} catch (Exception e) {
|
402 |
407 |
System.out.println("Error while training TT: "+e);
|