root / tmp / org.txm.treetagger.core / src / org / txm / importer / xmltxm / Annotate.groovy @ 1681
History | View | Annotate | Download (18.3 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate: 2016-10-03 15:30:36 +0200 (lun. 03 oct. 2016) $
|
25 |
// $LastChangedRevision: 3313 $
|
26 |
// $LastChangedBy: mdecorde $
|
27 |
//
|
28 |
package org.txm.importer.xmltxm
|
29 |
|
30 |
import java.io.File |
31 |
import java.text.DateFormat |
32 |
import java.util.Date |
33 |
import java.util.concurrent.* |
34 |
|
35 |
import org.txm.Toolbox |
36 |
import org.txm.importer.* |
37 |
import org.txm.scripts.importer.* |
38 |
import org.txm.importer.scripts.xmltxm.AnnotationInjection |
39 |
import org.txm.importer.cwb.* |
40 |
import org.txm.objects.* |
41 |
import org.txm.treetagger.core.preferences.TreeTaggerPreferences |
42 |
import org.txm.utils.ConsoleProgressBar |
43 |
import org.txm.utils.LangDetector; |
44 |
import org.txm.utils.io.IOUtils |
45 |
import org.txm.utils.logger.Log; |
46 |
import org.txm.utils.treetagger.TreeTagger |
47 |
|
48 |
/**
 * Annotates the TEI-TXM files of a "txm" directory with TreeTagger and replaces them in place.
 * Creates the work directories $binDir/annotations and $binDir/treetagger
 * (plus $binDir/ptreetagger when the files are processed one by one).
 */
|
53 |
class Annotate { |
54 |
boolean cancelNow = false; |
55 |
|
56 |
/** The report file. */
|
57 |
File reportFile;//contains the txm:application tag content |
58 |
|
59 |
/** The resp person. */
|
60 |
String respPerson;
|
61 |
|
62 |
/** The resp id. */
|
63 |
String respId;
|
64 |
|
65 |
/** The resp desc. */
|
66 |
String respDesc;
|
67 |
|
68 |
/** The resp date. */
|
69 |
String respDate;
|
70 |
|
71 |
/** The resp when. */
|
72 |
String respWhen;
|
73 |
|
74 |
/** The app ident. */
|
75 |
String appIdent;
|
76 |
|
77 |
/** The app version. */
|
78 |
String appVersion;
|
79 |
|
80 |
/** The distributor. */
|
81 |
String distributor;
|
82 |
|
83 |
/** The publi stmt. */
|
84 |
String publiStmt;
|
85 |
|
86 |
/** The source stmt. */
|
87 |
String sourceStmt;
|
88 |
|
89 |
/** The types. */
|
90 |
def types;
|
91 |
|
92 |
/** The types title. */
|
93 |
def typesTITLE;
|
94 |
|
95 |
/** The types desc. */
|
96 |
def typesDesc;
|
97 |
|
98 |
/** The types tagset. */
|
99 |
def typesTAGSET;
|
100 |
|
101 |
/** The types web. */
|
102 |
def typesWEB;
|
103 |
|
104 |
/** The idform. */
|
105 |
String idform;
|
106 |
|
107 |
/** The debug. */
|
108 |
boolean debug = false; |
109 |
|
110 |
File modelsDirectory;
|
111 |
|
112 |
public Annotate() {
|
113 |
modelsDirectory = new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
114 |
} |
115 |
|
116 |
/**
|
117 |
* Sets the debug.
|
118 |
*/
|
119 |
public void setDebug() { |
120 |
debug = true;
|
121 |
} |
122 |
|
123 |
String id;
|
124 |
/**
|
125 |
* Inits the tt outfile infos.
|
126 |
*
|
127 |
* @param rootDirFile the root dir file
|
128 |
* @param modelfile the modelfile
|
129 |
*/
|
130 |
public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename) |
131 |
{ |
132 |
initTTOutfileInfos(rootDirFile, modelfile, modelfilename, null);
|
133 |
} |
134 |
|
135 |
/**
|
136 |
* Inits the tt outfile infos.
|
137 |
*
|
138 |
* @param rootDirFile the root dir file
|
139 |
* @param modelfile the modelfile
|
140 |
* @param properties : 2 element array that contains the word properties to create. It can be null (the modelfilename will be used)
|
141 |
*/
|
142 |
public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename, String[] properties) |
143 |
{ |
144 |
id = modelfilename; |
145 |
String[] split = id.split("\\."); |
146 |
if (split.length > 0) id = split[0]; |
147 |
if (id.equals("??")) id = "xx" |
148 |
|
149 |
reportFile = new File(rootDirFile,"NLPToolsParameters.xml"); |
150 |
|
151 |
respPerson = System.getProperty("user.name"); |
152 |
respId = "txm";
|
153 |
respDesc = "NLP annotation tool";
|
154 |
respDate = DateFormat.getDateInstance(DateFormat.SHORT, Locale.UK).format(new Date()); |
155 |
respWhen = DateFormat.getDateInstance(DateFormat.FULL, Locale.UK).format(new Date()); |
156 |
|
157 |
appIdent = "TreeTagger";
|
158 |
appVersion = "3.2";
|
159 |
|
160 |
distributor = "";
|
161 |
publiStmt = """""";
|
162 |
sourceStmt = """""";
|
163 |
|
164 |
if (properties != null && properties.length == 2) { |
165 |
types = [properties[0],properties[1]]; |
166 |
typesTITLE = [properties[0],properties[1]]; |
167 |
} else {
|
168 |
types = [id+"pos",id+"lemma"]; |
169 |
typesTITLE = [id+"pos",id+"lemma"]; |
170 |
} |
171 |
|
172 |
//TODO: the tagset, website and description should be referenced in the model catalog
|
173 |
if(modelfile.getName() == "rgaqcj.par") { |
174 |
typesDesc = ["CATTEX pos tagset built with BFM texts","fr lemma of the model "+modelfile+" - "] |
175 |
typesTAGSET = ["http://bfm.ens-lyon.fr/IMG/pdf/Cattex2009_Manuel.pdf",""] |
176 |
typesWEB = ["http://bfm.ens-lyon.fr/",""] |
177 |
} else {
|
178 |
typesDesc = ["pos tagset built from model "+modelfile,id+" lemma of the model "+modelfile+" - "] |
179 |
typesTAGSET = ["",""] |
180 |
typesWEB = ["",""] |
181 |
} |
182 |
|
183 |
idform ="w";
|
184 |
} |
185 |
|
186 |
/**
|
187 |
* Apply tt.
|
188 |
*
|
189 |
* @param ttsrcfile the ttsrcfile
|
190 |
* @param ttoutfile the ttoutfile
|
191 |
* @param modelfile the modelfile
|
192 |
* @return true, if successful
|
193 |
*/
|
194 |
public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile) { |
195 |
applyTT(ttsrcfile, ttoutfile, modelfile, null)
|
196 |
} |
197 |
|
198 |
/**
|
199 |
* Apply tt.
|
200 |
*
|
201 |
* @param ttsrcfile the ttsrcfile
|
202 |
* @param ttoutfile the ttoutfile
|
203 |
* @param modelfile the modelfile
|
204 |
* @param options, if null use value set in Toolbox preferences
|
205 |
* @return true, if successful
|
206 |
*/
|
207 |
public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile, String[] options) |
208 |
{ |
209 |
try {
|
210 |
File infile = ttsrcfile;
|
211 |
File outfile = ttoutfile;
|
212 |
|
213 |
def tt = new TreeTagger(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)+"/bin/", options); |
214 |
tt.settoken(); |
215 |
tt.setlemma(); |
216 |
tt.setsgml(); |
217 |
if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_UNKNOWN)) {
|
218 |
tt.setnounknown(); |
219 |
} |
220 |
tt.seteostag("<s>");
|
221 |
if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_DEBUG)) {
|
222 |
tt.debug(true);
|
223 |
} else {
|
224 |
tt.setquiet(); |
225 |
} |
226 |
if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_CAPHEURISTIC)) {
|
227 |
tt.setcapheuristics(); |
228 |
} |
229 |
if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_HYPHENHEURISTIC)) {
|
230 |
tt.sethyphenheuristics(); |
231 |
} |
232 |
if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_PROB)) {
|
233 |
tt.setprob(); |
234 |
} |
235 |
|
236 |
String lex = TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.OPTIONS_LEX);
|
237 |
if (lex !=null && lex.length() > 0) { |
238 |
tt.setlex(lex); |
239 |
} |
240 |
String wc = TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.OPTIONS_WC);
|
241 |
if (wc !=null && wc.length() > 0) { |
242 |
tt.setwc(wc); |
243 |
} |
244 |
tt.treetagger(modelfile.getAbsolutePath(), infile.getAbsolutePath(), outfile.getAbsolutePath()) |
245 |
infile.delete(); |
246 |
} catch(Exception e) { |
247 |
Log.printStackTrace(e); |
248 |
System.out.println("Failed to apply TreeTagger on $ttsrcfile input file with the $modelfile model file."); |
249 |
return false; |
250 |
} |
251 |
return true; |
252 |
} |
253 |
|
254 |
/**
|
255 |
* Write standoff file.
|
256 |
*
|
257 |
* @param ttoutfile the ttoutfile
|
258 |
* @param posfile the posfile
|
259 |
* @return true, if successful
|
260 |
*/
|
261 |
public boolean writeStandoffFile(File ttoutfile, File posfile) |
262 |
{ |
263 |
def encoding ="UTF-8"; |
264 |
def transfo = new CSV2W_ANA(); |
265 |
//println("build w-interp "+ttfile.getName()+ ">>"+posfile.getName())
|
266 |
transfo.setAnnotationTypes( types, typesDesc, typesTAGSET, typesWEB, idform); |
267 |
transfo.setResp(respId, respDesc, respDate, respPerson, respWhen); |
268 |
transfo.setApp(appIdent, appVersion); |
269 |
transfo.setTarget(ttoutfile.getAbsolutePath(), reportFile); |
270 |
transfo.setInfos(distributor, publiStmt, sourceStmt); |
271 |
return transfo.process( ttoutfile, posfile, encoding );
|
272 |
} |
273 |
|
274 |
/**
|
275 |
* Run step by step : build TT src files, run TT, build xml-standoff files, inject standoff annotations
|
276 |
*
|
277 |
* @param rootDirFile the root dir file
|
278 |
* @param modelfilename the modelfilename
|
279 |
* @return true, if successful
|
280 |
*/
|
281 |
public boolean run(File binDir, File txmDir, String modelfilename) |
282 |
{ |
283 |
//test if modelfile exists
|
284 |
if (debug) {
|
285 |
println "rootDirFile "+binDir
|
286 |
println "txmDir "+txmDir
|
287 |
println "TREETAGGER INSTALL PATH : "+TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH);
|
288 |
println "TREETAGGER MODELS PATH : "+TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.MODELS_PATH)
|
289 |
} |
290 |
|
291 |
//test if the Toolbox know TreeTagger
|
292 |
if (!new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)+"/bin/").exists()) { |
293 |
println("Could not find TreeTagger binaries in "+TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)+"/bin/") |
294 |
return false; |
295 |
} |
296 |
String langAll = null |
297 |
String lang;
|
298 |
if (modelfilename.startsWith("??")) { |
299 |
langAll = new LangDetector(binDir).getLang();
|
300 |
println "General lang $langAll"
|
301 |
} |
302 |
|
303 |
//cleaning
|
304 |
new File(binDir, "annotations").deleteDir(); |
305 |
new File(binDir, "annotations").mkdir(); |
306 |
new File(binDir, "treetagger").deleteDir(); |
307 |
new File(binDir, "treetagger").mkdir(); |
308 |
|
309 |
ArrayList<String> milestones = []; |
310 |
|
311 |
//BUILD TT FILE READY TO BE TAGGED
|
312 |
List<File> files = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER) |
313 |
|
314 |
println("Building TT source files ("+files.size()+") from directory "+txmDir) |
315 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
|
316 |
for (File f : files) { |
317 |
cpb.tick() |
318 |
File srcfile = f;
|
319 |
File resultfile = new File(binDir, "treetagger/"+f.getName()+".tt"); |
320 |
if(debug)
|
321 |
println "build tt src : "+srcfile+" >> "+resultfile |
322 |
def ttsrcbuilder = new BuildTTSrc(srcfile.toURI().toURL()) |
323 |
if (!ttsrcbuilder.process(resultfile, null)) |
324 |
System.out.println("Failed to build tt src file of "+srcfile); |
325 |
} |
326 |
|
327 |
if (cancelNow) return; |
328 |
|
329 |
File modelDirectory = new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.MODELS_PATH)); |
330 |
if (!modelDirectory.exists()) {
|
331 |
println "Skipping ANNOTATE: TreeTagger language model file directory not found: "+modelDirectory.getAbsolutePath();
|
332 |
return false; |
333 |
} else if (!modelDirectory.canRead()) { |
334 |
println "Skipping ANNOTATE: impossible to access the TreeTagger language model file directory: "+modelDirectory.getAbsolutePath();
|
335 |
return false; |
336 |
} |
337 |
println("")
|
338 |
//Convert encoding if needed
|
339 |
|
340 |
//APPLY TREETAGGER
|
341 |
files = new File(binDir, "treetagger").listFiles(IOUtils.HIDDENFILE_FILTER) |
342 |
println("Applying $modelfilename TreeTagger model on dir: "+new File(binDir, "treetagger")+ " ("+files.size()+" files)") |
343 |
if (files == null || files.size() == 0) |
344 |
return false; |
345 |
File modelfile;
|
346 |
cpb = new ConsoleProgressBar(files.size())
|
347 |
for (File f : files) { |
348 |
String tmpModelFileName = modelfilename
|
349 |
if (modelfilename.startsWith("??")) { |
350 |
lang = langAll; |
351 |
if (f.length() > LangDetector.MINIMALSIZE) {
|
352 |
lang = new LangDetector(f).getLang();
|
353 |
//println "guessing lang $f : $lang"
|
354 |
} |
355 |
tmpModelFileName = lang+".par"
|
356 |
} |
357 |
modelfile = new File(modelsDirectory, tmpModelFileName); |
358 |
if (debug)
|
359 |
println "model file : "+modelfile;
|
360 |
|
361 |
File
|
362 |
|
363 |
if (!modelfile.exists()) {
|
364 |
println "Skipping ANNOTATE: '$modelfile' TreeTagger language model file not found."
|
365 |
if(System.getProperty("os.name").startsWith("Windows") || System.getProperty("os.name").startsWith("Mac")) |
366 |
println "Windows&Mac users: the operating system might be hiding file extensions. Use your file explorer to check the file name."
|
367 |
return false; |
368 |
} else if (!modelfile.canRead()) { |
369 |
println "Skipping ANNOTATE: impossible to access the '$modelfile' TreeTagger language model file."
|
370 |
return false; |
371 |
} |
372 |
|
373 |
// if (modelfile.getName().equals("sp.par")) {//UTF >> Latin1
|
374 |
// if(debug)
|
375 |
// println "fix encoding for model "+modelfile
|
376 |
// new EncodingConverter(f, "UTF-8", "ISO-8859-1")
|
377 |
// }
|
378 |
|
379 |
cpb.tick() |
380 |
File infile = f;
|
381 |
File outfile = new File(f.getParent(),f.getName()+"-out.tt"); |
382 |
if (!applyTT(infile, outfile, modelfile)) {
|
383 |
return false; |
384 |
} |
385 |
|
386 |
// //Reconvert encoding if needed
|
387 |
// if (modelfile.getName().equals("sp.par")) {
|
388 |
// if(debug)
|
389 |
// println "convert "+f+" latin1 >> UTF-8"
|
390 |
// new EncodingConverter(f, "ISO-8859-1", "UTF-8")
|
391 |
// }
|
392 |
} |
393 |
println("")
|
394 |
|
395 |
if (cancelNow) return; |
396 |
|
397 |
//BUILD STAND-OFF FILES
|
398 |
if (modelfile == null) { |
399 |
println "no model applied"
|
400 |
return false; |
401 |
} |
402 |
|
403 |
initTTOutfileInfos(binDir, modelfile, modelfilename); |
404 |
files = new File(binDir, "treetagger").listFiles(IOUtils.HIDDENFILE_FILTER)// now contains the result files of TT |
405 |
println "Building stdoff files ("+files.size()+") from dir:"+new File(binDir, "treetagger")+" to "+new File(binDir, "annotations"); |
406 |
if (files == null || files.size() == 0) |
407 |
return false; |
408 |
cpb = new ConsoleProgressBar(files.size())
|
409 |
for (File ttrezfile : files) { |
410 |
cpb.tick() |
411 |
File annotfile = new File(binDir, "annotations/"+ttrezfile.getName()+"-STOFF.xml"); |
412 |
if (!writeStandoffFile(ttrezfile, annotfile))
|
413 |
println("Failed to build standoff file of "+ttrezfile);
|
414 |
} |
415 |
println("")
|
416 |
|
417 |
if (cancelNow) return; |
418 |
|
419 |
//INJECT ANNOTATIONS
|
420 |
List<File> interpfiles = new File(binDir, "annotations").listFiles(IOUtils.HIDDENFILE_FILTER); |
421 |
List<File> txmfiles = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER); |
422 |
if (txmfiles == null) { |
423 |
println "No file to annotate in "+txmDir.getAbsolutePath()
|
424 |
return false; |
425 |
} |
426 |
interpfiles.sort(); // same order
|
427 |
txmfiles.sort(); //same order
|
428 |
println "Injecting stdoff files ("+interpfiles.size()+") data from "+new File(binDir, "annotations")+ " to xml-txm files of "+txmDir; |
429 |
if (interpfiles == null || interpfiles.size() == 0) |
430 |
return false; |
431 |
cpb = new ConsoleProgressBar(interpfiles.size())
|
432 |
for (int i = 0 ; i < interpfiles.size() ; i++) { |
433 |
cpb.tick() |
434 |
File srcfile = txmfiles.get(i);
|
435 |
File pos1file = interpfiles.get(i);
|
436 |
File temp = File.createTempFile("Annotate", "temp", srcfile.getParentFile()); |
437 |
def builder = new AnnotationInjection(srcfile.toURI().toURL(), pos1file.toURI().toURL()); |
438 |
if (!builder.process(temp)) {
|
439 |
return false; |
440 |
} |
441 |
builder = null;
|
442 |
|
443 |
//println "renaming files..."
|
444 |
if (!(srcfile.delete() && temp.renameTo(srcfile)))
|
445 |
println "Warning can't rename file "+temp+" to "+srcfile |
446 |
} |
447 |
|
448 |
println("")
|
449 |
return true; |
450 |
} |
451 |
|
452 |
public void setModelsDirectory(File modelsDirectory) { |
453 |
this.modelsDirectory = modelsDirectory;
|
454 |
} |
455 |
|
456 |
/**
|
457 |
* Run file by file. Allow to have one different lang per file. Default behavior add new word properties
|
458 |
*
|
459 |
* @param binDir
|
460 |
* @param txmDir
|
461 |
* @param lang associate a file name with a model filename
|
462 |
* @return true, if successful
|
463 |
*/
|
464 |
public boolean run(File binDir, File txmDir, HashMap<String, String> langs) |
465 |
{ |
466 |
return run(binDir, txmDir, langs, false, new String[0], new String[0]); |
467 |
} |
468 |
|
469 |
/**
|
470 |
* Run file by file. Allow to have one different lang per file
|
471 |
*
|
472 |
* @param binDir
|
473 |
* @param txmDir
|
474 |
* @param lang associate a file name with a model filename
|
475 |
* @param replace, replace or create a word property
|
476 |
* @return true, if successful
|
477 |
*/
|
478 |
public boolean run(File binDir, File txmDir, HashMap<String, String> langs, boolean replace, String[] properties, String[] options) |
479 |
{ |
480 |
if (!new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)+"/bin/").exists()) { |
481 |
println("Path to TreeTagger is wrong "+TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)+"/bin/") |
482 |
return true; |
483 |
} |
484 |
|
485 |
List<File> listfiles = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER); |
486 |
|
487 |
//cleaning
|
488 |
File annotDir = new File(binDir,"annotations"); |
489 |
annotDir.deleteDir(); |
490 |
annotDir.mkdir(); |
491 |
File ptreetaggerDir = new File(binDir,"ptreetagger"); |
492 |
ptreetaggerDir.deleteDir(); |
493 |
ptreetaggerDir.mkdir(); |
494 |
File treetaggerDir = new File(binDir,"treetagger"); |
495 |
treetaggerDir.deleteDir(); |
496 |
treetaggerDir.mkdir(); |
497 |
|
498 |
int cores = Runtime.getRuntime().availableProcessors() |
499 |
int coresToUse = Math.max(1.0, cores * 0.7) |
500 |
ExecutorService pool = Executors.newFixedThreadPool(coresToUse) |
501 |
|
502 |
def files = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER)
|
503 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
|
504 |
for (File teiFile : files) { |
505 |
int counter = 1; |
506 |
ThreadFile t = new ThreadFile("TT_"+counter++, teiFile) { |
507 |
|
508 |
public void run() { |
509 |
|
510 |
if (cancelNow) return; |
511 |
if (langs.get(f.getName()) == null) { |
512 |
println "Error: no lang defined for file $f"
|
513 |
return;
|
514 |
} |
515 |
|
516 |
String lang = langs.get(f.getName());
|
517 |
run(f, lang, binDir, txmDir, replace, properties, options, annotDir, ptreetaggerDir, treetaggerDir) |
518 |
|
519 |
cpb.tick(); |
520 |
} |
521 |
}; |
522 |
|
523 |
pool.execute(t) |
524 |
} |
525 |
|
526 |
pool.shutdown() |
527 |
pool.awaitTermination(10, TimeUnit.HOURS) |
528 |
println ""
|
529 |
return true; |
530 |
} |
531 |
public boolean run(File f, String lang, File binDir, File txmDir) { |
532 |
|
533 |
File annotDir = new File(binDir,"annotations"); |
534 |
annotDir.mkdir(); |
535 |
File ptreetaggerDir = new File(binDir,"ptreetagger"); |
536 |
ptreetaggerDir.mkdir(); |
537 |
File treetaggerDir = new File(binDir,"treetagger"); |
538 |
treetaggerDir.mkdir(); |
539 |
|
540 |
return run(f, lang, binDir, txmDir, false, new String[0], new String[0], annotDir, ptreetaggerDir, treetaggerDir) |
541 |
} |
542 |
|
543 |
|
544 |
public boolean run(File f, String lang, File binDir, File txmDir, boolean replace, String[] properties, String[] options, File annotDir, File ptreetaggerDir, File treetaggerDir) { |
545 |
|
546 |
File modelfile = new File(modelsDirectory, lang+".par"); |
547 |
if (!modelfile.exists()) {
|
548 |
println "Error: No Modelfile available for lang "+modelfile+". Continue import process "; |
549 |
return false; |
550 |
} |
551 |
File annotfile = new File(annotDir, f.getName()+"-STDOFF.xml"); |
552 |
File ttsrcfile = new File(ptreetaggerDir, f.getName()+"-src.tt"); |
553 |
File ttrezfile = new File(treetaggerDir, f.getName()+"-out.tt"); |
554 |
//println ("TT with $model "+f+"+"+annotfile+" > "+ttsrcfile+" > "+ttrezfile);
|
555 |
|
556 |
//BUILD TT FILE READY TO BE TAGGED
|
557 |
def builder = new BuildTTSrc(f.toURL()); |
558 |
builder.process(ttsrcfile, null);
|
559 |
|
560 |
//Apply TT
|
561 |
applyTT(ttsrcfile, ttrezfile, modelfile, options); |
562 |
|
563 |
//CREATE STANDOFF FILES
|
564 |
initTTOutfileInfos(binDir, modelfile, lang, properties); |
565 |
writeStandoffFile(ttrezfile, annotfile) |
566 |
|
567 |
//INJECT ANNOTATIONS
|
568 |
File tmpFile = new File(txmDir, "temp_"+f.getName()) |
569 |
builder = new AnnotationInjection(f.toURL(), annotfile.toURL(), replace);
|
570 |
builder.process(tmpFile); |
571 |
if (!(f.delete() && tmpFile.renameTo(f))) println "Warning can't rename file "+tmpFile+" to "+f |
572 |
|
573 |
return f.exists();
|
574 |
} |
575 |
|
576 |
public void setCancelNow() { |
577 |
cancelNow = true;
|
578 |
} |
579 |
|
580 |
public class ThreadFile extends Thread { |
581 |
File f;
|
582 |
public ThreadFile(String name, File f) { |
583 |
this.setName(name)
|
584 |
this.f = f;
|
585 |
} |
586 |
} |
587 |
} |