Révision 3051
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/tmx/tmxLoader.groovy (revision 3051) | ||
---|---|---|
88 | 88 |
if (annotate) { |
89 | 89 |
println "-- ANNOTATE - Running NLP tools" |
90 | 90 |
def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger") |
91 |
if (engine.processDirectory(txmDir, binDir, ["lang":model])) { |
|
91 |
def mapForTreeTagger = [:] |
|
92 |
for (def k : textLangs.keySet()) mapForTreeTagger[k] = textLangs[k].toLowerCase(); |
|
93 |
println "TreeTagger models to use per text: $mapForTreeTagger" |
|
94 |
if (engine.processDirectory(txmDir, binDir, ["langs":mapForTreeTagger])) { |
|
92 | 95 |
annotationSuccess = true; |
93 | 96 |
if (project.getCleanAfterBuild()) { |
94 | 97 |
new File(binDir, "treetagger").deleteDir() |
... | ... | |
97 | 100 |
} |
98 | 101 |
} |
99 | 102 |
} |
100 |
println "langs : "+textLangs |
|
101 |
println "texts : "+langGroups |
|
103 |
//println "langs : "+textLangs
|
|
104 |
//println "texts : "+langGroups
|
|
102 | 105 |
|
103 | 106 |
println "-- COMPILING - Building Search Engine indexes" |
104 | 107 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/Annotate.groovy (revision 3051) | ||
---|---|---|
403 | 403 |
// println "convert "+f+" latin1 >> UTF-8" |
404 | 404 |
// new EncodingConverter(f, "ISO-8859-1", "UTF-8") |
405 | 405 |
// } |
406 |
|
|
407 |
|
|
408 |
|
|
409 |
initTTOutfileInfos(binDir, modelfile, modelfilename); |
|
410 |
|
|
411 |
File annotfile = new File(binDir, "annotations/"+outfile.getName()+"-STOFF.xml"); |
|
412 |
if (!writeStandoffFile(outfile, annotfile)) { |
|
413 |
println("Failed to build standoff file of "+outfile); |
|
414 |
} |
|
415 |
if (cancelNow) return; |
|
406 | 416 |
} |
407 | 417 |
println("") |
408 | 418 |
|
... | ... | |
410 | 420 |
return false; |
411 | 421 |
} |
412 | 422 |
|
413 |
//BUILD STAND-OFF FILES |
|
414 |
if (modelfile == null) { |
|
415 |
println "no model applied" |
|
416 |
return false; |
|
417 |
} |
|
418 |
|
|
419 |
initTTOutfileInfos(binDir, modelfile, modelfilename); |
|
420 |
files = new File(binDir, "treetagger").listFiles(IOUtils.HIDDENFILE_FILTER)// now contains the result files of TT |
|
421 |
println "Building stdoff files ("+files.size()+") from dir:"+new File(binDir, "treetagger")+" to "+new File(binDir, "annotations"); |
|
422 |
if (files == null || files.size() == 0) { |
|
423 |
return false; |
|
424 |
} |
|
425 |
cpb = new ConsoleProgressBar(files.size()) |
|
426 |
for (File ttrezfile : files) { |
|
427 |
cpb.tick() |
|
428 |
File annotfile = new File(binDir, "annotations/"+ttrezfile.getName()+"-STOFF.xml"); |
|
429 |
if (!writeStandoffFile(ttrezfile, annotfile)) { |
|
430 |
println("Failed to build standoff file of "+ttrezfile); |
|
431 |
} |
|
432 |
} |
|
433 |
println("") |
|
434 |
|
|
435 |
if (cancelNow) return; |
|
436 |
|
|
437 | 423 |
//INJECT ANNOTATIONS |
438 | 424 |
List<File> interpfiles = new File(binDir, "annotations").listFiles(IOUtils.HIDDENFILE_FILTER); |
439 | 425 |
List<File> txmfiles = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER); |
tmp/org.txm.treetagger.core/src/org/txm/treetagger/core/TreeTaggerEngine.java (revision 3051) | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.File; |
4 | 4 |
import java.util.HashMap; |
5 |
import java.util.Map; |
|
5 | 6 |
|
6 | 7 |
import org.eclipse.core.runtime.IProgressMonitor; |
7 | 8 |
import org.eclipse.osgi.util.NLS; |
... | ... | |
82 | 83 |
public boolean processFile(File xmlFile, File binaryCorpusDirectory, HashMap<String, Object> parameters) { |
83 | 84 |
if (!isRunning()) return false; |
84 | 85 |
|
86 |
String lang = null; |
|
87 |
Object ps = parameters.get("langs"); |
|
88 |
Object p = parameters.get("lang"); |
|
85 | 89 |
|
86 |
Object p = parameters.get("lang"); |
|
87 |
if (p == null) { |
|
90 |
if (p == null && ps == null) { |
|
91 |
Log.warning("Warning: can't annotate. No 'lang' (String) or 'langs' (Map<String, String>) parameter specified in " + parameters); |
|
92 |
return false; |
|
93 |
} |
|
94 |
|
|
95 |
if (ps != null && ps instanceof Map) { |
|
96 |
Map<?, ?> map = (Map<?, ?>) ps; |
|
97 |
String text_id = xmlFile.getName(); |
|
98 |
if (map.get(text_id) != null) { |
|
99 |
lang = map.get(text_id).toString().toLowerCase(); |
|
100 |
if (!canAnnotateLang(lang)) { |
|
101 |
Log.warning("Warning: can't annotate text_id=${text_id} with $lang, will use the default lang=$p"); |
|
102 |
return false; |
|
103 |
} |
|
104 |
} |
|
105 |
} |
|
106 |
|
|
107 |
if (lang == null && p == null) { |
|
88 | 108 |
System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters)); |
89 | 109 |
return false; |
90 | 110 |
} |
91 |
String lang = p.toString(); |
|
111 |
else { |
|
112 |
lang = p.toString(); |
|
113 |
} |
|
114 |
|
|
92 | 115 |
if (!canAnnotateLang(lang)) { |
93 | 116 |
return false; |
94 | 117 |
} |
118 |
|
|
95 | 119 |
boolean fixExistingValues = false; // default behavior is to replace existing values |
96 | 120 |
if (parameters.get("fix_existing_values") != null) { |
97 | 121 |
fixExistingValues = "true".equals(parameters.get("fix_existing_values")); |
... | ... | |
99 | 123 |
if (FileUtils.isExtension(xmlFile, "cqp")) { |
100 | 124 |
AnnotateCQP annotate = new AnnotateCQP(); |
101 | 125 |
return annotate.run(xmlFile, lang, binaryCorpusDirectory, xmlFile.getParentFile()); |
102 |
} else { |
|
126 |
} |
|
127 |
else { |
|
103 | 128 |
Annotate annotate = new Annotate(); |
104 | 129 |
return annotate.run(xmlFile, lang, fixExistingValues, binaryCorpusDirectory, xmlFile.getParentFile()); |
105 | 130 |
} |
... | ... | |
134 | 159 |
} |
135 | 160 |
|
136 | 161 |
public static boolean canAnnotateLang(String lang) { |
137 |
|
|
162 |
if (lang == null) { |
|
163 |
Log.warning("** Error: not lang given (null)"); |
|
164 |
return false; |
|
165 |
} |
|
138 | 166 |
File ttInstallDirectory = new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH)); // default models directory is set in the Toolbox |
139 | 167 |
if (!ttInstallDirectory.exists()) { |
140 |
System.out.println(NLS.bind("** Error: TreeTagger install directory not found at {0}", ttInstallDirectory));
|
|
168 |
Log.warning(NLS.bind("** Error: TreeTagger install directory not found at {0}", ttInstallDirectory));
|
|
141 | 169 |
return false; |
142 | 170 |
} |
143 | 171 |
File modelsDirectory = new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
144 | 172 |
File modelfile = new File(modelsDirectory, lang + ".par"); |
145 | 173 |
if (!"??".equals(lang) && !modelfile.exists()) { |
146 |
System.out.println(NLS.bind("** Error: no {0} model file found for the {1} lang.", modelfile, lang));
|
|
174 |
Log.warning(NLS.bind("** Error: no {0} model file found for the {1} lang.", modelfile, lang));
|
|
147 | 175 |
return false; |
148 | 176 |
} |
149 | 177 |
return true; |
Formats disponibles : Unified diff