root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZImporter.groovy @ 2126
History | View | Annotate | Download (14.9 kB)
1 |
package org.txm.scripts.importer.xtz
|
---|---|
2 |
|
3 |
import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml |
4 |
|
5 |
import java.io.File; |
6 |
|
7 |
import net.sf.saxon.style.XSLParam; |
8 |
|
9 |
import org.txm.core.preferences.TBXPreferences; |
10 |
import org.txm.importer.ApplyXsl2 |
11 |
import org.txm.scripts.importer.CleanFile |
12 |
import org.txm.scripts.importer.MileStoneProjection |
13 |
import org.txm.importer.ValidateXml |
14 |
import org.txm.* |
15 |
|
16 |
import javax.xml.parsers.SAXParser |
17 |
import javax.xml.parsers.SAXParserFactory |
18 |
import javax.xml.stream.* |
19 |
|
20 |
import org.w3c.dom.Element |
21 |
import org.xml.sax.XMLReader |
22 |
import org.txm.utils.xml.DomUtils |
23 |
import org.txm.metadatas.Metadatas |
24 |
import org.txm.objects.* |
25 |
import org.txm.importer.scripts.xmltxm.Xml2Ana |
26 |
import org.txm.utils.ConsoleProgressBar |
27 |
import org.txm.utils.io.* |
28 |
import org.txm.importer.xtz.* |
29 |
|
30 |
class XTZImporter extends Importer { |
31 |
|
32 |
/** Raised by cancel(). */
public boolean cancelNow

/** Files handled by the current step; each step of the pipeline reassigns it. */
public def filesToProcess = []

// Working directories. They are not referenced by the methods of this class.
public File frontXSLResultDirectory
public File tokenizedDirectory
public File sentencedDirectory

/** Front XSL path read from the project by process(). */
public String xslPath

/** XSL parameters read from the project by process(); the XSL steps add "output-directory" to it. */
public def xslParams

/** Language read from the project by process() and handed to the tokenizer. */
public String lang

/** Word element name read from the project by process() and handed to the milestone projection. */
public String wordTag

/** Project flag read by process(): when false, doTokenizeStep() only copies the files. */
public boolean doTokenizeStep

/** Text metadata, loaded by process() when a metadata file is found; null otherwise. */
public Metadatas metadata
46 |
|
47 |
/**
 * Creates the importer of the given import module.
 *
 * @param module the import module, handed unchanged to the parent constructor
 */
public XTZImporter(ImportModule module) {
	super(module)
}
50 |
|
51 |
/**
 * @return the text metadata loaded by process(), or null when no metadata has been loaded
 */
public Metadatas getMetadata() {
	return this.metadata
}
54 |
|
55 |
/**
 * Runs the whole import pipeline on the files of the input directory.
 *
 * The method reads the import parameters from the project, loads the metadata file
 * when one is found, then chains the steps (fix surrogates, split-merge XSL, front XSL,
 * XML check, tokenize, post-tokenize XSL, milestones, XML-TXM, metadata injection).
 * It stops at the first step returning false. On any early exit caused by missing files or
 * by a metadata copy failure, the "reason" member describes the problem.
 *
 * isSuccessFul is only set when every step succeeded; the intermediate
 * "tokenized", "src" and "split" directories are then removed if the project asks for it.
 */
public void process() {

	Project project = module.getProject()
	File binDir = module.getBinaryDirectory()

	def files = inputDirectory.listFiles()
	if (files == null) {
		reason = "No file to process in "+inputDirectory
		return // no file to process in the directory
	}

	// The result is only used to make sure at least one candidate source file exists:
	// the steps below list the directories themselves.
	def srcFiles = files.findAll { File f ->
		String name = f.getName()
		!(f.isHidden() || f.isDirectory()
				|| name.endsWith(".properties")
				|| name.equals("import.xml")
				|| name.endsWith(".csv")
				|| name.endsWith(".dtd")
				|| name.endsWith(".xsl")
				|| name.endsWith("~")
				|| name.startsWith("."))
	}
	if (srcFiles.size() == 0) {
		reason = "No suitable file to process in "+inputDirectory
		return // no file to process in the directory
	}

	// import parameters
	xslPath = project.getFrontXSL()
	xslParams = project.getXsltParameters()
	lang = project.getLang()
	wordTag = project.getTokenizerWordElement()
	this.doTokenizeStep = project.getDoTokenizerStep()

	// prepare metadata if any
	File allMetadataFile = Metadatas.findMetadataFile(inputDirectory)
	if (allMetadataFile != null && allMetadataFile.exists()) {
		File copy = new File(binDir, allMetadataFile.getName())
		if (!FileCopy.copy(allMetadataFile, copy)) {
			// record the failure like the other early exits do
			reason = "Could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile()
			println "Error: could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile();
			return
		}
		metadata = new Metadatas(copy,
				Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
				Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
				Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
	}

	String cleanDirectories = project.getCleanAfterBuild()

	if (!doFixSurrogates()) return
	if (!doSplitMergeXSLStep()) return
	if (!doFrontXSLStep()) return
	if (!doCheckXMLFileStep()) return
	if (!doTokenizeStep()) return
	if (!doPostTokenizeXSLStep()) return
	if (!doEncodeMileStonesStep()) return
	if (!doToXMLTXMStep()) return
	if (!doInjectMetadataStep()) return

	isSuccessFul = filesToProcess.size() > 0

	if ("true".equals(cleanDirectories)) {
		// drop the intermediate directories
		new File(binDir, "tokenized").deleteDir()
		new File(binDir, "src").deleteDir()
		new File(binDir, "split").deleteDir()
	}
}
138 |
|
139 |
/**
 * Copies the content of the input directory into $bindir/src.
 *
 * Entries whose name ends with ".xml" (case insensitive) go through
 * CleanFile.removeSurrogateFromXmlFile, which is meant to remove the surrogate
 * characters that are not well managed; every other entry is copied with FileCopy.copyFiles.
 * On success the input directory of the next steps becomes $bindir/src.
 *
 * @return false, with "reason" set, when the input directory cannot be listed or
 *         $bindir/src cannot be created; true otherwise
 */
public boolean doFixSurrogates() {
	File srcDirectory = new File(module.getBinaryDirectory(), "src")

	def files = inputDirectory.listFiles()
	if (files == null) {
		reason = "No file to process in $inputDirectory"
		return false
	}

	srcDirectory.deleteDir() // clean before copying
	// Stop here if the directory cannot be created: the copies below would all fail
	// and the next steps would be pointed to a directory that does not exist.
	if (!srcDirectory.mkdirs() && !srcDirectory.isDirectory()) {
		reason = "Could not create the $srcDirectory directory"
		return false
	}

	for (File file : files) {
		File outputFile = new File(srcDirectory, file.getName())
		if (file.getName().toLowerCase().endsWith(".xml")) {
			CleanFile.removeSurrogateFromXmlFile(file, outputFile)
		} else {
			FileCopy.copyFiles(file, outputFile)
		}
	}

	inputDirectory = srcDirectory // the files to process are now in the "src" directory
	return true
}
169 |
|
170 |
/**
 * Applies the split-merge stylesheet, if any, to the files of the input directory.
 *
 * The stylesheets are looked up in the "xsl/1-split-merge" sub-directory of the source
 * directory. Only one XSL is applied: the first visible "*.xsl" file in sorted order.
 * The "output-directory" XSL parameter is set to the URI of $bindir/split and, when the
 * stylesheet succeeds, $bindir/split becomes the input directory of the next steps.
 * When there is no usable stylesheet, the input directory is left untouched.
 *
 * @return false, with "reason" set, when the stylesheet could not be applied; true otherwise
 */
public boolean doSplitMergeXSLStep() {

	File splitedDirectory = new File(module.getBinaryDirectory(), "split")
	splitedDirectory.deleteDir()

	File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
	println "-- Split-Merge XSL Step with $xslDirectory"

	def xslFiles = xslDirectory.listFiles()
	if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
		return true // nothing to do
	}

	// THERE IS ONLY ONE XSL FILE TO APPLY: the first usable one in sorted order
	xslFiles.sort()
	File xslFile = xslFiles.find { File f ->
		!f.isDirectory() && !f.isHidden() && f.getName().endsWith(".xsl")
	}
	if (xslFile == null) {
		// The directory only holds entries that are not stylesheets: keep the current
		// input directory instead of switching to an empty "split" directory.
		return true
	}

	splitedDirectory.mkdir()
	xslParams["output-directory"] = splitedDirectory.getAbsoluteFile().toURI().toString()

	if (!ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams)) {
		// report the stylesheet that failed and stop the import
		reason = "Fail to apply split-merge XSL: $xslFile"
		return false
	}
	println ""

	inputDirectory = splitedDirectory // the files to process are now in the "split" directory
	return true
}
207 |
|
208 |
/**
 * Encodes the milestone elements declared in the "MileStones" textual plan of the project
 * into the files of $bindir/tokenized.
 *
 * For each milestone name and each file, MileStoneProjection writes its result in a
 * temporary file created next to the input file; the temporary file then replaces the input file.
 * Nothing is done when the textual plan declares no milestone.
 *
 * @return false when a projection fails or when a file cannot be replaced; true otherwise
 */
public boolean doEncodeMileStonesStep() {
	filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()

	def milestonesString = module.getProject().getTextualPlan("MileStones")
	if (milestonesString == null || milestonesString.length() == 0) return true

	def milestones = milestonesString.split(",")
	if (milestones.size() == 0) return true

	println "-- Encoding milestone $milestones into XML files..."
	ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
	for (def milestone : milestones) {
		milestone = milestone.trim() // just in case
		cpb.tick()
		if (milestone.length() == 0) continue // blank entry of the list, nothing to encode

		for (File inputFile : filesToProcess) {
			File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile())
			println "$inputFile, 'text', $wordTag, $milestone, $outputFile"

			MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
			if (!msp.process(outputFile)) {
				println "Fail to encode $milestone in $inputFile"
				outputFile.delete() // do not leave the temporary file among the tokenized files
				return false
			}

			if (!inputFile.delete()) {
				println "Fail to encode $milestone in ${inputFile}: could not replace the file."
				outputFile.delete() // the input file is kept, the temporary result is useless
				return false
			}

			// No other copy of the result is written in the directory: the next steps
			// list the "tokenized" directory and would take any extra file for a text.
			if (!outputFile.renameTo(inputFile)) {
				// the input file is already deleted: keep the temporary file, it holds the only copy
				println "Fail to encode $milestone in ${inputFile}: could not rename $outputFile."
				return false
			}
		}
	}
	println ""
	return true
}
245 |
|
246 |
/**
 * Applies the front stylesheets, if any, to the files of the input directory.
 *
 * The visible "*.xsl" files of the "xsl/2-front" sub-directory of the source directory
 * are applied one after the other, in sorted order. The first one reads from the current
 * input directory and writes to the output directory; the output directory then becomes
 * the input directory, so the next stylesheets read and write the output directory.
 * The "output-directory" XSL parameter is always set to the URI of the output directory,
 * even when there is no stylesheet to apply.
 *
 * @return false, with "reason" set, as soon as one stylesheet cannot be applied; true otherwise
 */
public boolean doFrontXSLStep() {

	File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
	println "-- Front XSL Step with the $frontXSLdirectory directory."
	def xslFiles = frontXSLdirectory.listFiles()

	xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString()

	if (!frontXSLdirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
		return true // nothing to do
	}

	xslFiles.sort()
	for (File xslFile : xslFiles) {
		if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

		if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
			// name the stylesheet that actually failed
			reason = "Fail to apply front XSL: $xslFile"
			return false
		}
		inputDirectory = outputDirectory // the files to process are now in the output directory
		println ""
	}
	return true
}
280 |
|
281 |
/**
 * Selects the files of the input directory that go through the rest of the pipeline.
 *
 * A file is kept in filesToProcess when its name ends with ".xml" (case insensitive),
 * it is a visible and readable regular entry, its name does not start with "metadata",
 * ValidateXml.test accepts it and it is not named "import.xml".
 *
 * @return false when the directory is empty or cannot be listed, or when no file is kept
 *         ("reason" is set in the latter case only); true otherwise
 */
public boolean doCheckXMLFileStep() {
	filesToProcess = []
	println "-- Checking XML files for well-formedness."

	def candidates = inputDirectory.listFiles()
	if (candidates == null || candidates.size() == 0) {
		println "No XML file (*.xml) to process. Aborting"
		return false
	}
	candidates = candidates.sort()

	ConsoleProgressBar progress = new ConsoleProgressBar(candidates.size())
	for (File candidate : candidates) {
		progress.tick()

		String name = candidate.getName()
		if (!name.toLowerCase().endsWith(".xml")) continue
		if (candidate.isHidden() || !candidate.canRead() || name.endsWith(".properties")
				|| name.startsWith("metadata") || candidate.isDirectory()) continue

		// the test also runs on "import.xml", which is never kept
		if (ValidateXml.test(candidate) && !name.equals("import.xml")) {
			filesToProcess << candidate
		}
	}
	println ""

	if (filesToProcess.size() == 0) {
		reason = "No file suitable to process in "+inputDirectory
		return false
	}
	return true
}
316 |
|
317 |
|
318 |
/**
 * Writes the files to process into $bindir/tokenized.
 *
 * When the doTokenizeStep flag is false the files are only copied. Otherwise each file
 * goes through a SimpleTokenizerXml configured with the language and with the element
 * names declared in the "OutSideTextTags", "OutSideTextTagsAndKeepContent" and "Note"
 * textual plans of the project. A file that cannot be tokenized is reported and its
 * output is deleted; the step goes on with the next file.
 *
 * After a tokenization, filesToProcess is the content of $bindir/tokenized.
 *
 * @return true in every case
 */
public boolean doTokenizeStep() {

	File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
	tokenizedDir.mkdir()

	if (!doTokenizeStep) {
		println "No tokenization do to."
		for (File f : filesToProcess) {
			FileCopy.copy(f, new File(tokenizedDir, f.getName()))
		}
		return true
	}

	// get the element names to ignore
	def project = module.getProject()
	String outSideTextTagsRegex = toAlternationRegex(project.getTextualPlan("OutSideTextTags"))
	String outSideTextTagsAndKeepContentRegex = toAlternationRegex(project.getTextualPlan("OutSideTextTagsAndKeepContent"))
	String noteRegex = toAlternationRegex(project.getTextualPlan("Note"))

	println "-- Tokenizing "+filesToProcess.size()+" files"
	ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
	for (File f : filesToProcess) {
		cpb.tick()
		File outfile = new File(tokenizedDir, f.getName())
		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)

		// a setter is only called when its list declares at least one name
		if (outSideTextTagsRegex.length() > 0) {
			tokenizer.setOutSideTextTags(outSideTextTagsRegex)
		}
		if (outSideTextTagsAndKeepContentRegex.length() > 0) {
			tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
		}
		if (noteRegex.length() > 0) {
			tokenizer.setNote(noteRegex)
		}

		// tokenize !
		if (!tokenizer.process()) {
			println("Failed to process "+f)
			outfile.delete()
		}
	}

	filesToProcess = tokenizedDir.listFiles()
	println ""

	return true
}

/**
 * Joins the names of a comma separated list with "|".
 *
 * The names are trimmed and the blank ones are skipped, so the result never holds an
 * empty alternative such as "a|" or "a||b".
 *
 * @param commaSeparatedNames the list as stored in the project, may be null or empty
 * @return the joined names, "" when the list declares no name
 */
private static String toAlternationRegex(String commaSeparatedNames) {
	if (commaSeparatedNames == null) return ""
	return commaSeparatedNames.split(",")
			.collect { String name -> name.trim() }
			.findAll { String name -> name.length() > 0 }
			.join("|")
}
397 |
|
398 |
/**
 * Hands the files of $bindir/tokenized to the stylesheets of the "xsl/3-posttok"
 * sub-directory of the source directory, through ApplyXsl2.processWithMultipleXSL.
 * The results are expected in $bindir/tokenized too (TODO confirm against ApplyXsl2).
 *
 * filesToProcess is always reset to the content of $bindir/tokenized.
 *
 * @return true when there is no stylesheet to apply, else the result of ApplyXsl2.processWithMultipleXSL
 */
public boolean doPostTokenizeXSLStep() {

	filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()

	File posttokDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
	def stylesheets = posttokDirectory.listFiles()

	boolean hasStylesheets = posttokDirectory.exists() && stylesheets != null && stylesheets.size() > 0
	if (hasStylesheets) {
		println "-- Posttokenize XSL Step with $posttokDirectory"
		return ApplyXsl2.processWithMultipleXSL(filesToProcess, posttokDirectory, xslParams)
	}
	return true
}
412 |
|
413 |
/**
 * Injects the loaded metadata into the "text" element of each file to process.
 *
 * Each result is written in a temporary file created next to the input file, which then
 * replaces the input file. A file whose injection fails is left as it is. Nothing is done
 * when no metadata has been loaded.
 *
 * @return false when an input file has been deleted but could not be replaced; true otherwise
 */
public boolean doInjectMetadataStep() {
	if (metadata == null) return true

	println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
	ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
	for (File infile : filesToProcess) {
		cpb.tick()
		File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())

		if (!metadata.injectMetadatasInXml(infile, outfile, "text", null)) {
			outfile.delete() // fail
			continue
		}

		if (!(infile.delete() && outfile.renameTo(infile))) println "Warning can't rename file "+outfile+" to "+infile
		if (!infile.exists()) {
			// the temporary file is kept: it holds the only copy of the text
			println "Error: could not replace $infile by $outfile"
			return false
		}

		// The input file could not be deleted and is kept as it is: remove the temporary
		// file so that it is not taken for a text of the directory later on.
		if (outfile.exists()) outfile.delete()
	}
	cpb.done()
	return true
}
434 |
|
435 |
/**
 * Builds, with Xml2Ana, one file in the output directory for each file of $bindir/tokenized.
 *
 * A file that cannot be built is reported and its output is deleted; the step goes on with
 * the next file. At the end, filesToProcess is the content of the output directory.
 *
 * @return true in every case
 */
public boolean doToXMLTXMStep() {
	filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
	ConsoleProgressBar progress = new ConsoleProgressBar(filesToProcess.size())
	println("-- Building XML-TXM ("+filesToProcess.size()+" files)")

	for (File tokenizedFile : filesToProcess) {
		progress.tick()
		File txmFile = new File(outputDirectory, tokenizedFile.getName())

		// every structure below is handed empty to the builder
		def correspType = new HashMap<String,String>()
		def correspRef = new HashMap<String,String>()
		// the ids of all the respStmt must be listed
		def respId = []
		// maps a respId to the execution report of the tool
		def applications = new HashMap<String,HashMap<String,String>>()
		// maps a respId to the type attributes of the ana property of the txm w,
		// to build the references to the taxonomies
		def taxonomiesUtilisees = new HashMap<String,String[]>()
		// maps an item id to its description and its URI
		def itemsURI = new HashMap<String,HashMap<String,String>>()
		// respStmt information: resps (respId <see above>, [description, person, date])
		def resps = new HashMap<String,String[]>()

		// run the process
		def builder = new Xml2Ana(tokenizedFile)
		builder.setWordTag(module.getProject().getTokenizerWordElement())
		builder.setConvertAllAtrtibutes(true)
		builder.setCorrespondances(correspRef, correspType)
		builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)

		if (!builder.process(txmFile)) {
			println("Failed to process "+tokenizedFile)
			txmFile.delete()
		}
	}

	filesToProcess = outputDirectory.listFiles()
	progress.done()
	return true
}
475 |
|
476 |
/**
 * Does nothing in this importer: the success flag is only set at the end of process().
 */
@Override
public void checkFiles() {
	// disabled check, kept for reference:
	// isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
}
480 |
|
481 |
/**
 * Raises the cancelNow flag. None of the steps of this class reads the flag.
 */
@Override
public void cancel() {
	this.cancelNow = true
}
485 |
} |