| 28 |
28 |
import org.txm.importer.xtz.*
|
| 29 |
29 |
|
| 30 |
30 |
class XTZImporter extends Importer {
|
| 31 |
|
|
|
31 |
|
| 32 |
32 |
/** Set to true by cancel(). */
public boolean cancelNow = false;

/** Files handed over between import steps; reassigned by several of the do*Step() methods. */
public def filesToProcess = []

// NOTE(review): the three directories below are not assigned in the code visible here — confirm where they are set.
public File frontXSLResultDirectory;
public File tokenizedDirectory;
public File sentencedDirectory;

/** Front XSL value, read from the project (getFrontXSL()) in process(). */
public String xslPath;

/** XSLT parameters, read from the project (getXsltParameters()) in process(). */
public def xslParams;

/** Language, read from the project (getLang()) in process(). */
public String lang;

/** Word element name, read from the project (getTokenizerWordElement()) in process(). */
public String wordTag;

/** Whether the tokenize step must run; read from the project (getDoTokenizerStep()) in process(). */
public boolean doTokenizeStep = false;

/** Text metadata; stays null unless process() loads a metadata file. */
public Metadatas metadata = null;
|
| 46 |
|
|
|
46 |
|
| 47 |
47 |
/**
 * Creates the importer for the given import module.
 *
 * @param module the import module, passed unchanged to the Importer super constructor
 */
public XTZImporter(ImportModule module) {
	super(module)
}
|
| 50 |
|
|
|
50 |
|
| 51 |
51 |
/**
 * @return the text metadata held by this importer, or null when none has been loaded
 */
public Metadatas getMetadata() {
	return this.metadata
}
|
| 54 |
|
|
|
54 |
|
| 55 |
55 |
public void process() {
|
| 56 |
|
|
|
56 |
|
| 57 |
57 |
Project project = module.getProject();
|
| 58 |
58 |
File binDir = module.getBinaryDirectory();
|
| 59 |
|
|
|
59 |
|
| 60 |
60 |
String corpusname = project.getName();
|
| 61 |
|
|
|
61 |
|
| 62 |
62 |
def srcFiles = [];
|
| 63 |
63 |
def files = inputDirectory.listFiles();
|
| 64 |
64 |
if (files == null) {
|
| 65 |
65 |
reason = "No file to process in "+inputDirectory
|
| 66 |
66 |
return; // no file to process in the directory
|
| 67 |
67 |
}
|
| 68 |
|
|
|
68 |
|
| 69 |
69 |
for (File f : files) {
|
| 70 |
70 |
if (f.isHidden() || f.isDirectory())
|
| 71 |
|
continue;
|
|
71 |
continue;
|
| 72 |
72 |
else if (f.getName().endsWith(".properties"))
|
| 73 |
|
continue;
|
|
73 |
continue;
|
| 74 |
74 |
else if (f.getName().equals("import.xml"))
|
| 75 |
|
continue;
|
|
75 |
continue;
|
| 76 |
76 |
else if (f.getName().endsWith(".csv"))
|
| 77 |
|
continue;
|
|
77 |
continue;
|
| 78 |
78 |
else if (f.getName().endsWith(".dtd"))
|
| 79 |
|
continue;
|
|
79 |
continue;
|
| 80 |
80 |
else if (f.getName().endsWith(".xsl"))
|
| 81 |
|
continue;
|
|
81 |
continue;
|
| 82 |
82 |
else if (f.getName().endsWith("~"))
|
| 83 |
|
continue;
|
|
83 |
continue;
|
| 84 |
84 |
else if (f.getName().startsWith("."))
|
| 85 |
|
continue;
|
| 86 |
|
|
|
85 |
continue;
|
|
86 |
|
| 87 |
87 |
srcFiles << f
|
| 88 |
88 |
}
|
| 89 |
|
|
|
89 |
|
| 90 |
90 |
if (srcFiles.size() == 0) {
|
| 91 |
91 |
reason = "No suitable file to process in "+inputDirectory
|
| 92 |
92 |
return; // no file to process in the directory
|
| 93 |
93 |
}
|
| 94 |
|
|
|
94 |
|
| 95 |
95 |
// prepare front XSL if any
|
| 96 |
96 |
xslPath = project.getFrontXSL()
|
| 97 |
97 |
xslParams = project.getXsltParameters()
|
| 98 |
|
|
|
98 |
|
| 99 |
99 |
lang = project.getLang();
|
| 100 |
|
|
|
100 |
|
| 101 |
101 |
wordTag = project.getTokenizerWordElement()
|
| 102 |
102 |
this.doTokenizeStep = project.getDoTokenizerStep()
|
| 103 |
|
|
|
103 |
|
| 104 |
104 |
//prepare metadata if any
|
| 105 |
105 |
File allMetadataFile = Metadatas.findMetadataFile(inputDirectory);
|
| 106 |
106 |
if (allMetadataFile.exists()) {
|
| ... | ... | |
| 110 |
110 |
return;
|
| 111 |
111 |
}
|
| 112 |
112 |
metadata = new Metadatas(copy,
|
| 113 |
|
Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
|
| 114 |
|
Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
|
| 115 |
|
Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
|
|
113 |
Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
|
|
114 |
Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
|
|
115 |
Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
|
| 116 |
116 |
}
|
| 117 |
|
|
|
117 |
|
| 118 |
118 |
// remove previous Text if any
|
| 119 |
119 |
project.deleteChildren(Text.class);
|
| 120 |
120 |
|
| 121 |
121 |
|
| 122 |
122 |
// main workflow of XTZ importer step
|
| 123 |
123 |
|
| 124 |
|
|
| 125 |
124 |
if (!doFixSurrogates()) return;
|
| 126 |
125 |
if (!doSplitMergeXSLStep()) return;
|
| 127 |
126 |
if (!doFrontXSLStep()) return;
|
| ... | ... | |
| 133 |
132 |
if (!doInjectMetadataStep()) return;
|
| 134 |
133 |
|
| 135 |
134 |
isSuccessFul = filesToProcess.size() > 0
|
| 136 |
|
|
|
135 |
|
| 137 |
136 |
String cleanDirectories = project.getCleanAfterBuild();
|
| 138 |
137 |
if ("true".equals(cleanDirectories)) {
|
| 139 |
138 |
new File(module.getBinaryDirectory(), "tokenized").deleteDir()
|
| ... | ... | |
| 141 |
140 |
new File(module.getBinaryDirectory(), "split").deleteDir()
|
| 142 |
141 |
}
|
| 143 |
142 |
}
|
| 144 |
|
|
|
143 |
|
| 145 |
144 |
/**
|
| 146 |
145 |
* read from source directory and remove the surrogate chars not well managed
|
| 147 |
146 |
* save the result in $bindir/src
|
| ... | ... | |
| 168 |
167 |
FileCopy.copyFiles(file, outputFile)
|
| 169 |
168 |
}
|
| 170 |
169 |
}
|
| 171 |
|
|
|
170 |
|
| 172 |
171 |
inputDirectory = srcDirectory; // the files to process are now in the "src" directory
|
| 173 |
172 |
return true;
|
| 174 |
173 |
}
|
| 175 |
|
|
|
174 |
|
| 176 |
175 |
/**
|
| 177 |
176 |
* read from source directory and write the result in $bindir/txm
|
| 178 |
177 |
*
|
| ... | ... | |
| 196 |
195 |
for (File xslFile : xslFiles) {
|
| 197 |
196 |
if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
|
| 198 |
197 |
//if (!xslFile.getName().matches("[1-9]-.+")) continue;
|
| 199 |
|
|
|
198 |
|
| 200 |
199 |
if (ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams)) {
|
| 201 |
200 |
println ""
|
| 202 |
201 |
} else {
|
| ... | ... | |
| 210 |
209 |
}
|
| 211 |
210 |
return true;
|
| 212 |
211 |
}
|
| 213 |
|
|
|
212 |
|
| 214 |
213 |
public boolean doEncodeMileStonesStep() {
|
| 215 |
214 |
filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
|
| 216 |
215 |
def milestonesString = module.getProject().getTextualPlan("MileStones")
|
| 217 |
216 |
if (milestonesString.length() == 0) return true;
|
| 218 |
|
|
|
217 |
|
| 219 |
218 |
def milestones = milestonesString.split(",")
|
| 220 |
219 |
//def milestones = ["lb", "pb", "cb"]
|
| 221 |
220 |
//println "milestones=$milestones type="+milestones.getClass()+" size="+milestones.size()
|
| 222 |
221 |
if (milestones.size() == 0) return true;
|
| 223 |
|
|
|
222 |
|
| 224 |
223 |
println "-- Encoding milestone $milestones into XML files..."
|
| 225 |
224 |
ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
|
| 226 |
225 |
for (def milestone : milestones) {
|
| 227 |
226 |
milestone = milestone.trim()// just in case
|
| 228 |
227 |
cpb.tick()
|
| 229 |
|
|
|
228 |
|
| 230 |
229 |
for (File inputFile : filesToProcess) {
|
| 231 |
230 |
File outputFile = File.createTempFile("tmp", inputFile.getName(),inputFile.getParentFile());
|
| 232 |
231 |
println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
|
| ... | ... | |
| 248 |
247 |
println ""
|
| 249 |
248 |
return true;
|
| 250 |
249 |
}
|
| 251 |
|
|
|
250 |
|
| 252 |
251 |
/**
|
| 253 |
252 |
* read from $inputDirectory and write the result in $bindir/txm
|
| 254 |
253 |
*
|
| 255 |
254 |
*/
|
| 256 |
255 |
public boolean doFrontXSLStep() {
|
| 257 |
|
|
|
256 |
|
| 258 |
257 |
//filesToProcess = inputDirectory.listFiles();
|
| 259 |
|
|
|
258 |
|
| 260 |
259 |
File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
|
| 261 |
260 |
println "-- Front XSL Step with the $frontXSLdirectory directory."
|
| 262 |
261 |
def xslFiles = frontXSLdirectory.listFiles()
|
| ... | ... | |
| 264 |
263 |
xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString();
|
| 265 |
264 |
|
| 266 |
265 |
if (frontXSLdirectory.exists() && xslFiles != null && xslFiles.size() > 0) {
|
| 267 |
|
|
|
266 |
|
| 268 |
267 |
xslFiles.sort()
|
| 269 |
268 |
for (File xslFile : xslFiles) {
|
| 270 |
269 |
if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
|
| 271 |
270 |
//if (!xslFile.getName().matches("[1-9]-.+")) continue;
|
| 272 |
|
|
|
271 |
|
| 273 |
272 |
if (ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
|
| 274 |
273 |
inputDirectory = outputDirectory; // the files to process are now in the "txm" directory
|
| 275 |
274 |
println ""
|
| ... | ... | |
| 283 |
282 |
}
|
| 284 |
283 |
return true;
|
| 285 |
284 |
}
|
| 286 |
|
|
|
285 |
|
| 287 |
286 |
/**
 * Selects the XML files of inputDirectory that pass ValidateXml.teiTest()
 * and stores them in filesToProcess.
 *
 * Candidate files are readable, non-hidden regular entries whose name ends
 * with ".xml" (case-insensitive) and does not start with "metadata" or "import".
 *
 * @return true if at least one file was kept; false otherwise, in which case
 *         'reason' is set
 */
public boolean doCheckXMLFileStep() {
	filesToProcess = []
	println "-- Checking XML-TEI files for well-formedness."

	// NOTE(review): the "import" prefix test also excludes names such as "important.xml",
	// not only "import.xml" — confirm this is intended.
	def files = inputDirectory.listFiles(new FileFilter() {
		public boolean accept(File f) {
			return (f.getName().toLowerCase().endsWith(".xml") && !f.isDirectory() && !f.isHidden()
				&& f.canRead()
				&& !f.getName().startsWith("metadata")
				&& !f.getName().startsWith("import"));
		}
	});

	// listFiles() returns null when inputDirectory cannot be listed
	if (files == null || files.size() == 0) {
		println "No XML file (*.xml) to process. Aborting"
		// record the failure cause like the other failure exits do
		reason = "No XML file (*.xml) to process in "+inputDirectory
		return false;
	}
	files = files.sort()

	ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
	for (File f : files) {
		cpb.tick()

		if (ValidateXml.teiTest(f)) {
			filesToProcess << f
		} else {
			println "Won't process XML file: "+f
		}
	}
	println ""

	if (filesToProcess.size() == 0) {
		reason = "No suitable file to process in "+inputDirectory
		return false;
	}
	return true;
}
|
| 322 |
|
|
| 323 |
|
|
|
323 |
|
| 324 |
324 |
public boolean doTokenizeStep() {
|
| 325 |
|
|
|
325 |
|
| 326 |
326 |
new File(module.getBinaryDirectory(),"tokenized").deleteDir()
|
| 327 |
327 |
new File(module.getBinaryDirectory(),"tokenized").mkdir()
|
| 328 |
|
|
|
328 |
|
| 329 |
329 |
String outSideTextTagsRegex = "";
|
| 330 |
330 |
String outSideTextTagsAndKeepContentRegex = "";
|
| 331 |
331 |
String noteRegex = "";
|
| 332 |
332 |
// get the element names to ignore
|
| 333 |
|
|
|
333 |
|
| 334 |
334 |
String e1 = module.getProject().getTextualPlan("OutSideTextTags")
|
| 335 |
335 |
def split = e1.split(",")
|
| 336 |
336 |
for (String s : split) {
|
| ... | ... | |
| 339 |
339 |
if (outSideTextTagsRegex.trim().length() > 0) {
|
| 340 |
340 |
outSideTextTagsRegex = outSideTextTagsRegex.substring(1) // remove the first "|"
|
| 341 |
341 |
}
|
| 342 |
|
|
|
342 |
|
| 343 |
343 |
String e2 = module.getProject().getTextualPlan("OutSideTextTagsAndKeepContent")
|
| 344 |
344 |
def split2 = e2.split(",")
|
| 345 |
345 |
for (String s : split2) {
|
| ... | ... | |
| 348 |
348 |
if (outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
|
| 349 |
349 |
outSideTextTagsAndKeepContentRegex = outSideTextTagsAndKeepContentRegex.substring(1) // remove the first "|"
|
| 350 |
350 |
}
|
| 351 |
|
|
|
351 |
|
| 352 |
352 |
String e3 = module.getProject().getTextualPlan("Note")
|
| 353 |
353 |
def split3 = e3.split(",")
|
| 354 |
354 |
for (String s : split3) {
|
| ... | ... | |
| 357 |
357 |
if (noteRegex.trim().length() > 0) {
|
| 358 |
358 |
noteRegex = noteRegex.substring(1) // remove the first "|"
|
| 359 |
359 |
}
|
| 360 |
|
|
|
360 |
|
| 361 |
361 |
//if (wordTag != "w") {
|
| 362 |
362 |
if (!doTokenizeStep) {
|
| 363 |
363 |
println "No tokenization do to."
|
| ... | ... | |
| 375 |
375 |
File infile = f;
|
| 376 |
376 |
File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
|
| 377 |
377 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
|
| 378 |
|
|
|
378 |
|
| 379 |
379 |
if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
|
| 380 |
380 |
tokenizer.setOutSideTextTags(outSideTextTagsRegex)
|
| 381 |
381 |
}
|
| 382 |
|
|
|
382 |
|
| 383 |
383 |
if (outSideTextTagsAndKeepContentRegex != null && outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
|
| 384 |
384 |
tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
|
| 385 |
385 |
}
|
| 386 |
|
|
|
386 |
|
| 387 |
387 |
if (noteRegex != null && noteRegex.trim().length() > 0) {
|
| 388 |
388 |
tokenizer.setNote(noteRegex)
|
| 389 |
389 |
}
|
| 390 |
|
|
|
390 |
|
| 391 |
391 |
// tokenize !
|
| 392 |
392 |
if (!tokenizer.process()) {
|
| 393 |
393 |
println("Failed to process "+f)
|
| ... | ... | |
| 395 |
395 |
}
|
| 396 |
396 |
}
|
| 397 |
397 |
}
|
| 398 |
|
|
|
398 |
|
| 399 |
399 |
filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
|
| 400 |
400 |
println ""
|
| 401 |
|
|
|
401 |
|
| 402 |
402 |
return true;
|
| 403 |
403 |
}
|
| 404 |
|
|
|
404 |
|
| 405 |
405 |
/**
 * Applies the stylesheets of the "xsl/3-posttok" source sub-directory to the
 * files listed from $bindir/tokenized.
 *
 * The transformation is delegated to ApplyXsl2.processWithMultipleXSL();
 * results are expected to be written back to $bindir/tokenized (not visible here).
 *
 * @return true when there is no stylesheet to apply, otherwise the value
 *         returned by ApplyXsl2.processWithMultipleXSL()
 */
public boolean doPostTokenizeXSLStep() {

	File tokenizedDir = new File(module.getBinaryDirectory(),"tokenized")
	filesToProcess = tokenizedDir.listFiles()

	File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
	def stylesheets = xslDirectory.listFiles()

	// nothing to do without at least one entry in the stylesheet directory
	boolean hasStylesheets = xslDirectory.exists() && stylesheets != null && stylesheets.size() > 0
	if (!hasStylesheets) {
		return true;
	}

	println "-- Posttokenize XSL Step with $xslDirectory"
	return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams)
}
|
| 419 |
|
|
|
419 |
|
| 420 |
420 |
public boolean doInjectMetadataStep() {
|
| 421 |
421 |
if (metadata != null) {
|
| 422 |
422 |
println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
|
| ... | ... | |
| 438 |
438 |
}
|
| 439 |
439 |
return true;
|
| 440 |
440 |
}
|
| 441 |
|
|
|
441 |
|
| 442 |
442 |
public boolean doToXMLTXMStep() {
|
| 443 |
443 |
filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
|
| 444 |
444 |
ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
|
| ... | ... | |
| 448 |
448 |
cpb.tick()
|
| 449 |
449 |
File file = f;
|
| 450 |
450 |
String txmfile = f.getName();
|
| 451 |
|
|
|
451 |
|
| 452 |
452 |
def correspType = new HashMap<String,String>()
|
| 453 |
453 |
def correspRef = new HashMap<String,String>()
|
| 454 |
454 |
//il faut lister les id de tous les respStmt
|
| ... | ... | |
| 479 |
479 |
cpb.done();
|
| 480 |
480 |
return true;
|
| 481 |
481 |
}
|
| 482 |
|
|
|
482 |
|
| 483 |
483 |
/**
 * Performs no verification at the moment: the only check is commented out,
 * so isSuccessFul is left untouched.
 */
@Override
public void checkFiles() {
	// isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
}
|
| 487 |
|
|
|
487 |
|
| 488 |
488 |
@Override
|
| 489 |
489 |
public void cancel() {
|
| 490 |
490 |
cancelNow = true;
|