Révision 2203
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/tmx/compiler.groovy (revision 2203) | ||
---|---|---|
2 | 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
3 | 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
4 | 4 |
// Sophia Antipolis, University of Paris 3. |
5 |
//
|
|
5 |
// |
|
6 | 6 |
// The TXM platform is free software: you can redistribute it |
7 | 7 |
// and/or modify it under the terms of the GNU General Public |
8 | 8 |
// License as published by the Free Software Foundation, |
9 | 9 |
// either version 2 of the License, or (at your option) any |
10 | 10 |
// later version. |
11 |
//
|
|
11 |
// |
|
12 | 12 |
// The TXM platform is distributed in the hope that it will be |
13 | 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
14 | 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
15 | 15 |
// PURPOSE. See the GNU General Public License for more |
16 | 16 |
// details. |
17 |
//
|
|
17 |
// |
|
18 | 18 |
// You should have received a copy of the GNU General |
19 | 19 |
// Public License along with the TXM platform. If not, see |
20 | 20 |
// http://www.gnu.org/licenses. |
... | ... | |
283 | 283 |
flagAna = true; |
284 | 284 |
anaType = parser.getAttributeValue(null, "type") |
285 | 285 |
if (anaType.length() > 0) anaType = anaType.substring(1); |
286 |
//println "anatype $anaType"
|
|
286 |
//println "anatype $anaType" |
|
287 | 287 |
anahash.put(anaType, ""); |
288 | 288 |
if (firstWord) { |
289 | 289 |
anaTypes << anaType; |
... | ... | |
310 | 310 |
case "w": |
311 | 311 |
firstWord = false; |
312 | 312 |
output.write( vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") |
313 |
+"\t"+wordid+"\t"+tuRef);
|
|
313 |
+"\t"+wordid+"\t"+tuRef);
|
|
314 | 314 |
for(String type : anaTypes) { |
315 | 315 |
output.write("\t"+anahash.get(type)); |
316 | 316 |
} |
... | ... | |
342 | 342 |
} |
343 | 343 |
output.close(); |
344 | 344 |
if (parser != null) parser.close(); |
345 |
if (inputData != null) inputData.close(); |
|
345 |
if (inputData != null) inputData.close();
|
|
346 | 346 |
} |
347 | 347 |
catch (Exception ex) { |
348 | 348 |
System.out.println("Exception while parsing " + inputData); |
... | ... | |
410 | 410 |
{ |
411 | 411 |
this.langGroups = groups; |
412 | 412 |
} |
413 |
|
|
413 |
|
|
414 | 414 |
public static HashMap<Integer, ArrayList<String>> corpusIDS; |
415 | 415 |
public void setCorpusIDS(HashMap<Integer, ArrayList<String>> corpusIDS) |
416 | 416 |
{ |
417 | 417 |
this.corpusIDS = corpusIDS; |
418 | 418 |
} |
419 |
|
|
419 |
|
|
420 | 420 |
/** |
421 | 421 |
* Run. |
422 | 422 |
* |
... | ... | |
430 | 430 |
sattrsListener = null; // reset SAttribute Listener for each new import |
431 | 431 |
String rootDir = binDir.getAbsolutePath(); |
432 | 432 |
seg_id=1; |
433 |
|
|
433 |
|
|
434 | 434 |
if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) { |
435 | 435 |
println ("Error: CWB executables not well set.") |
436 | 436 |
return false; |
437 | 437 |
} |
438 |
|
|
438 |
|
|
439 | 439 |
new File(binDir,"cqp").mkdirs() |
440 | 440 |
new File(binDir,"data").mkdirs() |
441 | 441 |
new File(binDir,"registry").mkdirs() |
... | ... | |
458 | 458 |
} |
459 | 459 |
} |
460 | 460 |
println "Using corpus ID: $corpusIDS" |
461 |
|
|
461 |
|
|
462 | 462 |
String cqpName; |
463 | 463 |
for (int group : langGroups.keySet()) { |
464 | 464 |
String lang = langs.get(langGroups.get(group)[0]); |
... | ... | |
479 | 479 |
for (int group : langGroups.keySet()) { |
480 | 480 |
//String lang = langs.get(langGroups.get(group)[0]); |
481 | 481 |
cqpName = basename+"_"+corpusIDS.get(group); |
482 |
|
|
483 |
CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class); |
|
484 |
if (corpus != null) { |
|
485 |
if (project.getDoUpdate()) { |
|
486 |
corpus.clean(); // remove old files |
|
482 |
|
|
483 |
CorpusBuild corpus = project.getCorpusBuild(cqpName, MainCorpus.class); |
|
484 |
if (corpus != null) { |
|
485 |
if (project.getDoUpdate()) { |
|
486 |
corpus.clean(); // remove old files |
|
487 |
} else { |
|
488 |
corpus.delete(); // remove old files and TXMResult children |
|
489 |
} |
|
487 | 490 |
} else { |
488 |
corpus.delete(); // remove old files and TXMResult children |
|
491 |
corpus = new MainCorpus(project); |
|
492 |
corpus.setID(cqpName); |
|
493 |
corpus.setName(cqpName); |
|
489 | 494 |
} |
490 |
} else { |
|
491 |
corpus = new MainCorpus(project); |
|
492 |
corpus.setID(project.getName()); |
|
493 |
corpus.setName(project.getName()); |
|
494 |
} |
|
495 | 495 |
corpus.setDescription("Built with the TMX import module"); |
496 |
|
|
496 |
|
|
497 | 497 |
File cqpFile = new File(binDir,"cqp/"+cqpName+".cqp"); |
498 |
|
|
498 |
|
|
499 | 499 |
def filenames = langGroups.get(group); |
500 | 500 |
filenames.sort() |
501 | 501 |
//println("Process group no $group of files "+filenames) |
... | ... | |
524 | 524 |
} |
525 | 525 |
} |
526 | 526 |
println "" |
527 |
|
|
527 |
|
|
528 | 528 |
for (int group : langGroups.keySet()) { |
529 | 529 |
//String lang = langs.get(langGroups.get(group)[0]); |
530 | 530 |
createOutput(rootDir+"/cqp", basename+"_"+corpusIDS.get(group)+".cqp"); |
... | ... | |
543 | 543 |
def cqpFiles = [:] |
544 | 544 |
for (File cqpfile : new File(rootDir, "cqp").listFiles()) { |
545 | 545 |
String corpusname = cqpfile.getName(); |
546 |
|
|
546 |
|
|
547 | 547 |
corpusname = corpusname.substring(0, corpusname.length()-4); |
548 | 548 |
def corpusAnaTypes = anaTypesPerCqp[corpusname]; |
549 |
|
|
549 |
|
|
550 | 550 |
CwbEncode cwbEn = new CwbEncode(); |
551 | 551 |
cwbEn.setDebug(debug); |
552 | 552 |
CwbMakeAll cwbMa = new CwbMakeAll(); |
... | ... | |
614 | 614 |
if (!corpusName.equals(targetName)) { |
615 | 615 |
try { |
616 | 616 |
PatchCwbRegistry.patchAlignment(new File(registryDirectory, corpusName.getName()), targetName.getName()); |
617 |
|
|
617 |
|
|
618 | 618 |
File cqpFile1 = new File(cqpFiles[corpusName.getName()]); |
619 | 619 |
File cqpFile2 = new File(cqpFiles[targetName.getName()]); |
620 | 620 |
File alignOutFile = new File(outDir, "align.out"); |
621 |
|
|
621 |
|
|
622 | 622 |
BuildAlignOut bao = new BuildAlignOut(cqpFile1, cqpFile2); |
623 | 623 |
if (!bao.process(alignOutFile, "seg", "id")) { |
624 | 624 |
println "Error while creating alignement file of $corpusName. Aborting." |
625 | 625 |
return false; |
626 | 626 |
} |
627 |
|
|
627 |
|
|
628 | 628 |
CwbAlign tt = new CwbAlign(); |
629 | 629 |
// ./cwb-align-encode -D -r ~/TXM/corpora/tmxtest/registry/ -v out.align |
630 | 630 |
tt.setD(); |
Formats disponibles : Unified diff