Révision 803
tmp/org.txm.groovy.core/META-INF/MANIFEST.MF (revision 803) | ||
---|---|---|
68 | 68 |
org.txm.importer.bfm, |
69 | 69 |
org.txm.importer.bvh, |
70 | 70 |
org.txm.importer.corptef, |
71 |
org.txm.importer.cqp, |
|
71 | 72 |
org.txm.importer.discours, |
72 | 73 |
org.txm.importer.doc, |
73 | 74 |
org.txm.importer.factiva, |
... | ... | |
83 | 84 |
org.txm.importer.tmx, |
84 | 85 |
org.txm.importer.transcriber, |
85 | 86 |
org.txm.importer.txt, |
86 |
org.txm.importer.wtc, |
|
87 | 87 |
org.txm.importer.xml, |
88 | 88 |
org.txm.importer.xmltxm, |
89 | 89 |
org.txm.importer.xmltxmpara, |
tmp/org.txm.groovy.core/src/java/org/txm/groovy/core/InstallGroovyFiles.java (revision 803) | ||
---|---|---|
40 | 40 |
publishedImports.put(IMPORTERPACKAGE+"transcriber","transcriberLoader.groovy"); |
41 | 41 |
publishedImports.put(IMPORTERPACKAGE+"tmx","tmxLoader.groovy"); |
42 | 42 |
publishedImports.put(IMPORTERPACKAGE+"doc","docLoader.groovy"); |
43 |
publishedImports.put(IMPORTERPACKAGE+"wtc","wtcLoader.groovy");
|
|
43 |
publishedImports.put(IMPORTERPACKAGE+"cqp","cqpLoader.groovy");
|
|
44 | 44 |
|
45 | 45 |
String bundle_id = "org.txm.groovy.core"; |
46 | 46 |
for (String p : publishedImports.keySet()) { |
tmp/org.txm.groovy.core/src/groovy/org/txm/macroproto/RunImportMacro.groovy (revision 803) | ||
---|---|---|
39 | 39 |
scripts["CNRCSV"] = "discoursLoader.groovy" |
40 | 40 |
scripts["HYPERBASE"] = "hyperbaseLoader.groovy" |
41 | 41 |
scripts["ALCESTE"] = "alcesteLoader.groovy" |
42 |
scripts["CWB"] = "wtcLoader.groovy"
|
|
42 |
scripts["CWB"] = "cqpLoader.groovy"
|
|
43 | 43 |
scripts["DOC"] = "docLoader.groovy" |
44 | 44 |
|
45 | 45 |
def scriptName = scripts.get(importName) |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/testTT.groovy (revision 803) | ||
---|---|---|
78 | 78 |
|
79 | 79 |
//Import to CWB |
80 | 80 |
BuildXmlRGAQCJ.process( anafiles, rootDir) |
81 |
println("build wtc RGAQCJ");
|
|
81 |
println("build cqp RGAQCJ");
|
|
82 | 82 |
println("encode + makeall"); |
83 | 83 |
|
84 | 84 |
//create TT TRAIN files |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/tmx/compiler.groovy (revision 803) | ||
---|---|---|
159 | 159 |
private boolean createOutput(String dirPathName, String fileName){ |
160 | 160 |
try { |
161 | 161 |
File f = new File(dirPathName, fileName) |
162 |
//We don't want to concatenate the wtc files
|
|
162 |
//We don't want to concatenate the cqp files
|
|
163 | 163 |
output = new OutputStreamWriter(new FileOutputStream(f, f.exists()) , "UTF-8"); |
164 | 164 |
return true; |
165 | 165 |
} catch (Exception e) { |
... | ... | |
183 | 183 |
} |
184 | 184 |
|
185 | 185 |
/** |
186 |
* Transfom file wtc.
|
|
186 |
* Transfom file cqp.
|
|
187 | 187 |
* |
188 | 188 |
* @param dirPathName the dir path name |
189 | 189 |
* @param fileName the file name |
... | ... | |
439 | 439 |
return false; |
440 | 440 |
} |
441 | 441 |
|
442 |
new File(binDir,"/wtc/").deleteDir();
|
|
443 |
new File(binDir,"/wtc/").mkdir();
|
|
442 |
new File(binDir,"/cqp/").deleteDir();
|
|
443 |
new File(binDir,"/cqp/").mkdir();
|
|
444 | 444 |
new File(binDir,"/data/").deleteDir(); |
445 | 445 |
new File(binDir,"/data/").mkdir(); |
446 | 446 |
new File(binDir,"registry/").mkdir(); |
... | ... | |
464 | 464 |
} |
465 | 465 |
println "Using corpus ID: $corpusIDS" |
466 | 466 |
|
467 |
String wtcName;
|
|
467 |
String cqpName;
|
|
468 | 468 |
for (int group : langGroups.keySet()) { |
469 | 469 |
String lang = langs.get(langGroups.get(group)[0]); |
470 | 470 |
segs_id.put(group, 0); |
471 |
wtcName = basename+"_"+corpusIDS.get(group);
|
|
472 |
createOutput(rootDir+"/wtc", "${wtcName}.wtc");
|
|
473 |
output.write("<txmcorpus id=\"${wtcName}\" lang=\"$lang\">\n")
|
|
471 |
cqpName = basename+"_"+corpusIDS.get(group);
|
|
472 |
createOutput(rootDir+"/cqp", "${cqpName}.cqp");
|
|
473 |
output.write("<txmcorpus id=\"${cqpName}\" lang=\"$lang\">\n")
|
|
474 | 474 |
output.close(); |
475 | 475 |
|
476 | 476 |
//create txmDirs |
477 |
new File(rootDir, "txm/"+wtcName.toUpperCase()).mkdir();
|
|
477 |
new File(rootDir, "txm/"+cqpName.toUpperCase()).mkdir();
|
|
478 | 478 |
} |
479 | 479 |
|
480 | 480 |
//1- Transform into WTC file and build a corpus per file |
... | ... | |
483 | 483 |
def builder = null; |
484 | 484 |
for (int group : langGroups.keySet()) { |
485 | 485 |
//String lang = langs.get(langGroups.get(group)[0]); |
486 |
wtcName = basename+"_"+corpusIDS.get(group);
|
|
486 |
cqpName = basename+"_"+corpusIDS.get(group);
|
|
487 | 487 |
def filenames = langGroups.get(group); |
488 | 488 |
filenames.sort() |
489 | 489 |
//println("Process group no $group of files "+filenames) |
... | ... | |
496 | 496 |
} |
497 | 497 |
|
498 | 498 |
filename = filename.substring(0, filename.length()-4); |
499 |
String corpusname = (wtcName).toLowerCase();
|
|
499 |
String corpusname = (cqpName).toLowerCase();
|
|
500 | 500 |
counttext++; |
501 | 501 |
|
502 | 502 |
String txtname = f.getName(); |
503 | 503 |
txtname = txtname.substring(0, txtname.lastIndexOf("_")); |
504 | 504 |
seg_id = segs_id.get(group); |
505 | 505 |
builder = new compiler(f.toURI().toURL(), txtname, basename, "default", tuprops); |
506 |
builder.transfomFileWtc(rootDir+"/wtc",wtcName+".wtc");
|
|
507 |
anaTypesPerWtc[wtcName] = builder.getAnaTypes();
|
|
506 |
builder.transfomFileWtc(rootDir+"/cqp",cqpName+".cqp");
|
|
507 |
anaTypesPerWtc[cqpName] = builder.getAnaTypes();
|
|
508 | 508 |
segs_id.put(group, seg_id); |
509 | 509 |
|
510 | 510 |
// move xml-txm file |
511 |
File txmCorpusDir = new File(rootDir, "txm/"+wtcName.toUpperCase())
|
|
511 |
File txmCorpusDir = new File(rootDir, "txm/"+cqpName.toUpperCase())
|
|
512 | 512 |
f.renameTo(new File(txmCorpusDir, f.getName())); |
513 | 513 |
} |
514 | 514 |
} |
... | ... | |
516 | 516 |
|
517 | 517 |
for (int group : langGroups.keySet()) { |
518 | 518 |
//String lang = langs.get(langGroups.get(group)[0]); |
519 |
createOutput(rootDir+"/wtc", basename+"_"+corpusIDS.get(group)+".wtc");
|
|
519 |
createOutput(rootDir+"/cqp", basename+"_"+corpusIDS.get(group)+".cqp");
|
|
520 | 520 |
output.write("</txmcorpus>") |
521 | 521 |
output.close(); |
522 | 522 |
} |
... | ... | |
529 | 529 |
def outDir = rootDir; |
530 | 530 |
def outDirTxm = rootDir; |
531 | 531 |
|
532 |
def wtcFiles = [:]
|
|
533 |
for (File wtcfile : new File(rootDir, "wtc").listFiles()) {
|
|
534 |
String corpusname = wtcfile.getName();
|
|
532 |
def cqpFiles = [:]
|
|
533 |
for (File cqpfile : new File(rootDir, "cqp").listFiles()) {
|
|
534 |
String corpusname = cqpfile.getName();
|
|
535 | 535 |
|
536 | 536 |
corpusname = corpusname.substring(0, corpusname.length()-4); |
537 | 537 |
def corpusAnaTypes = anaTypesPerWtc[corpusname]; |
... | ... | |
583 | 583 |
println "S-attributes: "+sargs |
584 | 584 |
|
585 | 585 |
try { |
586 |
wtcFiles[corpusname.toLowerCase()] = outDir + "/wtc/"+wtcfile.getName();
|
|
586 |
cqpFiles[corpusname.toLowerCase()] = outDir + "/cqp/"+cqpfile.getName();
|
|
587 | 587 |
String regPath = outDirTxm + "/registry/"+corpusname.toLowerCase() |
588 |
cwbEn.run(outDirTxm + "/data/"+corpusname.toUpperCase()+"/", outDir + "/wtc/"+wtcfile.getName(), regPath, pAttributes, sAttributes);
|
|
588 |
cwbEn.run(outDirTxm + "/data/"+corpusname.toUpperCase()+"/", outDir + "/cqp/"+cqpfile.getName(), regPath, pAttributes, sAttributes);
|
|
589 | 589 |
if (!new File(regPath).exists()) { |
590 | 590 |
println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq" |
591 | 591 |
return false; |
... | ... | |
604 | 604 |
try { |
605 | 605 |
PatchCwbRegistry.patchAlignment(new File(registryDirectory, corpusName.getName()), targetName.getName()); |
606 | 606 |
|
607 |
File wtcFile1 = new File(wtcFiles[corpusName.getName()]);
|
|
608 |
File wtcFile2 = new File(wtcFiles[targetName.getName()]);
|
|
607 |
File cqpFile1 = new File(cqpFiles[corpusName.getName()]);
|
|
608 |
File cqpFile2 = new File(cqpFiles[targetName.getName()]);
|
|
609 | 609 |
File alignOutFile = new File(outDir, "align.out"); |
610 | 610 |
|
611 |
BuildAlignOut bao = new BuildAlignOut(wtcFile1, wtcFile2);
|
|
611 |
BuildAlignOut bao = new BuildAlignOut(cqpFile1, cqpFile2);
|
|
612 | 612 |
if (!bao.process(alignOutFile, "seg", "id")) { |
613 | 613 |
println "Error while creating alignement file of $corpusName. Aborting." |
614 | 614 |
return false; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/bvh/compiler.groovy (revision 803) | ||
---|---|---|
162 | 162 |
} |
163 | 163 |
|
164 | 164 |
/** |
165 |
* Transfom file wtc.
|
|
165 |
* Transfom file cqp.
|
|
166 | 166 |
* |
167 | 167 |
* @param dirPathName the dir path name |
168 | 168 |
* @param fileName the file name |
... | ... | |
545 | 545 |
println ("binary directory does not exists: "+rootDir) |
546 | 546 |
return false; |
547 | 547 |
} |
548 |
new File(rootDir+"/wtc/","bvh.wtc").delete();//cleaning&preparing
|
|
549 |
new File(rootDir,"/wtc/").deleteDir();
|
|
550 |
new File(rootDir,"/wtc/").mkdir();
|
|
548 |
new File(rootDir+"/cqp/","bvh.cqp").delete();//cleaning&preparing
|
|
549 |
new File(rootDir,"/cqp/").deleteDir();
|
|
550 |
new File(rootDir,"/cqp/").mkdir();
|
|
551 | 551 |
new File(rootDir,"/data/").deleteDir(); |
552 | 552 |
new File(rootDir,"/data/").mkdir(); |
553 | 553 |
new File(rootDir,"registry/").mkdir(); |
... | ... | |
570 | 570 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
571 | 571 |
def builder = new compiler(f.toURL(),txtname, "bvh", "default"); |
572 | 572 |
builder.setLang lang |
573 |
builder.transfomFileWtc(rootDir+"/wtc","bvh.wtc");
|
|
573 |
builder.transfomFileWtc(rootDir+"/cqp","bvh.cqp");
|
|
574 | 574 |
} |
575 | 575 |
} |
576 | 576 |
|
... | ... | |
586 | 586 |
String[] sAttributes = ["txmcorpus:0+lang", "text:2+id+type+base+project","div:2+id+type","p:0+id","q:0+id","said:0+id","foreign:0+id","lg:0+id","l:0+id","sp:0+id","speaker:0+id","stage:0+id"]; |
587 | 587 |
try { |
588 | 588 |
String regPath = outDirTxm + "/registry/"+"bvh" |
589 |
cwbEn.run(outDirTxm + "/data", outDir + "/wtc/"+"bvh.wtc", regPath, pAttributes, sAttributes);
|
|
589 |
cwbEn.run(outDirTxm + "/data", outDir + "/cqp/"+"bvh.cqp", regPath, pAttributes, sAttributes);
|
|
590 | 590 |
if (!new File(regPath).exists()) { |
591 | 591 |
println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq" |
592 | 592 |
return false; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/discours/compiler.groovy (revision 803) | ||
---|---|---|
180 | 180 |
println ("Error: CWB executables not well set.") |
181 | 181 |
return false; |
182 | 182 |
} |
183 |
File wtcFile = new File(binDir,"wtc/${corpusname}.wtc");
|
|
184 |
new File(binDir, "wtc.").deleteDir();
|
|
185 |
new File(binDir, "wtc").mkdirs();
|
|
183 |
File cqpFile = new File(binDir,"cqp/${corpusname}.cqp");
|
|
184 |
new File(binDir, "cqp.").deleteDir();
|
|
185 |
new File(binDir, "cqp").mkdirs();
|
|
186 | 186 |
new File(binDir, "data/$corpusname").deleteDir(); |
187 | 187 |
new File(binDir, "data/$corpusname").mkdir(); |
188 | 188 |
new File(binDir, "registry").deleteDir(); |
189 | 189 |
new File(binDir, "registry").mkdir(); |
190 | 190 |
|
191 | 191 |
//start corpus |
192 |
if (createOutput(wtcFile)) {
|
|
192 |
if (createOutput(cqpFile)) {
|
|
193 | 193 |
output.write("<txmcorpus lang=\""+lang+"\">\n"); |
194 | 194 |
output.close(); |
195 | 195 |
} |
... | ... | |
198 | 198 |
//1- Transform into WTC file |
199 | 199 |
|
200 | 200 |
Collections.sort(files); |
201 |
XMLTXM2WTC wtcbuilder = null;
|
|
201 |
XMLTXM2WTC cqpbuilder = null;
|
|
202 | 202 |
for (File f : files) { |
203 | 203 |
if (second) { print(", ") } |
204 | 204 |
if (second > 0 && (second % 5) == 0) println "" |
205 | 205 |
print(f.getName().replaceFirst("\\.xml", "")); |
206 | 206 |
second++ |
207 | 207 |
|
208 |
wtcbuilder = new XMLTXM2WTC(f.toURI().toURL());
|
|
208 |
cqpbuilder = new XMLTXM2WTC(f.toURI().toURL());
|
|
209 | 209 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
210 |
wtcbuilder.setTextInfo(txtname, corpusname, "project");
|
|
210 |
cqpbuilder.setTextInfo(txtname, corpusname, "project");
|
|
211 | 211 |
|
212 |
wtcbuilder.setBalisesToKeep(["text","p","s"]);
|
|
213 |
wtcbuilder.setSendToPAttributes(["s":["id"], "p":["id"]]);
|
|
214 |
wtcbuilder.setLang(lang);
|
|
215 |
if (!wtcbuilder.transformFile(wtcFile)) {
|
|
212 |
cqpbuilder.setBalisesToKeep(["text","p","s"]);
|
|
213 |
cqpbuilder.setSendToPAttributes(["s":["id"], "p":["id"]]);
|
|
214 |
cqpbuilder.setLang(lang);
|
|
215 |
if (!cqpbuilder.transformFile(cqpFile)) {
|
|
216 | 216 |
println("Failed to compile "+f) |
217 | 217 |
} |
218 | 218 |
|
219 | 219 |
} |
220 | 220 |
|
221 | 221 |
//end corpus |
222 |
if (createOutput(wtcFile)) {
|
|
222 |
if (createOutput(cqpFile)) {
|
|
223 | 223 |
output.write("</txmcorpus>\n"); |
224 | 224 |
output.close(); |
225 | 225 |
} |
226 | 226 |
|
227 |
if (wtcbuilder == null) {
|
|
227 |
if (cqpbuilder == null) {
|
|
228 | 228 |
println "there was no files to process: "+files |
229 | 229 |
return false; |
230 | 230 |
} |
... | ... | |
235 | 235 |
CwbEncode cwbEn = new CwbEncode(); |
236 | 236 |
CwbMakeAll cwbMa = new CwbMakeAll(); |
237 | 237 |
|
238 |
List<String> pAttributesList = wtcbuilder.getpAttributs();
|
|
239 |
List<String> sAttributesList = wtcbuilder.getsAttributs();
|
|
238 |
List<String> pAttributesList = cqpbuilder.getpAttributs();
|
|
239 |
List<String> sAttributesList = cqpbuilder.getsAttributs();
|
|
240 | 240 |
println "pAttrs : "+pAttributesList |
241 | 241 |
println "sAttrs : "+sAttributesList |
242 | 242 |
String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()]) |
... | ... | |
248 | 248 |
String regPath = outDir + "registry/"+corpusname.toLowerCase() |
249 | 249 |
cwbEn.run( |
250 | 250 |
outDir + "data/$corpusname", |
251 |
outDir + "/wtc/"+corpusname+".wtc",
|
|
251 |
outDir + "/cqp/"+corpusname+".cqp",
|
|
252 | 252 |
regPath, |
253 | 253 |
pAttributes, |
254 | 254 |
sAttributes); |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/RGAQCJ/compiler.groovy (revision 803) | ||
---|---|---|
161 | 161 |
} |
162 | 162 |
|
163 | 163 |
/** |
164 |
* Transfom file wtc.
|
|
164 |
* Transfom file cqp.
|
|
165 | 165 |
* |
166 | 166 |
* @param dirPathName the dir path name |
167 | 167 |
* @param fileName the file name |
... | ... | |
516 | 516 |
println ("binary directory does not exists: "+rootDir) |
517 | 517 |
return false; |
518 | 518 |
} |
519 |
new File(rootDir+"/wtc/","bfm3tt.wtc").delete();//cleaning&preparing
|
|
520 |
new File(rootDir,"/wtc/").deleteDir();
|
|
521 |
new File(rootDir,"/wtc/").mkdir();
|
|
519 |
new File(rootDir+"/cqp/","bfm3tt.cqp").delete();//cleaning&preparing
|
|
520 |
new File(rootDir,"/cqp/").deleteDir();
|
|
521 |
new File(rootDir,"/cqp/").mkdir();
|
|
522 | 522 |
new File(rootDir,"/data/").deleteDir(); |
523 | 523 |
new File(rootDir,"/data/").mkdir(); |
524 | 524 |
new File(rootDir,"registry/").mkdir(); |
... | ... | |
539 | 539 |
println("process file "+f) |
540 | 540 |
def builder = new compiler(f.toURL(),f.getName(),"bfm3tt","default"); |
541 | 541 |
builder.setLang(lang); |
542 |
builder.transfomFileWtc(rootDir+"/wtc","bfm3tt.wtc");
|
|
542 |
builder.transfomFileWtc(rootDir+"/cqp","bfm3tt.cqp");
|
|
543 | 543 |
} |
544 | 544 |
} |
545 | 545 |
|
... | ... | |
554 | 554 |
String[] sAttributes = ["txmcorpus:0+lang","text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project","front:0+n","body:0+n","ab:0+n+rend","div:0+id+type","q:1+n","p:0+n","back:0+n"]; |
555 | 555 |
try |
556 | 556 |
{ |
557 |
cwbEn.run(outDirTxm + "/data", outDir + "/wtc/"+"bfm3tt.wtc", outDirTxm + "/registry/"+"bfm3tt",pAttributes, sAttributes);
|
|
557 |
cwbEn.run(outDirTxm + "/data", outDir + "/cqp/"+"bfm3tt.cqp", outDirTxm + "/registry/"+"bfm3tt",pAttributes, sAttributes);
|
|
558 | 558 |
cwbMa.run("BFM3TT", outDirTxm + "/registry"); |
559 | 559 |
} catch (Exception ex) {System.out.println(ex); return false;} |
560 | 560 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/RGAQCJ/importRGAQCJ.groovy (revision 803) | ||
---|---|---|
169 | 169 |
} |
170 | 170 |
|
171 | 171 |
/** |
172 |
* Transfom file wtc.
|
|
172 |
* Transfom file cqp.
|
|
173 | 173 |
* |
174 | 174 |
* @param dirPathName the dir path name |
175 | 175 |
* @param fileName the file name |
... | ... | |
345 | 345 |
*/ |
346 | 346 |
public static void process(List<String> files, String rootDir) |
347 | 347 |
{ |
348 |
new File(rootDir+"wtc/").mkdir();
|
|
348 |
new File(rootDir+"cqp/").mkdir();
|
|
349 | 349 |
new File(rootDir+"registry/").mkdir(); |
350 | 350 |
//String[] files = ["roland-ana.xml","qgraal_cm-ana.xml","artu-ana.xml","qjm-ana.xml","commyn1-ana.xml","jehpar-ana.xml"]; |
351 | 351 |
//String rootDir = "~/xml/rgaqcj/"; |
... | ... | |
353 | 353 |
println(types); |
354 | 354 |
|
355 | 355 |
//1- Transform into WTC file |
356 |
File f = new File(rootDir+"wtc/","RGAQCJ.wtc");
|
|
356 |
File f = new File(rootDir+"cqp/","RGAQCJ.cqp");
|
|
357 | 357 |
f.delete(); |
358 | 358 |
|
359 | 359 |
def output = new OutputStreamWriter(new FileOutputStream(f) , "UTF-8"); |
... | ... | |
367 | 367 |
println("process file "+files[i]) |
368 | 368 |
String file = files[i]; |
369 | 369 |
def builder = new BuildXmlRGAQCJ(new File(rootDir+"/anainline/",file).toURL(),""+initiales.charAt(i),types); |
370 |
builder.transfomFileWtc(rootDir+"wtc","RGAQCJ.wtc");
|
|
370 |
builder.transfomFileWtc(rootDir+"cqp","RGAQCJ.cqp");
|
|
371 | 371 |
} |
372 | 372 |
|
373 | 373 |
output = new OutputStreamWriter(new FileOutputStream(f,true) , "UTF-8"); |
... | ... | |
391 | 391 |
{ |
392 | 392 |
if (System.getProperty("os.name").contains("Windows")) |
393 | 393 |
{ |
394 |
cwbEn.run(outDirTxm + "data/"+"RGAQCJ", outDir +"wtc/"+"RGAQCJ"+".wtc", outDirTxm + "registry/"+"rgaqcj",pAttributes, sAttributes);
|
|
394 |
cwbEn.run(outDirTxm + "data/"+"RGAQCJ", outDir +"cqp/"+"RGAQCJ"+".cqp", outDirTxm + "registry/"+"rgaqcj",pAttributes, sAttributes);
|
|
395 | 395 |
cwbMa.run("RGAQCJ", outDirTxm + "registry"); |
396 | 396 |
} |
397 | 397 |
else |
398 | 398 |
{ |
399 |
cwbEn.run(outDirTxm + "data/"+"RGAQCJ", outDir + "/wtc/"+"RGAQCJ.wtc", outDirTxm + "registry/"+"rgaqcj",pAttributes, sAttributes);
|
|
399 |
cwbEn.run(outDirTxm + "data/"+"RGAQCJ", outDir + "/cqp/"+"RGAQCJ.cqp", outDirTxm + "registry/"+"rgaqcj",pAttributes, sAttributes);
|
|
400 | 400 |
cwbMa.run("RGAQCJ", outDirTxm + "registry"); |
401 | 401 |
|
402 | 402 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/ImportXmlTag.groovy (revision 803) | ||
---|---|---|
179 | 179 |
public static void main(String[] args) { |
180 | 180 |
|
181 | 181 |
String rootDir = "~/xml/rgaqcj/"; |
182 |
new File(rootDir+"/wtc/").mkdir();
|
|
182 |
new File(rootDir+"/cqp/").mkdir();
|
|
183 | 183 |
new File(rootDir,"/registry/").mkdir(); |
184 | 184 |
|
185 | 185 |
File srcfile = new File(rootDir+"/src/","roland.xml"); |
186 |
File resultfile = new File(rootDir+"/wtc/","roland-xmltag.wtc");
|
|
186 |
File resultfile = new File(rootDir+"/cqp/","roland-xmltag.cqp");
|
|
187 | 187 |
println("importXmlTag : "+srcfile+" to : "+resultfile ); |
188 | 188 |
|
189 | 189 |
def builder = new ImportXmlTag(srcfile.toURL()); |
190 | 190 |
builder.process(resultfile); |
191 | 191 |
|
192 | 192 |
def inDir = rootDir; |
193 |
def outDir =rootDir+"wtc/";
|
|
193 |
def outDir =rootDir+"cqp/";
|
|
194 | 194 |
def outDirTxm = rootDir; |
195 | 195 |
CwbEncode cwbEn = new CwbEncode(); |
196 | 196 |
CwbMakeAll cwbMa = new CwbMakeAll(); |
... | ... | |
199 | 199 |
String[] sAttributes = ["children:"+builder.maxprof+"+n"]; |
200 | 200 |
|
201 | 201 |
try { |
202 |
cwbEn.run(outDirTxm + "data/"+"ROLANDXML", outDir +"roland-xmltag.wtc", outDirTxm + "registry/"+"rolandxml",pAttributes, sAttributes);
|
|
202 |
cwbEn.run(outDirTxm + "data/"+"ROLANDXML", outDir +"roland-xmltag.cqp", outDirTxm + "registry/"+"rolandxml",pAttributes, sAttributes);
|
|
203 | 203 |
cwbMa.run("ROLANDXML", outDirTxm + "registry"); |
204 | 204 |
} catch (Exception ex) {System.out.println(ex);} |
205 | 205 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/cqp/compiler.groovy (revision 803) | ||
---|---|---|
1 |
|
|
2 |
|
|
3 |
// Copyright © 2010-2013 ENS de Lyon. |
|
4 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
5 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
6 |
// Sophia Antipolis, University of Paris 3. |
|
7 |
// |
|
8 |
// The TXM platform is free software: you can redistribute it |
|
9 |
// and/or modify it under the terms of the GNU General Public |
|
10 |
// License as published by the Free Software Foundation, |
|
11 |
// either version 2 of the License, or (at your option) any |
|
12 |
// later version. |
|
13 |
// |
|
14 |
// The TXM platform is distributed in the hope that it will be |
|
15 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
16 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
17 |
// PURPOSE. See the GNU General Public License for more |
|
18 |
// details. |
|
19 |
// |
|
20 |
// You should have received a copy of the GNU General |
|
21 |
// Public License along with the TXM platform. If not, see |
|
22 |
// http://www.gnu.org/licenses. |
|
23 |
// |
|
24 |
// |
|
25 |
// |
|
26 |
// $LastChangedDate: 2012-01-05 14:27:34 +0100 (jeu., 05 janv. 2012) $ |
|
27 |
// $LastChangedRevision: 2096 $ |
|
28 |
// $LastChangedBy: mdecorde $ |
|
29 |
// |
|
30 |
package org.txm.importer.cqp |
|
31 |
|
|
32 |
import org.txm.Toolbox; |
|
33 |
import org.txm.importer.cwb.* |
|
34 |
import org.txm.importer.*; |
|
35 |
import org.txm.scripts.*; |
|
36 |
import org.txm.scripts.teitxm.*; |
|
37 |
import org.txm.utils.io.FileCopy; |
|
38 |
import org.txm.utils.treetagger.TreeTagger; |
|
39 |
import java.util.ArrayList; |
|
40 |
import javax.xml.stream.*; |
|
41 |
import java.net.URL; |
|
42 |
import java.io.File; |
|
43 |
import java.util.HashMap; |
|
44 |
import java.util.List; |
|
45 |
|
|
46 |
// TODO: Auto-generated Javadoc |
|
47 |
/**
 * Compiles an already tokenized CQP/WTC source file into CWB indexes.
 *
 * The source file is copied into the "cqp" directory of the binary corpus
 * directory, the positional and structural attribute declarations are read
 * from the source registry file when one is given (or derived from the
 * source file itself otherwise), then cwb-encode and cwb-makeall are run.
 */
class compiler
{
	/** When true, the CWB wrappers are run in debug mode. */
	boolean debug = false;

	/** The dir. */
	private def dir;

	/** The source CQP/WTC file and the optional source registry file (may be null). */
	File srcWTCFile, srcRegistryFile;

	/** Positional attribute names passed to cwb-encode; filled by run(). */
	public def pAttributesList = [];

	/** Structural attribute declarations passed to cwb-encode; filled by run(). */
	public def sAttributesList = [];

	/**
	 * @param cqpFile the source CQP/WTC file to index
	 * @param registryFile the source registry file, or null to derive the
	 *        attribute declarations from the CQP/WTC file
	 */
	public compiler(File cqpFile, File registryFile) {
		this.srcWTCFile = cqpFile;
		this.srcRegistryFile = registryFile
	}

	/**
	 * Turns the debug mode on.
	 *
	 * @return the new value of the debug flag (true)
	 */
	public setDebug()
	{
		debug = true;
	}

	/**
	 * Builds the CWB indexes of the corpus.
	 *
	 * @param binDir the binary corpus directory; its "cqp" and "data"
	 *        sub-directories are deleted and re-created
	 * @param corpusname the corpus name, used for the copied file name, the
	 *        data directory and (lower-cased) the registry file name
	 * @return true, if successful
	 */
	public boolean run(File binDir, String corpusname)
	{
		if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
			println ("Error: CWB executables not well set.")
			return false;
		}
		if (!binDir.exists()) {
			println ("binary directory does not exists: "+binDir)
			return false;
		}

		if (!srcWTCFile.exists()) {
			println "Error: cannot find the WTC file $srcWTCFile"
			return false;
		}

		File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
		new File(binDir, "cqp").deleteDir();
		new File(binDir, "cqp").mkdir();
		new File(binDir, "data").deleteDir();
		new File(binDir, "data").mkdir();
		new File(binDir, "registry").mkdir();

		// work on a private copy: the source file is never modified
		FileCopy.copy(srcWTCFile, cqpFile);

		//2- Import into CWB
		def outDir = binDir.getAbsolutePath()+"/";

		CwbEncode cwbEn = new CwbEncode();
		CwbMakeAll cwbMa = new CwbMakeAll();
		cwbEn.setDebug(debug);
		cwbMa.setDebug(debug);

		String[] pAttributes;
		String[] sAttributes;

		if (srcRegistryFile == null) {
			println "WARNING: No registry file found in source directory"
			println "We'll search for positional attributes and structural attributes in the WTC file"

			// s attributes
			BuildCwbEncodeArgsFromWTC argsgetter = new BuildCwbEncodeArgsFromWTC(); // XML stream
			argsgetter.process(cqpFile); //$NON-NLS-1$
			sAttributesList = argsgetter.getSAttributes();

			// p attributes: count the columns of the first token line and
			// append a generated "w_N" identifier column to every token line
			int nbAttr = -1;
			File tmp = File.createTempFile("txm", ".cqp", cqpFile.getParentFile());
			int wcounter = 1;
			println "Adding the 'id' property to the WTC file and getting word properties number."
			tmp.withWriter("UTF-8") { writer ->
				cqpFile.eachLine("UTF-8") { line ->
					if (!line.startsWith("<")) {
						if (nbAttr == -1) nbAttr = line.split("\t").size();
						writer.println(line+"\tw_"+(wcounter++))
					} else {
						// structure line: copied unchanged
						writer.println(line)
					}
				}
				// no per-line flush: withWriter flushes and closes the writer
			}

			// replace the copy with the id-augmented file; stop if the swap
			// fails, otherwise cwb-encode would run on a missing or stale file
			if (!cqpFile.delete() || !tmp.renameTo(cqpFile)) {
				println "Error: could not replace $cqpFile with the file containing the 'id' property: $tmp"
				return false;
			}

			// no token line at all: only the word form column is assumed, so
			// that the report below does not show a negative count
			if (nbAttr < 1) nbAttr = 1;

			System.out.println("Found "+(nbAttr-1)+" word properties, $nbAttr with the 'id'");
			// the first column is the word form; the others get generic names
			for (int i = 1; i < nbAttr ; i++) {
				pAttributesList << "p$i";
			}
			pAttributesList << "id"
		} else {
			ReadRegistryFile reader = new ReadRegistryFile(srcRegistryFile);
			pAttributesList = reader.getPAttributes();
			sAttributesList = reader.getSAttributes();
			// presumably the first declared attribute is always "word" - TODO confirm
			if (!pAttributesList.isEmpty()) {
				pAttributesList.remove(0) // remove word
			}

			if (!pAttributesList.contains("id")) {
				System.out.println("Error: The registry file does not declare the 'id' word property");
				return false;
			}
		}

		pAttributes = pAttributesList; // cast to array
		sAttributes = sAttributesList; // cast to array

		println "pAttrs : "+Arrays.toString(pAttributes)
		println "sAttrs : "+Arrays.toString(sAttributes)

		try {
			String regPath =outDir + "/registry/"+corpusname.toLowerCase()
			cwbEn.run(outDir + "/data/$corpusname",
					cqpFile.getAbsolutePath(),
					regPath, pAttributes, sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run(corpusname, outDir + "/registry");
			return true;
		} catch (Exception ex) {System.out.println(ex); return false;}
	}

	/**
	 * Command line entry point.
	 *
	 * @param args the CQP/WTC file, the binary directory, the corpus name
	 *        and, optionally, the registry file
	 */
	public static void main(String[] args)
	{
		if (args.length < 3) {
			println "Usage: compiler <cqp file> <binary directory> <corpus name> [registry file]"
			return;
		}
		File registryFile = (args.length > 3) ? new File(args[3]) : null;
		def c = new compiler(new File(args[0]), registryFile);
		if (!c.run(new File(args[1]), args[2])) {
			println "Import stopped"
		}
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/cqp/cqpLoader.groovy (revision 803) | ||
---|---|---|
1 |
package org.txm.importer.cqp |
|
2 |
|
|
3 |
import org.txm.importer.cwb.BuildCwbEncodeArgs; |
|
4 |
import java.io.File; |
|
5 |
import java.util.ArrayList; |
|
6 |
|
|
7 |
import org.txm.objects.*; |
|
8 |
import org.txm.scripts.teitxm.*; |
|
9 |
import org.txm.*; |
|
10 |
import org.txm.utils.i18n.*; |
|
11 |
import org.txm.metadatas.*; |
|
12 |
import org.txm.utils.io.FileCopy; |
|
13 |
import org.w3c.dom.Element |
|
14 |
import org.txm.utils.xml.DomUtils; |
|
15 |
|
|
16 |
String userDir = System.getProperty("user.home"); |
|
17 |
|
|
18 |
def MONITOR; |
|
19 |
boolean debug = org.txm.utils.logger.Log.isPrintingErrors(); |
|
20 |
BaseParameters params; |
|
21 |
try {params = paramsBinding;MONITOR=monitor} catch (Exception) |
|
22 |
{ println "DEV MODE";//exception means we debug |
|
23 |
debug = true |
|
24 |
params = new BaseParameters(new File(userDir, "xml/cqp/import.xml")) |
|
25 |
params.load() |
|
26 |
if (!org.txm.Toolbox.isInitialized()) { |
|
27 |
Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM")); |
|
28 |
//Toolbox.setParam(Toolbox.INSTALL_DIR,new File("C:\\Program Files\\TXM"));//For Windows |
|
29 |
Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger")); |
|
30 |
//Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File("C:\\Program Files\\treetagger"));//for Windows |
|
31 |
Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models")); |
|
32 |
Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8"); |
|
33 |
Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ","); |
|
34 |
Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\""); |
|
35 |
//Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File("C:\\Program Files\\treetagger\\models"));//for Windows |
|
36 |
Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM")); |
|
37 |
} |
|
38 |
} |
|
39 |
if (params == null) { println "no parameters. Aborting"; return; } |
|
40 |
|
|
41 |
String corpusname = params.getCorpusName(); |
|
42 |
Element corpusElem = params.corpora.get(corpusname); |
|
43 |
String basename = params.name; |
|
44 |
String rootDir = params.rootDir; |
|
45 |
String lang = corpusElem.getAttribute("lang"); |
|
46 |
String model = lang |
|
47 |
String encoding = corpusElem.getAttribute("encoding"); |
|
48 |
boolean annotate = "true" == corpusElem.getAttribute("annotate"); |
|
49 |
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl") |
|
50 |
def xslParams = params.getXsltParams(corpusElem); |
|
51 |
int wordsPerPage = params.getWordsPerPage("default") |
|
52 |
boolean build_edition = params.getDoEdition("default") |
|
53 |
|
|
54 |
File srcDir = new File(rootDir); |
|
55 |
File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename); |
|
56 |
binDir.deleteDir(); |
|
57 |
binDir.mkdirs(); |
|
58 |
if (!binDir.exists()) { |
|
59 |
println "Could not create binDir "+binDir |
|
60 |
return; |
|
61 |
} |
|
62 |
|
|
63 |
// FAKE: will contains no XML-TXM files |
|
64 |
File txmDir = new File(binDir, "txm/$corpusname"); |
|
65 |
txmDir.deleteDir(); |
|
66 |
txmDir.mkdirs(); |
|
67 |
|
|
68 |
// Will contains the edition files |
|
69 |
File htmlDir = new File(binDir, "HTML/$corpusname"); |
|
70 |
htmlDir.deleteDir(); |
|
71 |
htmlDir.mkdirs(); |
|
72 |
|
|
73 |
// get the cqp file and the registry file |
|
74 |
File cqpFile; |
|
75 |
File registryFile; |
|
76 |
println "Looking for files to process..." |
|
77 |
for(File f : srcDir.listFiles()) { |
|
78 |
if (f.isHidden()) continue; |
|
79 |
if (f.isDirectory()) continue; |
|
80 |
if (f.getName().endsWith(".cqp")) { |
|
81 |
if (cqpFile == null) { |
|
82 |
cqpFile = f; |
|
83 |
println "Using WTC file : $cqpFile" |
|
84 |
} |
|
85 |
} else if (f.getName().endsWith(".xml")) { |
|
86 |
|
|
87 |
} else if (!f.getName().contains(".")){ // its the registry file |
|
88 |
if (registryFile == null) { |
|
89 |
registryFile = f; |
|
90 |
println "Using registry file : $f" |
|
91 |
} |
|
92 |
} |
|
93 |
} |
|
94 |
|
|
95 |
if (cqpFile == null) { |
|
96 |
println "No WTC file in source directory: "+srcDir; |
|
97 |
return false; |
|
98 |
} |
|
99 |
|
|
100 |
// get pAttributes an run cwb-encode + cwb-makeall |
|
101 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
102 |
if (MONITOR != null) MONITOR.worked(25, "COMPILING") |
|
103 |
println "-- COMPILING - Building Search Engine indexes" |
|
104 |
def c = new compiler(cqpFile, registryFile) |
|
105 |
c.setDebug(debug) |
|
106 |
if (!c.run(binDir, corpusname)) { |
|
107 |
println "Import stopped" |
|
108 |
return; |
|
109 |
} |
|
110 |
|
|
111 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
112 |
|
|
113 |
new File(binDir,"HTML/$corpusname").deleteDir(); |
|
114 |
new File(binDir,"HTML/$corpusname").mkdirs(); |
|
115 |
if (build_edition) { |
|
116 |
|
|
117 |
println "-- EDITION - Building edition" |
|
118 |
if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition") |
|
119 |
|
|
120 |
File outdir = new File(binDir,"/HTML/$corpusname/default/"); |
|
121 |
outdir.mkdirs(); |
|
122 |
|
|
123 |
print "." |
|
124 |
|
|
125 |
List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang); |
|
126 |
List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang); |
|
127 |
|
|
128 |
File binWtcFile = new File(binDir,"cqp/"+corpusname+".cqp"); |
|
129 |
def ed = new pager(binWtcFile, outdir, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, registryFile !=null, c.pAttributesList); |
|
130 |
def allIndexes = ed.getIdx(); |
|
131 |
def allPages = ed.getPageFiles(); |
|
132 |
for (String txtname : ed.getTextNames()) { |
|
133 |
def txtIndexes = allIndexes[txtname]; |
|
134 |
def txtPages = allPages[txtname]; |
|
135 |
|
|
136 |
Element text = params.addText(params.corpora.get(corpusname), txtname, cqpFile); |
|
137 |
Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html"); |
|
138 |
|
|
139 |
for (i = 0 ; i < txtPages.size();) { |
|
140 |
File f = txtPages.get(i); |
|
141 |
String wordid = txtIndexes.get(i); |
|
142 |
params.addPage(edition, ""+(++i), wordid); |
|
143 |
} |
|
144 |
} |
|
145 |
} |
|
146 |
println "" |
|
147 |
|
|
148 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
149 |
if (MONITOR != null) MONITOR.worked(20, "FINALIZING") |
|
150 |
File paramFile = new File(binDir, "import.xml"); |
|
151 |
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/cqp/package.html (revision 803) | ||
---|---|---|
1 |
<html> |
|
2 |
<body> |
|
3 |
<p>CWB import module.</p> |
|
4 |
</body> |
|
5 |
</html> |
|
0 | 6 |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/cqp/pager.groovy (revision 803) | ||
---|---|---|
1 |
package org.txm.importer.cqp; |
|
2 |
//Copyright © - ANR Textométrie - http://textometrie.ens-lyon.fr |
|
3 |
// |
|
4 |
//This file is part of the TXM platform. |
|
5 |
// |
|
6 |
//The TXM platform is free software: you can redistribute it and/or modify |
|
7 |
//it under the terms of the GNU General Public License as published by |
|
8 |
//the Free Software Foundation, either version 3 of the License, or |
|
9 |
//(at your option) any later version. |
|
10 |
// |
|
11 |
//The TXM platform is distributed in the hope that it will be useful, |
|
12 |
//but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
//GNU General Public License for more details. |
|
15 |
// |
|
16 |
//You should have received a copy of the GNU General Public License |
|
17 |
//along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
18 |
// |
|
19 |
// |
|
20 |
// |
|
21 |
//$LastChangedDate: 2012-06-01 17:47:31 +0200 (ven., 01 juin 2012) $ |
|
22 |
//$LastChangedRevision: 2185 $ |
|
23 |
//$LastChangedBy: mdecorde $ |
|
24 |
// |
|
25 |
|
|
26 |
import org.txm.importer.*; |
|
27 |
import org.xml.sax.Attributes; |
|
28 |
import org.txm.importer.filters.*; |
|
29 |
import java.io.File; |
|
30 |
import java.io.IOException; |
|
31 |
import java.util.ArrayList; |
|
32 |
import javax.xml.parsers.SAXParserFactory; |
|
33 |
import javax.xml.parsers.ParserConfigurationException; |
|
34 |
import javax.xml.parsers.SAXParser; |
|
35 |
import javax.xml.stream.*; |
|
36 |
import java.net.URL; |
|
37 |
import org.xml.sax.InputSource; |
|
38 |
import org.xml.sax.helpers.DefaultHandler; |
|
39 |
|
|
40 |
/**
 * Builds a simple paginated HTML edition from a CQP source file.
 *
 * The input is read line by line: a line starting with "&lt;" is a structure
 * tag, any other line is a token whose properties are separated by tabs
 * (column 0 is the word form). The whole file is processed by the
 * constructor; results are then available through {@link #getPageFiles()},
 * {@link #getIdx()} and {@link #getTextNames()}.
 *
 * @author mdecorde
 */
class pager {
	/** Word forms that must not be preceded by a space. */
	List<String> NoSpaceBefore;

	/** Word forms (or final characters) that must not be followed by a space. */
	List<String> NoSpaceAfter;

	/** The wordcount (reset each time a page is opened). */
	int wordcount = 0;

	/** Number of the current page within the current text. */
	int pagecount = 0;

	/** Maximum number of words written to one page. */
	int wordmax = 0;

	/** The corpus base name, used in page titles. */
	String basename = "";

	/** Name of the text currently being written. */
	String txtname = "";

	/** Directory receiving the HTML pages. */
	File outdir;

	/** The wordid. */
	String wordid;

	/** True until the first word of the current page has been written. */
	boolean firstWord = true;

	/** The wordvalue. */
	String wordvalue;

	/** The interpvalue. */
	String interpvalue;

	/** The lastword. */
	String lastword = " ";

	/** The wordtype. */
	String wordtype;

	/** The flagform. */
	boolean flagform = false;

	/** The flaginterp. */
	boolean flaginterp = false;

	/** The url. */
	private def url;

	/** The input data. */
	private def inputData;

	/** The factory. */
	private def factory;

	/** The parser. */
	private XMLStreamReader parser;

	/** The writer. */
	OutputStreamWriter writer;

	/** Writer of the page currently open, null when no page is open. */
	OutputStreamWriter pagedWriter = null;

	/** The CQP source file. */
	File cqpFile;

	/** The outfile. */
	File outfile;

	/** Page files, per text name. */
	def pages = [:];

	/** Id of the first word of each page, per text name (parallel to {@link #pages}). */
	def idxstart = [:]

	String editionPage;
	ArrayList<Integer> splitTUs; // contains the tu ids used to split pages
	boolean shouldSplit = false;
	boolean useSplitTUs = false;

	/** True when word ids are read from the token lines instead of being generated. */
	boolean hasWordId = false;

	/** Column of the "id" property in a token line, -1 when unknown. */
	int noWordIdProperty = -1;

	/** Names of the texts found in the source file, in reading order. */
	def texts = []

	/**
	 * Instantiates a new pager and immediately builds the edition.
	 *
	 * @param cqpFile the CQP source file
	 * @param outdir the directory receiving the HTML pages
	 * @param NoSpaceBefore the no space before
	 * @param NoSpaceAfter the no space after
	 * @param max the maximum number of words per page
	 * @param basename the basename
	 * @param hasWordId true if the token lines are expected to carry a word id
	 * @param pAttrs the positional attribute names, in column order (word form excluded)
	 */
	pager(File cqpFile, File outdir, List<String> NoSpaceBefore,
			List<String> NoSpaceAfter, int max, String basename, boolean hasWordId, List pAttrs) {
		this.basename = basename;
		this.txtname = "text1";
		this.outdir = outdir;
		this.wordmax = max;
		this.NoSpaceBefore = NoSpaceBefore;
		this.NoSpaceAfter = NoSpaceAfter;
		this.cqpFile = cqpFile;
		this.hasWordId = hasWordId;
		if (hasWordId) {
			// +1 since column 0 of a token line is the word form itself
			noWordIdProperty = (pAttrs == null ? -1 : pAttrs.indexOf("id")) + 1;
			if (noWordIdProperty == 0) {
				// no "id" attribute declared: the FIELD must be reset (the
				// parameter shadows it), otherwise the word form is used as id
				this.hasWordId = false;
			}
		}

		process();
	}

	/**
	 * Closes the page currently open, if any.
	 * A page without any word receives a "w_0" anchor so that every page
	 * file keeps a matching entry in the index list of its text.
	 */
	private void closeMultiWriter()
	{
		if (pagedWriter == null) return;

		if (firstWord) { // there was no words
			if (this.idxstart[txtname] == null) {
				this.idxstart[txtname] = []
			}
			this.idxstart[txtname] << "w_0";
			pagedWriter.write("<span id=\"w_0\"/>");
		}
		// NOTE(review): no matching <p> is opened by this class; the closing tag
		// is kept as-is since paragraph tags of the source are copied verbatim.
		pagedWriter.write("</p>\n")
		pagedWriter.write("</body>");
		pagedWriter.write("</html>");
		pagedWriter.close();
		pagedWriter = null; // the page is done, never write to it again
	}

	/**
	 * Closes the current page and opens the next page of the current text.
	 *
	 * @return true, if successful
	 */
	private boolean createNextOutput()
	{
		wordcount = 0;
		shouldSplit = false;
		try {
			closeMultiWriter();
			File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html")
			if (pages[txtname] == null) pages[txtname] = []
			pages[txtname] << pageFile;
			firstWord = true; // waiting for next word

			pagedWriter = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(pageFile)) , "UTF-8");
			pagedWriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			pagedWriter.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n")
			pagedWriter.write("<html>");
			pagedWriter.write("<head>");
			pagedWriter.write("<title>"+basename.toUpperCase()+" $txtname Edition - Page "+pagecount+"</title>");
			pagedWriter.write("<link rel=\"stylesheet\" type=\"text/css\" href=\"tmx.css\"/>");
			pagedWriter.write("<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\"/>");
			pagedWriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
			pagedWriter.write("</head>");
			pagedWriter.write("<body>");

			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Creates the output.
	 *
	 * @return true, if successful
	 */
	private boolean createOutput() {
		try {
			return createNextOutput();
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Opens the next page and fails fast when it cannot be created, instead
	 * of letting a later write fail on a missing writer.
	 */
	private void openNextPage() {
		if (!createNextOutput()) {
			throw new IllegalStateException("Could not create the next edition page of text "+txtname+" in "+outdir);
		}
	}

	/**
	 * Gets the page files.
	 *
	 * @return the page files, per text name
	 */
	public def getPageFiles() {
		return pages;
	}

	/**
	 * Gets the idx.
	 *
	 * @return the id of the first word of each page, per text name
	 */
	public def getIdx() {
		return idxstart;
	}

	/** Escapes the characters that would break the generated markup. */
	private static String escape(String value) {
		return value.replace("&", "&amp;").replace("<", "&lt;").replace("\"", "&quot;");
	}

	/**
	 * Extracts the value of the first id="..." found in a text tag.
	 *
	 * @return the id, or null if the tag has no usable (non empty) id
	 */
	private static String readTextId(String line) {
		int start = line.indexOf("id=\"");
		if (start <= 0) return null;
		String rest = line.substring(start + 4);
		int end = rest.indexOf("\"");
		if (end <= 0) return null;
		return rest.substring(0, end);
	}

	/**
	 * Reads the CQP file and writes the HTML pages.
	 * A new page is started for each text, after each page break tag and
	 * each time the maximum number of words per page is exceeded.
	 */
	void process() {
		String previousWord = "";
		int wcounter = 1;    // used to generate word ids
		int txtwcounter = 1; // words written on the current page + 1

		openNextPage();

		BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(cqpFile) , "UTF-8"));
		try {
			int cline = 0;
			String line;
			while ((line = reader.readLine()) != null) {
				cline++;
				if (line.startsWith("<")) { // structure
					if (line.startsWith("<lb/") || line.startsWith("<br/") || line.startsWith("</p>") ||
							line.startsWith("<lb>") || line.startsWith("<br>")) {
						pagedWriter.write("<br/>");
					}
					if (line.startsWith("<text>") || line.startsWith("<text ")) {
						String id = readTextId(line);
						if (id != null) {
							// close the open page while txtname still names the text
							// it belongs to, so an empty page is indexed under its own text
							closeMultiWriter();
							txtname = id
							texts << txtname;
							pagecount = 0;
							openNextPage();
						}
					} else if (line.startsWith("<s>") || line.startsWith("<s ")) {
						pagedWriter.write(line.replaceAll("<s", "<sent "));
					} else if (line.startsWith("</s>")) {
						pagedWriter.write("</sent>");
					} else if (line.startsWith("<pb/>") || line.startsWith("<pb>")) {
						pagedWriter.write(line);
						txtwcounter = 1;
						openNextPage();
					} else {
						pagedWriter.write(line);
					}
				} else { // word
					if (txtwcounter > wordmax) {
						txtwcounter = 1;
						openNextPage();
					}
					def split = line.split("\t");
					if (split == null || split.size() == 0) {
						continue; // empty line
					}

					String currentId = null;
					if (hasWordId) {
						if (split.size() <= noWordIdProperty) { // +1 since line contains word
							println "Error: line $cline"
						} else {
							currentId = split[noWordIdProperty];
						}
					}
					if (currentId == null) {
						// no id available on this line: generate one rather than writing id="null"
						currentId = "w_"+(wcounter++);
					}
					txtwcounter++;
					String form = split[0];

					if (firstWord) {
						firstWord = false;
						if (this.idxstart[txtname] == null) {
							this.idxstart[txtname] = []
						}
						// store the id itself: readers expect one String per page
						this.idxstart[txtname] << currentId;
					}

					int l = previousWord.length();
					String endOfLastWord = (l > 0) ? previousWord.substring(l - 1) : "";

					String properties = "";
					if (split.size() > 1) {
						properties = escape(split[1..split.size()-1].join(" "));
					}

					boolean glued = NoSpaceBefore.contains(form) ||
							NoSpaceAfter.contains(previousWord) ||
							form.startsWith("-") ||
							NoSpaceAfter.contains(endOfLastWord);
					if (!glued) {
						pagedWriter.write(" ");
					}
					pagedWriter.write("<span title=\""+properties+"\" id=\""+currentId+"\">");
					pagedWriter.write(escape(form)+"</span>\n");
					previousWord = form;
				}
			}

			closeMultiWriter();
		} finally {
			reader.close();
			// only reached with an open page when an error interrupted the loop
			try {
				pagedWriter?.close();
			} catch (IOException ignored) {
				// the original failure is the one worth reporting
			}
		}
	}

	/**
	 * Gets the text names.
	 *
	 * @return the names of the texts found in the source file
	 */
	def getTextNames() {
		return texts;
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/corptef/compiler.groovy (revision 803) | ||
---|---|---|
160 | 160 |
} |
161 | 161 |
|
162 | 162 |
/** |
163 |
* Transfom file wtc.
|
|
163 |
* Transfom file cqp.
|
|
164 | 164 |
* |
165 | 165 |
* @param dirPathName the dir path name |
166 | 166 |
* @param fileName the file name |
... | ... | |
515 | 515 |
println ("binary directory does not exists: "+rootDir) |
516 | 516 |
return false; |
517 | 517 |
} |
518 |
new File(rootDir+"/wtc/","corptef.wtc").delete();//cleaning&preparing
|
|
519 |
new File(rootDir,"/wtc/").deleteDir();
|
|
520 |
new File(rootDir,"/wtc/").mkdir();
|
|
518 |
new File(rootDir+"/cqp/","corptef.cqp").delete();//cleaning&preparing
|
|
519 |
new File(rootDir,"/cqp/").deleteDir();
|
|
520 |
new File(rootDir,"/cqp/").mkdir();
|
|
521 | 521 |
new File(rootDir,"/data/").deleteDir(); |
522 | 522 |
new File(rootDir,"/data/").mkdir(); |
523 | 523 |
new File(rootDir,"registry/").mkdir(); |
... | ... | |
540 | 540 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
541 | 541 |
def builder = new compiler(f.toURL(),txtname, "corptef", "default"); |
542 | 542 |
builder.setLang lang |
543 |
builder.transfomFileWtc(rootDir+"/wtc","corptef.wtc");
|
|
543 |
builder.transfomFileWtc(rootDir+"/cqp","corptef.cqp");
|
|
544 | 544 |
} |
545 | 545 |
} |
546 | 546 |
|
... | ... | |
557 | 557 |
String regPath = new File(outDir , "registry/corptef").getAbsolutePath(); |
558 | 558 |
cwbEn.run( |
559 | 559 |
new File(outDir, "/data").getAbsolutePath(), |
560 |
new File(outDir, "wtc/"+"corptef.wtc").getAbsolutePath(),
|
|
560 |
new File(outDir, "cqp/"+"corptef.cqp").getAbsolutePath(),
|
|
561 | 561 |
regPath, |
562 | 562 |
pAttributes, sAttributes); |
563 | 563 |
if (!new File(regPath).exists()) { |
... | ... | |
568 | 568 |
"CORPTEF", |
569 | 569 |
new File(outDir, "registry").getAbsolutePath()); |
570 | 570 |
} catch (Exception ex) {System.out.println(ex); return false;} |
571 |
// "C:\Documents and Settings\alavrent\TXM\cwb\bin\cwb-encode" -d "C:\Documents and Settings\alavrent\TXM\corpora\corptef\data" -f "C:\Documents and Settings\alavrent\TXM\corpora\corptef\wtc\corpte.wtc" -R "C:\Documents and Settings\alavrent\TXM\corpora\corptef\registry\corptef" -c utf8 -xsB -xsB -P ttpos -P ttlemme -P id -P pb -P lb -P orig -P sic -P abbr -S text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project -S front:0+n -S body:0+n -S ab:0+n+rend -S div:0+id+type -S p:0+n -S back:0+n
|
|
571 |
// "C:\Documents and Settings\alavrent\TXM\cwb\bin\cwb-encode" -d "C:\Documents and Settings\alavrent\TXM\corpora\corptef\data" -f "C:\Documents and Settings\alavrent\TXM\corpora\corptef\cqp\corpte.cqp" -R "C:\Documents and Settings\alavrent\TXM\corpora\corptef\registry\corptef" -c utf8 -xsB -xsB -P ttpos -P ttlemme -P id -P pb -P lb -P orig -P sic -P abbr -S text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project -S front:0+n -S body:0+n -S ab:0+n+rend -S div:0+id+type -S p:0+n -S back:0+n
|
|
572 | 572 |
System.out.println("Done.") |
573 | 573 |
|
574 | 574 |
return true; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/transcriber/compiler.groovy (revision 803) | ||
---|---|---|
131 | 131 |
|
132 | 132 |
sectionAttrs = new HashSet<String>() // reset section attributs set |
133 | 133 |
|
134 |
File wtcDir = new File(binDir, "wtc")
|
|
135 |
new File(binDir, "wtc").delete();
|
|
136 |
wtcDir.mkdir();
|
|
137 |
if (!new File(binDir, "wtc").exists()) {
|
|
138 |
println "Can't create wtc directory"
|
|
134 |
File cqpDir = new File(binDir, "cqp")
|
|
135 |
new File(binDir, "cqp").delete();
|
|
136 |
cqpDir.mkdir();
|
|
137 |
if (!new File(binDir, "cqp").exists()) {
|
|
138 |
println "Can't create cqp directory"
|
|
139 | 139 |
return false; |
140 | 140 |
} |
141 | 141 |
|
142 |
File wtcFile = new File(wtcDir, corpusname.toLowerCase()+".wtc")
|
|
142 |
File cqpFile = new File(cqpDir, corpusname.toLowerCase()+".cqp")
|
|
143 | 143 |
|
144 | 144 |
// get all anatypes |
145 | 145 |
for (File f : xmlfiles) { |
146 | 146 |
getAnaTypes(f) |
147 | 147 |
} |
148 | 148 |
//println "ANATYPES: "+anatypes |
149 |
if (!createOutput(wtcFile)) return false;
|
|
149 |
if (!createOutput(cqpFile)) return false;
|
|
150 | 150 |
output.write("<txmcorpus lang=\"fr\">\n") |
151 | 151 |
output.close(); |
152 | 152 |
|
... | ... | |
158 | 158 |
} |
159 | 159 |
} |
160 | 160 |
|
161 |
if (!createOutput(wtcFile)) return false;
|
|
161 |
if (!createOutput(cqpFile)) return false;
|
|
162 | 162 |
output.write("</txmcorpus>\n") |
163 | 163 |
output.close(); |
164 | 164 |
|
... | ... | |
216 | 216 |
//return; |
217 | 217 |
try { |
218 | 218 |
cwbEn.run(dataDir.getAbsolutePath(), |
219 |
wtcFile.getAbsolutePath(),
|
|
219 |
cqpFile.getAbsolutePath(),
|
|
220 | 220 |
registryFile.getAbsolutePath(), pAttributes, sAttributes); |
221 | 221 |
if (!registryFile.exists()) { |
222 | 222 |
println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq" |
... | ... | |
311 | 311 |
String filename = xmlfile.getName() |
312 | 312 |
String textid = filename.substring(0, filename.length() - 4); |
313 | 313 |
|
314 |
createOutput(new File(outdir, "wtc/"+corpusname.toLowerCase()+".wtc"));
|
|
314 |
createOutput(new File(outdir, "cqp/"+corpusname.toLowerCase()+".cqp"));
|
|
315 | 315 |
String localname; |
316 | 316 |
|
317 | 317 |
//get all metadatas declared before Episode tag |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/alceste/compiler.groovy (revision 803) | ||
---|---|---|
47 | 47 |
|
48 | 48 |
// TODO: Auto-generated Javadoc |
49 | 49 |
/** |
50 |
* build wtc
|
|
50 |
* build cqp
|
|
51 | 51 |
* build CWB indexes |
52 | 52 |
* create registry file. |
53 | 53 |
* |
... | ... | |
184 | 184 |
println ("binary directory does not exists: "+binDir) |
185 | 185 |
return false; |
186 | 186 |
} |
187 |
File wtcFile = new File(binDir,"wtc/"+corpusname+".wtc");
|
|
188 |
new File(binDir, "wtc").deleteDir();
|
|
189 |
new File(binDir, "wtc").mkdir();
|
|
187 |
File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
|
|
188 |
new File(binDir, "cqp").deleteDir();
|
|
189 |
new File(binDir, "cqp").mkdir();
|
|
190 | 190 |
new File(binDir, "data/$corpusname").deleteDir(); |
191 | 191 |
new File(binDir, "data/$corpusname").mkdir(); |
192 | 192 |
new File(binDir, "registry").mkdir(); |
... | ... | |
200 | 200 |
} |
201 | 201 |
|
202 | 202 |
//start corpus |
203 |
if (createOutput(wtcFile)) {
|
|
203 |
if (createOutput(cqpFile)) {
|
|
204 | 204 |
output.write("<txmcorpus lang=\""+lang+"\">\n"); |
205 | 205 |
output.close(); |
206 | 206 |
} |
207 | 207 |
|
208 | 208 |
Collections.sort(files); |
209 | 209 |
//1- Transform into WTC file |
210 |
XMLTXM2WTC wtcbuilder = null;
|
|
210 |
XMLTXM2WTC cqpbuilder = null;
|
|
211 | 211 |
println("process "+files.size()+" files") |
212 | 212 |
for (File f : files) { |
213 | 213 |
print "." |
... | ... | |
215 | 215 |
if (!f.exists()) { |
216 | 216 |
println("file "+f+ " does not exists") |
217 | 217 |
} else { |
218 |
wtcbuilder = new XMLTXM2WTC(f.toURI().toURL());
|
|
218 |
cqpbuilder = new XMLTXM2WTC(f.toURI().toURL());
|
|
219 | 219 |
String txtname = f.getName().substring(0, f.getName().length()-4); |
220 |
wtcbuilder.setTextInfo(txtname, corpusname, "default");
|
|
220 |
cqpbuilder.setTextInfo(txtname, corpusname, "default");
|
|
221 | 221 |
|
222 |
wtcbuilder.setBalisesToKeep(["text", "s", "p", "lb"]);
|
|
223 |
wtcbuilder.setSendToPAttributes(["lb":["n"], "p":["n"], "s":["n"]])
|
|
224 |
wtcbuilder.setLang(lang);
|
|
225 |
if (!wtcbuilder.transformFile(wtcFile)) {
|
|
222 |
cqpbuilder.setBalisesToKeep(["text", "s", "p", "lb"]);
|
|
223 |
cqpbuilder.setSendToPAttributes(["lb":["n"], "p":["n"], "s":["n"]])
|
|
224 |
cqpbuilder.setLang(lang);
|
|
225 |
if (!cqpbuilder.transformFile(cqpFile)) {
|
|
226 | 226 |
println("Failed to compile "+f) |
227 | 227 |
} |
228 | 228 |
} |
... | ... | |
230 | 230 |
println "" |
231 | 231 |
|
232 | 232 |
//end corpus |
233 |
if (createOutput(wtcFile)) {
|
|
233 |
if (createOutput(cqpFile)) {
|
|
234 | 234 |
output.write("</txmcorpus>\n"); |
235 | 235 |
output.close(); |
236 | 236 |
} |
237 | 237 |
|
238 |
if (wtcbuilder == null) {
|
|
238 |
if (cqpbuilder == null) {
|
|
239 | 239 |
println "there was no files in bin dir : "+txmDir |
240 | 240 |
return false; |
241 | 241 |
} |
... | ... | |
247 | 247 |
CwbMakeAll cwbMa = new CwbMakeAll(); |
248 | 248 |
cwbMa.setDebug(debug); |
249 | 249 |
|
250 |
List<String> pAttributesList = wtcbuilder.getpAttributs();
|
|
251 |
List<String> sAttributesList = wtcbuilder.getsAttributs();
|
|
250 |
List<String> pAttributesList = cqpbuilder.getpAttributs();
|
|
251 |
List<String> sAttributesList = cqpbuilder.getsAttributs();
|
|
252 | 252 |
println "pAttrs : "+pAttributesList |
253 | 253 |
println "sAttrs : "+sAttributesList |
254 | 254 |
String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()]) |
... | ... | |
256 | 256 |
|
257 | 257 |
try { |
258 | 258 |
String regPath = rootDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lowercase registry name |
259 |
cwbEn.run(rootDir + "/data/${corpusname}", rootDir + "/wtc/"+corpusname+".wtc", regPath,pAttributes, sAttributes);
|
|
259 |
cwbEn.run(rootDir + "/data/${corpusname}", rootDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
|
|
260 | 260 |
if (!new File(regPath).exists()) { |
261 | 261 |
println "The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq" |
262 | 262 |
return false; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xmltxm/compiler.groovy (revision 803) | ||
---|---|---|
176 | 176 |
} |
177 | 177 |
|
178 | 178 |
/** |
179 |
* Transfom file wtc.
|
|
179 |
* Transfom file cqp.
|
|
180 | 180 |
* |
181 | 181 |
* @param dirPathName the dir path name |
182 | 182 |
* @param fileName the file name |
183 | 183 |
* @return true, if successful |
184 | 184 |
*/ |
185 |
public boolean transfomFileWtc(File wtcFile)
|
|
185 |
public boolean transfomFileWtc(File cqpFile)
|
|
186 | 186 |
{ |
187 |
createOutput(wtcFile);
|
|
187 |
createOutput(cqpFile);
|
|
188 | 188 |
String headvalue="" |
189 | 189 |
String vAna = ""; |
190 | 190 |
String vForm = ""; |
... | ... | |
356 | 356 |
return false; |
357 | 357 |
} |
358 | 358 |
|
359 |
File wtcFile = new File(binDir,"wtc/"+corpusname.toLowerCase()+".wtc");
|
|
360 |
new File(rootDir,"/wtc/").deleteDir();
|
|
361 |
new File(rootDir,"/wtc/").mkdir();
|
|
359 |
File cqpFile = new File(binDir,"cqp/"+corpusname.toLowerCase()+".cqp");
|
|
360 |
new File(rootDir,"/cqp/").deleteDir();
|
|
361 |
new File(rootDir,"/cqp/").mkdir();
|
|
362 | 362 |
new File(rootDir,"data/"+corpusname).deleteDir(); |
363 | 363 |
new File(rootDir,"data/"+corpusname).mkdir(); |
364 | 364 |
new File(rootDir,"registry/").mkdir(); |
... | ... | |
373 | 373 |
} |
374 | 374 |
|
375 | 375 |
//0 set Lang |
376 |
if (createOutput(wtcFile)) {
|
|
376 |
if (createOutput(cqpFile)) {
|
|
377 | 377 |
output.write("<txmcorpus lang=\""+lang+"\">\n"); |
378 | 378 |
output.close(); |
379 | 379 |
} |
... | ... | |
388 | 388 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
389 | 389 |
builder = new compiler(f.toURI().toURL(), txtname, corpusname.toLowerCase(), "default"); |
390 | 390 |
builder.setLang(lang); |
391 |
if(!builder.transfomFileWtc(wtcFile))
|
|
391 |
if(!builder.transfomFileWtc(cqpFile))
|
|
392 | 392 |
return false; |
393 | 393 |
} |
394 | 394 |
} |
395 | 395 |
|
396 | 396 |
//end corpus |
397 |
if (createOutput(wtcFile)) {
|
|
397 |
if (createOutput(cqpFile)) {
|
|
398 | 398 |
output.write("</txmcorpus>\n"); |
399 | 399 |
output.close(); |
400 | 400 |
} |
... | ... | |
448 | 448 |
String regPath = rootDir + "/registry/"+corpusname.toLowerCase() |
449 | 449 |
cwbEn.run( |
450 | 450 |
rootDir + "/data/$corpusname", |
451 |
wtcFile.getAbsolutePath(),
|
|
451 |
cqpFile.getAbsolutePath(),
|
|
452 | 452 |
regPath, pAttributes, sAttributes); |
453 | 453 |
if (!new File(regPath).exists()) { |
454 | 454 |
println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq" |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/fleurs/compiler.groovy (revision 803) | ||
---|---|---|
140 | 140 |
} |
141 | 141 |
|
142 | 142 |
/** |
143 |
* Transfom file wtc.
|
|
143 |
* Transfom file cqp.
|
|
144 | 144 |
* |
145 | 145 |
* @param dirPathName the dir path name |
146 | 146 |
* @param fileName the file name |
... | ... | |
240 | 240 |
println ("Error: CWB executables not well set.") |
241 | 241 |
return false; |
Formats disponibles : Unified diff