Révision 804
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/corptef/compiler.groovy (revision 804) | ||
---|---|---|
166 | 166 |
* @param fileName the file name |
167 | 167 |
* @return true, if successful |
168 | 168 |
*/ |
169 |
public boolean transfomFileWtc(String dirPathName, String fileName)
|
|
169 |
public boolean transfomFileCqp(String dirPathName, String fileName)
|
|
170 | 170 |
{ |
171 | 171 |
createOutput(dirPathName, fileName); |
172 | 172 |
|
... | ... | |
526 | 526 |
int counttext =0; |
527 | 527 |
List<File> files = new File(rootDirFile,"txm").listFiles(); |
528 | 528 |
Collections.sort(files); |
529 |
//1- Transform into WTC file
|
|
529 |
//1- Transform into CQP file
|
|
530 | 530 |
for(File f : files) |
531 | 531 |
{ |
532 | 532 |
counttext++; |
... | ... | |
540 | 540 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
541 | 541 |
def builder = new compiler(f.toURL(),txtname, "corptef", "default"); |
542 | 542 |
builder.setLang lang |
543 |
builder.transfomFileWtc(rootDir+"/cqp","corptef.cqp");
|
|
543 |
builder.transfomFileCqp(rootDir+"/cqp","corptef.cqp");
|
|
544 | 544 |
} |
545 | 545 |
} |
546 | 546 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/alceste/compiler.groovy (revision 804) | ||
---|---|---|
206 | 206 |
} |
207 | 207 |
|
208 | 208 |
Collections.sort(files); |
209 |
//1- Transform into WTC file
|
|
210 |
XMLTXM2WTC cqpbuilder = null;
|
|
209 |
//1- Transform into CQP file
|
|
210 |
XMLTXM2CQP cqpbuilder = null;
|
|
211 | 211 |
println("process "+files.size()+" files") |
212 | 212 |
for (File f : files) { |
213 | 213 |
print "." |
... | ... | |
215 | 215 |
if (!f.exists()) { |
216 | 216 |
println("file "+f+ " does not exists") |
217 | 217 |
} else { |
218 |
cqpbuilder = new XMLTXM2WTC(f.toURI().toURL());
|
|
218 |
cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
|
|
219 | 219 |
String txtname = f.getName().substring(0, f.getName().length()-4); |
220 | 220 |
cqpbuilder.setTextInfo(txtname, corpusname, "default"); |
221 | 221 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xmltxm/compiler.groovy (revision 804) | ||
---|---|---|
154 | 154 |
output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f,f.exists())) , "UTF-8"); |
155 | 155 |
return true; |
156 | 156 |
} catch (Exception e) { |
157 |
println "Error while create WTC otput file: "+e
|
|
157 |
println "Error while create CQP otput file: "+e
|
|
158 | 158 |
e.printStackTrace(); |
159 | 159 |
|
160 | 160 |
return false; |
... | ... | |
182 | 182 |
* @param fileName the file name |
183 | 183 |
* @return true, if successful |
184 | 184 |
*/ |
185 |
public boolean transfomFileWtc(File cqpFile)
|
|
185 |
public boolean transfomFileCqp(File cqpFile)
|
|
186 | 186 |
{ |
187 | 187 |
createOutput(cqpFile); |
188 | 188 |
String headvalue="" |
... | ... | |
324 | 324 |
parser.close(); |
325 | 325 |
} |
326 | 326 |
catch (Exception ex) { |
327 |
System.out.println("Error while writing WTC file $ex");
|
|
327 |
System.out.println("Error while writing CQP file $ex");
|
|
328 | 328 |
ex.printStackTrace(); |
329 | 329 |
return false; |
330 | 330 |
} |
... | ... | |
377 | 377 |
output.write("<txmcorpus lang=\""+lang+"\">\n"); |
378 | 378 |
output.close(); |
379 | 379 |
} |
380 |
//1- Transform into WTC file
|
|
380 |
//1- Transform into CQP file
|
|
381 | 381 |
def builder = null; |
382 | 382 |
for (File f : files) { |
383 | 383 |
counttext++; |
... | ... | |
388 | 388 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
389 | 389 |
builder = new compiler(f.toURI().toURL(), txtname, corpusname.toLowerCase(), "default"); |
390 | 390 |
builder.setLang(lang); |
391 |
if(!builder.transfomFileWtc(cqpFile))
|
|
391 |
if(!builder.transfomFileCqp(cqpFile))
|
|
392 | 392 |
return false; |
393 | 393 |
} |
394 | 394 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/fleurs/compiler.groovy (revision 804) | ||
---|---|---|
146 | 146 |
* @param fileName the file name |
147 | 147 |
* @return true, if successful |
148 | 148 |
*/ |
149 |
public boolean transfomFileWtc(String dirPathName, String fileName)
|
|
149 |
public boolean transfomFileCqp(String dirPathName, String fileName)
|
|
150 | 150 |
{ |
151 | 151 |
createOutput(dirPathName, fileName); |
152 | 152 |
|
... | ... | |
245 | 245 |
new File(rootDir,"cqp/").mkdir(); |
246 | 246 |
new File(rootDir,"registry/").mkdir(); |
247 | 247 |
|
248 |
//1- Transform into WTC file
|
|
248 |
//1- Transform into CQP file
|
|
249 | 249 |
List<File> files = new File(rootDir,"txm").listFiles(); |
250 | 250 |
for (File f : files) { |
251 | 251 |
if (!f.exists()) { |
... | ... | |
257 | 257 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
258 | 258 |
def builder = new compiler(f.toURL(), txtname, "fleurs", "default"); |
259 | 259 |
builder.setLang lang |
260 |
builder.transfomFileWtc(rootDir,"cqp/fleurs.cqp");
|
|
260 |
builder.transfomFileCqp(rootDir,"cqp/fleurs.cqp");
|
|
261 | 261 |
} |
262 | 262 |
} |
263 | 263 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/XMLTXM2WTC.groovy (revision 804) | ||
---|---|---|
38 | 38 |
// TODO: Auto-generated Javadoc |
39 | 39 |
|
40 | 40 |
/** |
41 |
* The Class XMLTXM2WTC.
|
|
41 |
* The Class XMLTXM2CQP.
|
|
42 | 42 |
* |
43 | 43 |
* @author mdecorde |
44 | 44 |
* simple transformation of an xml-tei-txm file into a cqp file |
45 | 45 |
*/ |
46 | 46 |
|
47 |
class XMLTXM2WTC
|
|
47 |
class XMLTXM2CQP
|
|
48 | 48 |
{ |
49 | 49 |
|
50 | 50 |
/** The url. */ |
... | ... | |
119 | 119 |
* |
120 | 120 |
* @param url the url |
121 | 121 |
*/ |
122 |
public XMLTXM2WTC(URL url){
|
|
122 |
public XMLTXM2CQP(URL url){
|
|
123 | 123 |
try { |
124 | 124 |
this.url = url; |
125 | 125 |
inputData = url.openStream(); |
... | ... | |
494 | 494 |
new File(rootDir,"out").deleteDir() |
495 | 495 |
new File(rootDir,"out").mkdir() |
496 | 496 |
|
497 |
System.out.println("XMLTXM2WTC : "+srcfile+" >> "+cqpfile);
|
|
498 |
def builder = new XMLTXM2WTC(srcfile.toURL());
|
|
497 |
System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile);
|
|
498 |
def builder = new XMLTXM2CQP(srcfile.toURL());
|
|
499 | 499 |
def balises = ["text", "s"]; |
500 | 500 |
builder.setBalisesToKeep(balises); |
501 | 501 |
builder.transformFile(cqpfile); |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/limsi/compiler.groovy (revision 804) | ||
---|---|---|
106 | 106 |
new File(binDir, "data").mkdir() |
107 | 107 |
new File(binDir, "registry").mkdir() |
108 | 108 |
|
109 |
//1- Merge WTC files
|
|
109 |
//1- Merge CQP files
|
|
110 | 110 |
if (annotationSuccess) { |
111 | 111 |
File annotationDir = new File(binDir, "annotations") |
112 | 112 |
if (annotationDir.exists()) txmDir = annotationDir; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/limsi/limsiLoader.groovy (revision 804) | ||
---|---|---|
157 | 157 |
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE") |
158 | 158 |
println "-- ANNOTATE CQP- Running NLP tools" |
159 | 159 |
boolean annotationSuccess = false; |
160 |
if (annotate && new AnnotateWTC().run(binDir, txmDir, model+".par")) {
|
|
160 |
if (annotate && new AnnotateCQP().run(binDir, txmDir, model+".par")) {
|
|
161 | 161 |
annotationSuccess = true; |
162 | 162 |
} |
163 | 163 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/txt/compiler.groovy (revision 804) | ||
---|---|---|
199 | 199 |
output.close(); |
200 | 200 |
} |
201 | 201 |
|
202 |
//1- Transform into WTC file
|
|
202 |
//1- Transform into CQP file
|
|
203 | 203 |
println("Compiling "+txmDir.listFiles().length+" files") |
204 |
XMLTXM2WTC cqpbuilder = null;
|
|
204 |
XMLTXM2CQP cqpbuilder = null;
|
|
205 | 205 |
ArrayList<File> files = txmDir.listFiles(); |
206 | 206 |
Collections.sort(files); |
207 | 207 |
for (File txmfile : files) { |
208 | 208 |
print "." |
209 |
cqpbuilder = new XMLTXM2WTC(txmfile.toURI().toURL());
|
|
209 |
cqpbuilder = new XMLTXM2CQP(txmfile.toURI().toURL());
|
|
210 | 210 |
String txtname = txmfile.getName().substring(0,txmfile.getName().length()-4); |
211 | 211 |
cqpbuilder.setTextInfo(txtname, this.base, this.project); |
212 | 212 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/hyperbase/compiler.groovy (revision 804) | ||
---|---|---|
212 | 212 |
int counttext =0; |
213 | 213 |
List<File> files = txmDir.listFiles(); |
214 | 214 |
Collections.sort(files); |
215 |
//1- Transform into WTC file
|
|
216 |
XMLTXM2WTC cqpbuilder = null;
|
|
215 |
//1- Transform into CQP file
|
|
216 |
XMLTXM2CQP cqpbuilder = null;
|
|
217 | 217 |
println("Compiling "+files.size()+" files") |
218 | 218 |
for (File f : files) { |
219 | 219 |
print "." |
... | ... | |
222 | 222 |
println("file "+f+ " does not exists") |
223 | 223 |
} |
224 | 224 |
else { |
225 |
cqpbuilder = new XMLTXM2WTC(f.toURI().toURL());
|
|
225 |
cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
|
|
226 | 226 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
227 | 227 |
cqpbuilder.setTextInfo(txtname, corpusname, "project"); |
228 | 228 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/Compiler.java (revision 804) | ||
---|---|---|
6 | 6 |
import org.txm.utils.DeleteDir; |
7 | 7 |
|
8 | 8 |
/** |
9 |
* Takes XML-TXM files, build the WTC files and call cwb utils
|
|
9 |
* Takes XML-TXM files, build the CQP files and call cwb utils
|
|
10 | 10 |
* |
11 | 11 |
* @author mdecorde |
12 | 12 |
* |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/XTZCompiler.groovy (revision 804) | ||
---|---|---|
41 | 41 |
if (files == null) files = inputDirectory.listFiles(); |
42 | 42 |
|
43 | 43 |
if (!doScanStep()) return; |
44 |
if (!doWTCStep()) return;
|
|
44 |
if (!doCQPStep()) return;
|
|
45 | 45 |
if (!doCWBEncodeStep()) return; |
46 | 46 |
if (!doCWBMakeAllStep()) return; |
47 | 47 |
|
... | ... | |
108 | 108 |
} |
109 | 109 |
|
110 | 110 |
def cqpFiles = [] |
111 |
public boolean doWTCStep() {
|
|
112 |
println "-- Building WTC files $inputDirectory..."
|
|
111 |
public boolean doCQPStep() {
|
|
112 |
println "-- Building CQP files $inputDirectory..."
|
|
113 | 113 |
cqpDirectory.mkdir(); // if not created |
114 | 114 |
|
115 | 115 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size()) |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/graal/ImportInCWB.groovy (revision 804) | ||
---|---|---|
122 | 122 |
CwbMakeAll cwbMa = new CwbMakeAll(); |
123 | 123 |
|
124 | 124 |
GraalImportCqp traitTxt = new GraalImportCqp(new URL("file:///"+ inDir + "/" + inFile+ ".xml")); |
125 |
traitTxt.transfomFileWtc(outDir+"/cqp/", outFile + ".cqp");
|
|
125 |
traitTxt.transfomFileCqp(outDir+"/cqp/", outFile + ".cqp");
|
|
126 | 126 |
|
127 | 127 |
traitTxt = new GraalImportCqp(new URL("file:///"+ inDir + "/" + inFile2+ ".xml")); |
128 |
traitTxt.transfomFileWtcBrut(outDir+"/cqp/", outFile2 + ".cqp");
|
|
128 |
traitTxt.transfomFileCqpBrut(outDir+"/cqp/", outFile2 + ".cqp");
|
|
129 | 129 |
|
130 | 130 |
System.out.println("--Building index files ...") |
131 | 131 |
String[] pAttributecm = ["pos", "q", "supplied", "col", "line", "id", "dipl", "facs"]; |
... | ... | |
159 | 159 |
*/ |
160 | 160 |
public void run() |
161 | 161 |
{ |
162 |
System.out.println "Starting script : importWTC.groovy ...";
|
|
162 |
System.out.println "Starting script : importCQP.groovy ...";
|
|
163 | 163 |
System.setProperty("javax.xml.transform.TransformerFactory", |
164 | 164 |
"net.sf.saxon.TransformerFactoryImpl"); |
165 |
this.importWTC();
|
|
165 |
this.importCQP();
|
|
166 | 166 |
System.out.println "GraalImport.groovy done." |
167 | 167 |
} |
168 | 168 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/graal/qgraalLoader.groovy (revision 804) | ||
---|---|---|
56 | 56 |
// |
57 | 57 |
//String pathToCwbBinaries = org.txm.Toolbox.getParam(org.txm.Toolbox.INSTALL_DIR)+"/cwb/bin/";//chemin vers executable cqp de l'installation |
58 | 58 |
// |
59 |
// //2- create GSP & import to WTC
|
|
59 |
// //2- create GSP & import to CQP
|
|
60 | 60 |
// println "EDITION" |
61 | 61 |
// def edition = new BuildGraalGSP(rootDir,//rootdir |
62 | 62 |
// pathToCwbBinaries// cwb utils dir |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/graal/BuildGraalGSP.groovy (revision 804) | ||
---|---|---|
128 | 128 |
return false; |
129 | 129 |
} |
130 | 130 |
|
131 |
def timportWTCestG
|
|
131 |
def timportCQPestG
|
|
132 | 132 |
def nPage; |
133 | 133 |
def temp; |
134 | 134 |
def BBM; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/graal/GraalImportCqp.groovy (revision 804) | ||
---|---|---|
101 | 101 |
* @param fileName the outfile name |
102 | 102 |
* @return true, if successful |
103 | 103 |
*/ |
104 |
public boolean transfomFileWtc(String dirPathName, String fileName){
|
|
104 |
public boolean transfomFileCqp(String dirPathName, String fileName){
|
|
105 | 105 |
if(findBody() && createOutput(dirPathName, fileName)){ |
106 | 106 |
String idColumn; |
107 | 107 |
int idParagraph = 0; |
... | ... | |
259 | 259 |
* @param fileName the file name |
260 | 260 |
* @return true, if successful |
261 | 261 |
*/ |
262 |
public boolean transfomFileWtcBrut(String dirPathName, String fileName){
|
|
262 |
public boolean transfomFileCqpBrut(String dirPathName, String fileName){
|
|
263 | 263 |
if(findBody() && createOutput(dirPathName, fileName)){ |
264 | 264 |
String idColumn = ""; |
265 | 265 |
int idParagraph = 0; |
... | ... | |
383 | 383 |
|
384 | 384 |
if (args.length == 0) { |
385 | 385 |
GraalImportCqp traitTxt = new GraalImportCqp(new URL("file://"+ inDir + "/" + inFile)); |
386 |
traitTxt.transfomFileWtc("/home/ayepdieu/srcQuete/result", "graal.cqp");
|
|
386 |
traitTxt.transfomFileCqp("/home/ayepdieu/srcQuete/result", "graal.cqp");
|
|
387 | 387 |
}else if(args.length == 3){ |
388 | 388 |
GraalImportCqp traitTxt = new GraalImportCqp(new URL(args[0])); |
389 |
traitTxt.transfomFileWtc(args[1], args[2]);
|
|
389 |
traitTxt.transfomFileCqp(args[1], args[2]);
|
|
390 | 390 |
}else{ |
391 | 391 |
System.err.println("Usage: java XHTMLOutliner url" ); |
392 | 392 |
return; |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/hyperprince/compiler.groovy (revision 804) | ||
---|---|---|
162 | 162 |
* @param idtext the idtext |
163 | 163 |
* @return true, if successful |
164 | 164 |
*/ |
165 |
public boolean transfomFileWtc(String dirPathName, String fileName,idtext)
|
|
165 |
public boolean transfomFileCqp(String dirPathName, String fileName,idtext)
|
|
166 | 166 |
{ |
167 | 167 |
createOutput(dirPathName, fileName); |
168 | 168 |
|
... | ... | |
328 | 328 |
new File(rootDir,"cqp/").mkdir(); |
329 | 329 |
new File(rootDir,"registry/").mkdir(); |
330 | 330 |
|
331 |
//1- Transform into WTC file
|
|
331 |
//1- Transform into CQP file
|
|
332 | 332 |
List<File> files = new File(rootDir,"txm").listFiles(); |
333 | 333 |
for(File f : files) { |
334 | 334 |
if(!f.exists()) { |
... | ... | |
340 | 340 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
341 | 341 |
def builder = new compiler(f.toURL(),txtname, "hyperprince", "default"); |
342 | 342 |
builder.setLang lang |
343 |
builder.transfomFileWtc(rootDir,"cqp/hyperprince.cqp",f.getName());
|
|
343 |
builder.transfomFileCqp(rootDir,"cqp/hyperprince.cqp",f.getName());
|
|
344 | 344 |
} |
345 | 345 |
} |
346 | 346 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/lasla/compiler.groovy (revision 804) | ||
---|---|---|
199 | 199 |
String textid=""; |
200 | 200 |
int counttext =0; |
201 | 201 |
List<File> files = new File(rootDirFile,"txm").listFiles(); |
202 |
//1- Transform into WTC file
|
|
202 |
//1- Transform into CQP file
|
|
203 | 203 |
def builder = null; |
204 | 204 |
for(File f : files) |
205 | 205 |
{ |
... | ... | |
210 | 210 |
} |
211 | 211 |
else |
212 | 212 |
{ |
213 |
cqpbuilder = new XMLTXM2WTC(f.toURL());
|
|
213 |
cqpbuilder = new XMLTXM2CQP(f.toURL());
|
|
214 | 214 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
215 | 215 |
cqpbuilder.setTextInfo(txtname, basename, "project"); |
216 | 216 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xmltxmpara/compiler.groovy (revision 804) | ||
---|---|---|
177 | 177 |
* @param fileName the file name |
178 | 178 |
* @return true, if successful |
179 | 179 |
*/ |
180 |
public boolean transfomFileWtc(File cqpfile)
|
|
180 |
public boolean transfomFileCqp(File cqpfile)
|
|
181 | 181 |
{ |
182 | 182 |
createOutput(cqpfile); |
183 | 183 |
String headvalue="" |
... | ... | |
379 | 379 |
output.close(); |
380 | 380 |
} |
381 | 381 |
|
382 |
//1- Transform into WTC file
|
|
382 |
//1- Transform into CQP file
|
|
383 | 383 |
def builder = null; |
384 | 384 |
for (File f : files) { |
385 | 385 |
counttext++; |
... | ... | |
390 | 390 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
391 | 391 |
builder = new compiler(f.toURL(), txtname, basename, "default"); |
392 | 392 |
builder.setLang(lang); |
393 |
if (!builder.transfomFileWtc(cqpfile))
|
|
393 |
if (!builder.transfomFileCqp(cqpfile))
|
|
394 | 394 |
return false; |
395 | 395 |
} |
396 | 396 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/perrault/allimport.groovy (revision 804) | ||
---|---|---|
420 | 420 |
* @param fileName the file name |
421 | 421 |
* @return true, if successful |
422 | 422 |
*/ |
423 |
public boolean transfomFileWtc(String dirPathName, String fileName)
|
|
423 |
public boolean transfomFileCqp(String dirPathName, String fileName)
|
|
424 | 424 |
{ |
425 | 425 |
createOutput(dirPathName, fileName); |
426 | 426 |
|
... | ... | |
597 | 597 |
|
598 | 598 |
String textid=""; |
599 | 599 |
int counttext =0; |
600 |
//1- Transform into WTC file
|
|
600 |
//1- Transform into CQP file
|
|
601 | 601 |
for(File f : files) |
602 | 602 |
{ |
603 | 603 |
counttext++; |
... | ... | |
609 | 609 |
{ |
610 | 610 |
println("process file "+f) |
611 | 611 |
compil(f.toURL(),"text"+counttext,"perrault","default"); |
612 |
transfomFileWtc(rootDir+"cqp","perrault.cqp");
|
|
612 |
transfomFileCqp(rootDir+"cqp","perrault.cqp");
|
|
613 | 613 |
} |
614 | 614 |
} |
615 | 615 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/perrault/compiler.groovy (revision 804) | ||
---|---|---|
162 | 162 |
* @param fileName the file name |
163 | 163 |
* @return true, if successful |
164 | 164 |
*/ |
165 |
public boolean transfomFileWtc(String dirPathName, String fileName)
|
|
165 |
public boolean transfomFileCqp(String dirPathName, String fileName)
|
|
166 | 166 |
{ |
167 | 167 |
createOutput(dirPathName, fileName); |
168 | 168 |
|
... | ... | |
340 | 340 |
|
341 | 341 |
String textid=""; |
342 | 342 |
int counttext =0; |
343 |
//1- Transform into WTC file
|
|
343 |
//1- Transform into CQP file
|
|
344 | 344 |
for(File f : files) |
345 | 345 |
{ |
346 | 346 |
counttext++; |
... | ... | |
354 | 354 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
355 | 355 |
def builder = new compiler(f.toURL(), txtname, "perrault", "default"); |
356 | 356 |
builder.setLang(lang); |
357 |
builder.transfomFileWtc(rootDir+"cqp","perrault.cqp");
|
|
357 |
builder.transfomFileCqp(rootDir+"cqp","perrault.cqp");
|
|
358 | 358 |
} |
359 | 359 |
} |
360 | 360 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xml/compiler.groovy (revision 804) | ||
---|---|---|
212 | 212 |
* @param fileName the file name |
213 | 213 |
* @return true, if successful |
214 | 214 |
*/ |
215 |
public boolean transfomFileWtc(File cqpFile, HashMap<String, String> textmetadata)
|
|
215 |
public boolean transfomFileCqp(File cqpFile, HashMap<String, String> textmetadata)
|
|
216 | 216 |
{ |
217 | 217 |
if (!createOutput(cqpFile)) |
218 | 218 |
return false; |
... | ... | |
465 | 465 |
String textid = "" |
466 | 466 |
int counttext = 0 |
467 | 467 |
List<File> files = txmDir.listFiles() |
468 |
//1- Transform into WTC file
|
|
468 |
//1- Transform into CQP file
|
|
469 | 469 |
def builder = null |
470 | 470 |
|
471 | 471 |
//start corpus |
... | ... | |
522 | 522 |
String txtname = f.getName().substring(0,f.getName().length()-4) |
523 | 523 |
builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default") |
524 | 524 |
builder.setLang(lang); |
525 |
if (!builder.transfomFileWtc(cqpFile, textmetadata)) {
|
|
525 |
if (!builder.transfomFileCqp(cqpFile, textmetadata)) {
|
|
526 | 526 |
println("Failed to compile "+f) |
527 | 527 |
} |
528 | 528 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/bfm/compiler.groovy (revision 804) | ||
---|---|---|
60 | 60 |
|
61 | 61 |
// TODO: Auto-generated Javadoc |
62 | 62 |
/** |
63 |
* Produce WTC files from the TEI-TXM files. <br/>
|
|
63 |
* Produce CQP files from the TEI-TXM files. <br/>
|
|
64 | 64 |
* - Read texts metadata with XPath queries <br/> |
65 | 65 |
* - Add the following word properties : sic, abbr, orig, lb and pb <br/> |
66 | 66 |
* - Keep <front>, <body> and <back> for each text <br/> |
... | ... | |
250 | 250 |
* @param fileName the file name |
251 | 251 |
* @return true, if successful |
252 | 252 |
*/ |
253 |
private boolean transfomFileWtc(File cqpFile)
|
|
253 |
private boolean transfomFileCqp(File cqpFile)
|
|
254 | 254 |
{ |
255 | 255 |
try { |
256 | 256 |
if (!createOutput(cqpFile)) return false; |
... | ... | |
838 | 838 |
output.close(); |
839 | 839 |
} |
840 | 840 |
|
841 |
//1- Transform into WTC file
|
|
841 |
//1- Transform into CQP file
|
|
842 | 842 |
for (File f : files) { |
843 | 843 |
counttext++; |
844 | 844 |
if (!f.exists()) { |
... | ... | |
848 | 848 |
String txtname = f.getName().substring(0, f.getName().length()-4); |
849 | 849 |
def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath); |
850 | 850 |
builder.setLang(lang) |
851 |
if (!builder.transfomFileWtc(cqpFile)) {
|
|
851 |
if (!builder.transfomFileCqp(cqpFile)) {
|
|
852 | 852 |
println "Failed to compile "+f |
853 | 853 |
} |
854 | 854 |
builder.setAnnotationDone(this.annotate_status); |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/tmx/compiler.groovy (revision 804) | ||
---|---|---|
189 | 189 |
* @param fileName the file name |
190 | 190 |
* @return true, if successful |
191 | 191 |
*/ |
192 |
public boolean transfomFileWtc(String dirPathName, String fileName)
|
|
192 |
public boolean transfomFileCqp(String dirPathName, String fileName)
|
|
193 | 193 |
{ |
194 | 194 |
createOutput(dirPathName, fileName); |
195 | 195 |
|
... | ... | |
477 | 477 |
new File(rootDir, "txm/"+cqpName.toUpperCase()).mkdir(); |
478 | 478 |
} |
479 | 479 |
|
480 |
//1- Transform into WTC file and build a corpus per file
|
|
480 |
//1- Transform into CQP file and build a corpus per file
|
|
481 | 481 |
Collections.sort(files); |
482 |
def anaTypesPerWtc = [:];
|
|
482 |
def anaTypesPerCqp = [:];
|
|
483 | 483 |
def builder = null; |
484 | 484 |
for (int group : langGroups.keySet()) { |
485 | 485 |
//String lang = langs.get(langGroups.get(group)[0]); |
... | ... | |
503 | 503 |
txtname = txtname.substring(0, txtname.lastIndexOf("_")); |
504 | 504 |
seg_id = segs_id.get(group); |
505 | 505 |
builder = new compiler(f.toURI().toURL(), txtname, basename, "default", tuprops); |
506 |
builder.transfomFileWtc(rootDir+"/cqp",cqpName+".cqp");
|
|
507 |
anaTypesPerWtc[cqpName] = builder.getAnaTypes();
|
|
506 |
builder.transfomFileCqp(rootDir+"/cqp",cqpName+".cqp");
|
|
507 |
anaTypesPerCqp[cqpName] = builder.getAnaTypes();
|
|
508 | 508 |
segs_id.put(group, seg_id); |
509 | 509 |
|
510 | 510 |
// move xml-txm file |
... | ... | |
534 | 534 |
String corpusname = cqpfile.getName(); |
535 | 535 |
|
536 | 536 |
corpusname = corpusname.substring(0, corpusname.length()-4); |
537 |
def corpusAnaTypes = anaTypesPerWtc[corpusname];
|
|
537 |
def corpusAnaTypes = anaTypesPerCqp[corpusname];
|
|
538 | 538 |
|
539 | 539 |
CwbEncode cwbEn = new CwbEncode(); |
540 | 540 |
cwbEn.setDebug(debug); |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/bvh/compiler.groovy (revision 804) | ||
---|---|---|
44 | 44 |
|
45 | 45 |
// TODO: Auto-generated Javadoc |
46 | 46 |
/** |
47 |
* Produce WTC files from TEI-TXM files. <br/>
|
|
47 |
* Produce CQP files from TEI-TXM files. <br/>
|
|
48 | 48 |
* Add the following word properties : "frpos","frlemme","afrpos","afrlemme","id","page","line","orig","sic","abbr". <br/> |
49 | 49 |
* |
50 | 50 |
* @author mdecorde |
... | ... | |
168 | 168 |
* @param fileName the file name |
169 | 169 |
* @return true, if successful |
170 | 170 |
*/ |
171 |
private boolean transfomFileWtc(String dirPathName, String fileName)
|
|
171 |
private boolean transfomFileCqp(String dirPathName, String fileName)
|
|
172 | 172 |
{ |
173 | 173 |
createOutput(dirPathName, fileName); |
174 | 174 |
|
... | ... | |
556 | 556 |
int counttext =0; |
557 | 557 |
List<File> files = new File(rootDirFile,"txm").listFiles(); |
558 | 558 |
Collections.sort(files); |
559 |
//1- Transform into WTC file
|
|
559 |
//1- Transform into CQP file
|
|
560 | 560 |
for(File f : files) |
561 | 561 |
{ |
562 | 562 |
counttext++; |
... | ... | |
570 | 570 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
571 | 571 |
def builder = new compiler(f.toURL(),txtname, "bvh", "default"); |
572 | 572 |
builder.setLang lang |
573 |
builder.transfomFileWtc(rootDir+"/cqp","bvh.cqp");
|
|
573 |
builder.transfomFileCqp(rootDir+"/cqp","bvh.cqp");
|
|
574 | 574 |
} |
575 | 575 |
} |
576 | 576 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/discours/compiler.groovy (revision 804) | ||
---|---|---|
195 | 195 |
} |
196 | 196 |
|
197 | 197 |
def second = 0 |
198 |
//1- Transform into WTC file
|
|
198 |
//1- Transform into CQP file
|
|
199 | 199 |
|
200 | 200 |
Collections.sort(files); |
201 |
XMLTXM2WTC cqpbuilder = null;
|
|
201 |
XMLTXM2CQP cqpbuilder = null;
|
|
202 | 202 |
for (File f : files) { |
203 | 203 |
if (second) { print(", ") } |
204 | 204 |
if (second > 0 && (second % 5) == 0) println "" |
205 | 205 |
print(f.getName().replaceFirst("\\.xml", "")); |
206 | 206 |
second++ |
207 | 207 |
|
208 |
cqpbuilder = new XMLTXM2WTC(f.toURI().toURL());
|
|
208 |
cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
|
|
209 | 209 |
String txtname = f.getName().substring(0,f.getName().length()-4); |
210 | 210 |
cqpbuilder.setTextInfo(txtname, corpusname, "project"); |
211 | 211 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/RGAQCJ/importRGAQCJ.groovy (revision 804) | ||
---|---|---|
175 | 175 |
* @param fileName the file name |
176 | 176 |
* @return true, if successful |
177 | 177 |
*/ |
178 |
public boolean transfomFileWtc(String dirPathName, String fileName)
|
|
178 |
public boolean transfomFileCqp(String dirPathName, String fileName)
|
|
179 | 179 |
{ |
180 | 180 |
if(findBody() && createOutput(dirPathName, fileName)) |
181 | 181 |
{ |
... | ... | |
352 | 352 |
ArrayList<String> types = BuildXmlRGAQCJ.getAnatypes(rootDir, files); |
353 | 353 |
println(types); |
354 | 354 |
|
355 |
//1- Transform into WTC file
|
|
355 |
//1- Transform into CQP file
|
|
356 | 356 |
File f = new File(rootDir+"cqp/","RGAQCJ.cqp"); |
357 | 357 |
f.delete(); |
358 | 358 |
|
... | ... | |
367 | 367 |
println("process file "+files[i]) |
368 | 368 |
String file = files[i]; |
369 | 369 |
def builder = new BuildXmlRGAQCJ(new File(rootDir+"/anainline/",file).toURL(),""+initiales.charAt(i),types); |
370 |
builder.transfomFileWtc(rootDir+"cqp","RGAQCJ.cqp");
|
|
370 |
builder.transfomFileCqp(rootDir+"cqp","RGAQCJ.cqp");
|
|
371 | 371 |
} |
372 | 372 |
|
373 | 373 |
output = new OutputStreamWriter(new FileOutputStream(f,true) , "UTF-8"); |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/RGAQCJ/compiler.groovy (revision 804) | ||
---|---|---|
167 | 167 |
* @param fileName the file name |
168 | 168 |
* @return true, if successful |
169 | 169 |
*/ |
170 |
public boolean transfomFileWtc(String dirPathName, String fileName)
|
|
170 |
public boolean transfomFileCqp(String dirPathName, String fileName)
|
|
171 | 171 |
{ |
172 | 172 |
createOutput(dirPathName, fileName); |
173 | 173 |
|
... | ... | |
526 | 526 |
String textid=""; |
527 | 527 |
int counttext =0; |
528 | 528 |
List<File> files = new File(rootDirFile,"txm").listFiles(); |
529 |
//1- Transform into WTC file
|
|
529 |
//1- Transform into CQP file
|
|
530 | 530 |
for(File f : files) |
531 | 531 |
{ |
532 | 532 |
counttext++; |
... | ... | |
539 | 539 |
println("process file "+f) |
540 | 540 |
def builder = new compiler(f.toURL(),f.getName(),"bfm3tt","default"); |
541 | 541 |
builder.setLang(lang); |
542 |
builder.transfomFileWtc(rootDir+"/cqp","bfm3tt.cqp");
|
|
542 |
builder.transfomFileCqp(rootDir+"/cqp","bfm3tt.cqp");
|
|
543 | 543 |
} |
544 | 544 |
} |
545 | 545 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/cqp/pager.groovy (revision 804) | ||
---|---|---|
37 | 37 |
import org.xml.sax.InputSource; |
38 | 38 |
import org.xml.sax.helpers.DefaultHandler; |
39 | 39 |
|
40 |
/** Build WTC corpus simple edition from a WTC file.
|
|
40 |
/** Build CQP corpus simple edition from a CQP file.
|
|
41 | 41 |
* |
42 | 42 |
* @author mdecorde |
43 | 43 |
*/ |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/cqp/compiler.groovy (revision 804) | ||
---|---|---|
55 | 55 |
/** The dir. */ |
56 | 56 |
private def dir; |
57 | 57 |
|
58 |
File srcWTCFile, srcRegistryFile;
|
|
58 |
File srcCQPFile, srcRegistryFile;
|
|
59 | 59 |
|
60 | 60 |
public def pAttributesList = []; |
61 | 61 |
public def sAttributesList = []; |
62 | 62 |
|
63 | 63 |
public compiler(File cqpFile, File registryFile) { |
64 |
this.srcWTCFile = cqpFile;
|
|
64 |
this.srcCQPFile = cqpFile;
|
|
65 | 65 |
this.srcRegistryFile = registryFile |
66 | 66 |
} |
67 | 67 |
|
... | ... | |
93 | 93 |
return false; |
94 | 94 |
} |
95 | 95 |
|
96 |
if (!srcWTCFile.exists()) {
|
|
97 |
println "Error: cannot find the WTC file $srcWTCFile"
|
|
96 |
if (!srcCQPFile.exists()) {
|
|
97 |
println "Error: cannot find the CQP file $srcCQPFile"
|
|
98 | 98 |
return false; |
99 | 99 |
} |
100 | 100 |
|
... | ... | |
105 | 105 |
new File(binDir, "data").mkdir(); |
106 | 106 |
new File(binDir, "registry").mkdir(); |
107 | 107 |
|
108 |
FileCopy.copy(srcWTCFile, cqpFile);
|
|
108 |
FileCopy.copy(srcCQPFile, cqpFile);
|
|
109 | 109 |
|
110 | 110 |
//2- Import into CWB |
111 | 111 |
def outDir = binDir.getAbsolutePath()+"/"; |
... | ... | |
120 | 120 |
|
121 | 121 |
if (srcRegistryFile == null) { |
122 | 122 |
println "WARNING: No registry file found in source directory" |
123 |
println "We'll search for positional attributes and structural attributes in the WTC file"
|
|
123 |
println "We'll search for positional attributes and structural attributes in the CQP file"
|
|
124 | 124 |
|
125 | 125 |
// s attributes |
126 |
BuildCwbEncodeArgsFromWTC argsgetter = new BuildCwbEncodeArgsFromWTC(); // XML stream
|
|
126 |
BuildCwbEncodeArgsFromCQP argsgetter = new BuildCwbEncodeArgsFromCQP(); // XML stream
|
|
127 | 127 |
argsgetter.process(cqpFile); //$NON-NLS-1$ |
128 | 128 |
sAttributesList = argsgetter.getSAttributes(); |
129 | 129 |
|
... | ... | |
131 | 131 |
int nbAttr = -1; |
132 | 132 |
File tmp = File.createTempFile("txm", ".cqp", cqpFile.getParentFile()); |
133 | 133 |
int wcounter = 1; |
134 |
println "Adding the 'id' property to the WTC file and getting word properties number."
|
|
134 |
println "Adding the 'id' property to the CQP file and getting word properties number."
|
|
135 | 135 |
tmp.withWriter("UTF-8") { writer -> |
136 | 136 |
cqpFile.eachLine("UTF-8") { line -> |
137 | 137 |
if (!line.startsWith("<")) { |
tmp/org.txm.groovy.core/src/groovy/org/txm/importer/cqp/cqpLoader.groovy (revision 804) | ||
---|---|---|
80 | 80 |
if (f.getName().endsWith(".cqp")) { |
81 | 81 |
if (cqpFile == null) { |
82 | 82 |
cqpFile = f; |
83 |
println "Using WTC file : $cqpFile"
|
|
83 |
println "Using CQP file : $cqpFile"
|
|
84 | 84 |
} |
85 | 85 |
} else if (f.getName().endsWith(".xml")) { |
86 | 86 |
|
... | ... | |
93 | 93 |
} |
94 | 94 |
|
95 | 95 |
if (cqpFile == null) { |
96 |
println "No WTC file in source directory: "+srcDir;
|
|
96 |
println "No CQP file in source directory: "+srcDir;
|
|
97 | 97 |
return false; |
98 | 98 |
} |
99 | 99 |
|
... | ... | |
125 | 125 |
List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang); |
126 | 126 |
List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang); |
127 | 127 |
|
128 |
File binWtcFile = new File(binDir,"cqp/"+corpusname+".cqp");
|
|
129 |
def ed = new pager(binWtcFile, outdir, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, registryFile !=null, c.pAttributesList);
|
|
128 |
File binCqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
|
|
129 |
def ed = new pager(binCqpFile, outdir, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, registryFile !=null, c.pAttributesList);
|
|
130 | 130 |
def allIndexes = ed.getIdx(); |
131 | 131 |
def allPages = ed.getPageFiles(); |
132 | 132 |
for (String txtname : ed.getTextNames()) { |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/teitxm/AnnotateWTC.groovy (revision 804) | ||
---|---|---|
46 | 46 |
* creates $rootDirFile/interp and $rootDirFile/treetagger |
47 | 47 |
* |
48 | 48 |
*/ |
49 |
class AnnotateWTC {
|
|
49 |
class AnnotateCQP {
|
|
50 | 50 |
|
51 | 51 |
/** The debug. */ |
52 | 52 |
boolean debug = true; |
tmp/org.txm.searchengine.cqp.core/src/org/txm/importer/cwb/BuildCwbEncodeArgsFromWTC.java (revision 804) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2013-08-29 14:23:55 +0200 (Thu, 29 Aug 2013) $ |
|
25 |
// $LastChangedRevision: 2519 $ |
|
26 |
// $LastChangedBy: nilskredens $ |
|
27 |
// |
|
28 |
package org.txm.importer.cwb; |
|
29 |
|
|
30 |
import java.io.BufferedReader; |
|
31 |
import java.io.File; |
|
32 |
import java.io.FileInputStream; |
|
33 |
import java.io.IOException; |
|
34 |
import java.io.InputStreamReader; |
|
35 |
import java.util.ArrayList; |
|
36 |
import java.util.Collection; |
|
37 |
import java.util.HashMap; |
|
38 |
import java.util.HashSet; |
|
39 |
import java.util.LinkedList; |
|
40 |
import java.util.regex.Matcher; |
|
41 |
import java.util.regex.Pattern; |
|
42 |
|
|
43 |
import javax.xml.stream.FactoryConfigurationError; |
|
44 |
import javax.xml.stream.XMLStreamException; |
|
45 |
|
|
46 |
import org.txm.core.messages.TXMCoreMessages; |
|
47 |
|
|
48 |
// TODO: Auto-generated Javadoc |
|
49 |
/** |
|
50 |
* build the sAttributes and pAttributs List from an xmlfile pAttributes |
|
51 |
* (attributes of tei:w tags) and the sAttributes (attributes + max inclusion of |
|
52 |
* tags). |
|
53 |
* |
|
54 |
* @author mdecorde |
|
55 |
*/ |
|
56 |
public class BuildCwbEncodeArgsFromWTC { |
|
57 |
|
|
58 |
/** The stack. */ |
|
59 |
LinkedList<String> stack = new LinkedList<String>(); |
|
60 |
|
|
61 |
/** The tagcount. */ |
|
62 |
HashMap<String, Integer> tagcount = new HashMap<String, Integer>(); |
|
63 |
|
|
64 |
/** The tagmax. */ |
|
65 |
HashMap<String, Integer> tagmax = new HashMap<String, Integer>(); |
|
66 |
|
|
67 |
/** The tagattrs. */ |
|
68 |
HashMap<String, HashSet<String>> tagattrs = new HashMap<String, HashSet<String>>(); |
|
69 |
int nWordProps = 0; |
|
70 |
|
|
71 |
/** |
|
72 |
* process !!!. |
|
73 |
* |
|
74 |
* @param cqpfile the xmlfile |
|
75 |
* @param wordstag the wordstag |
|
76 |
* @return true if successful |
|
77 |
* @throws XMLStreamException the xML stream exception |
|
78 |
* @throws FactoryConfigurationError the factory configuration error |
|
79 |
* @throws IOException Signals that an I/O exception has occurred. |
|
80 |
*/ |
|
81 |
public boolean process(File cqpfile) |
|
82 |
throws XMLStreamException, FactoryConfigurationError, IOException { |
|
83 |
if (cqpfile == null || !cqpfile.exists()) { |
|
84 |
System.out.println(TXMCoreMessages.bind(TXMCoreMessages.BuildCwbEncodeArgs_0, cqpfile)); |
|
85 |
return false; |
|
86 |
} |
|
87 |
|
|
88 |
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(cqpfile), "UTF-8")); |
|
89 |
|
|
90 |
String line = reader.readLine(); |
|
91 |
String localname; |
|
92 |
|
|
93 |
|
|
94 |
Pattern p = Pattern.compile("(([^= ]+)=\"([^\"]+)\")"); |
|
95 |
//for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
|
96 |
for(; line != null ; line = reader.readLine()) { |
|
97 |
|
|
98 |
if (line.startsWith("</") && line.endsWith(">")) { // END_ELEMENT |
|
99 |
int idx = line.indexOf(" "); |
|
100 |
if (idx == -1) idx = line.indexOf(">"); |
|
101 |
localname = line.substring(2, idx); |
|
102 |
|
|
103 |
if (stack.size() > 0) { |
|
104 |
stack.pop(); |
|
105 |
int count = tagcount.get(localname); |
|
106 |
count--; |
|
107 |
tagcount.put(localname, count);// decrement the |
|
108 |
// recursion counter of the tag |
|
109 |
} |
|
110 |
|
|
111 |
} else if (line.startsWith("<") && line.endsWith("/>")) { // EMPTY_ELEMENT |
|
112 |
// ignore milestones |
|
113 |
} else if (line.startsWith("<") && line.endsWith(">")) { // START_ELEMENT |
|
114 |
int idx = line.indexOf(" "); |
|
115 |
if (idx == -1) idx = line.indexOf(">"); // no attributes |
|
116 |
localname = line.substring(1, idx); |
|
117 |
|
|
118 |
stack.push(localname); |
|
119 |
|
|
120 |
if (!tagattrs.containsKey(localname))// initialize |
|
121 |
// HashMaps |
|
122 |
{ |
|
123 |
tagattrs.put(localname, new HashSet<String>()); |
|
124 |
tagcount.put(localname, 0); |
|
125 |
tagmax.put(localname, 0); |
|
126 |
} |
|
127 |
|
|
128 |
Matcher m = p.matcher(line); |
|
129 |
while (m.find()) { |
|
130 |
if (m.groupCount() == 3) { |
|
131 |
tagattrs.get(localname).add(m.group(2)); |
|
132 |
} |
|
133 |
} |
|
134 |
|
|
135 |
int count = tagcount.get(localname); |
|
136 |
count++; |
|
137 |
tagcount.put(localname, count);// increment the |
|
138 |
// recursion counter of the tag |
|
139 |
int max = tagmax.get(localname); |
|
140 |
if (max < count) |
|
141 |
tagmax.put(localname, count);// update max |
|
142 |
// recursion of the tag |
|
143 |
|
|
144 |
} else { // WORD LINE |
|
145 |
if (nWordProps == 0) { |
|
146 |
//System.out.println("Line: "+line); |
|
147 |
String[] split = line.split("\t"); |
|
148 |
nWordProps = split.length; |
|
149 |
//System.out.println("nprops: "+nWordProps); |
|
150 |
} |
|
151 |
|
|
152 |
} |
|
153 |
} // end while |
|
154 |
reader.close(); |
|
155 |
return true; |
|
156 |
} |
|
157 |
|
|
158 |
/** |
|
159 |
* Gets the p attributes. |
|
160 |
* |
|
161 |
* @return the p attributes |
|
162 |
*/ |
|
163 |
public Collection<String> getPAttributes() { |
|
164 |
ArrayList<String> props = new ArrayList<String>(); |
|
165 |
for( int i = 1 ; i < nWordProps ; i++) { // don't count the "word" property |
|
166 |
props.add("p"+i); |
|
167 |
} |
|
168 |
return props; |
|
169 |
} |
|
170 |
|
|
171 |
/** |
|
172 |
* Gets the s attributes. |
|
173 |
* |
|
174 |
* @return the s attributes |
|
175 |
*/ |
|
176 |
public Collection<String> getSAttributes() { |
|
177 |
ArrayList<String> rez = new ArrayList<String>(); |
|
178 |
|
|
179 |
for (String tag : tagattrs.keySet()) { |
|
180 |
String s = tag.toLowerCase(); |
|
181 |
s += ":"; //$NON-NLS-1$ |
|
182 |
|
|
183 |
if (tagmax.get(tag) >= 0) |
|
184 |
s += "" + (tagmax.get(tag) - 1); //$NON-NLS-1$ |
|
185 |
|
|
186 |
for (String attr : tagattrs.get(tag)) |
|
187 |
s += "+" + attr.toLowerCase(); //$NON-NLS-1$ |
|
188 |
rez.add(s); |
|
189 |
} |
|
190 |
return rez; |
|
191 |
} |
|
192 |
|
|
193 |
/** |
|
194 |
* The main method. |
|
195 |
* |
|
196 |
* @param args the arguments |
|
197 |
*/ |
|
198 |
public static void main(String[] args) { |
|
199 |
try { |
|
200 |
BuildCwbEncodeArgsFromWTC argsgetter = new BuildCwbEncodeArgsFromWTC(); |
|
201 |
argsgetter.process(new File( |
|
202 |
"/home/mdecorde/xml/cqp/discours.cqp")); //$NON-NLS-1$ |
|
203 |
|
|
204 |
System.out.println("pAttributes : " + argsgetter.getPAttributes()); //$NON-NLS-1$ |
|
205 |
System.out.println("sAttributes : " + argsgetter.getSAttributes()); //$NON-NLS-1$ |
|
206 |
} catch (XMLStreamException e) { |
|
207 |
// TODO Auto-generated catch block |
|
208 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
209 |
} catch (FactoryConfigurationError e) { |
|
210 |
// TODO Auto-generated catch block |
|
211 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
212 |
} catch (IOException e) { |
|
213 |
// TODO Auto-generated catch block |
|
214 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
215 |
} |
|
216 |
} |
|
217 |
} |
tmp/org.txm.searchengine.cqp.core/src/org/txm/importer/cwb/BuildAlignOut.java (revision 804) | ||
---|---|---|
8 | 8 |
|
9 | 9 |
import org.txm.utils.io.IOUtils; |
10 | 10 |
/** |
11 |
* Create the CWB "align.out" file usually created with cwb-align using 2 WTC files. The alignement structure and property is not checked. Only structure order is used.
|
|
11 |
* Create the CWB "align.out" file usually created with cwb-align using 2 CQP files. The alignement structure and property is not checked. Only structure order is used.
|
|
12 | 12 |
* |
13 | 13 |
* @author mdecorde |
14 | 14 |
* |
tmp/org.txm.searchengine.cqp.core/src/org/txm/importer/cwb/CwbEncode.java (revision 804) | ||
---|---|---|
338 | 338 |
} |
339 | 339 |
|
340 | 340 |
/** |
341 |
* concatenates WTC files into one master WTC file and wraps with the 'txmcorpus' structure.
|
|
341 |
* concatenates CQP files into one master CQP file and wraps with the 'txmcorpus' structure.
|
|
342 | 342 |
* |
343 | 343 |
* @param cqpFiles |
344 | 344 |
* @param allcqpFile |
345 |
* @return true if the WTC files are concatenated
|
|
345 |
* @return true if the CQP files are concatenated
|
|
346 | 346 |
* @throws IOException |
347 | 347 |
*/ |
348 | 348 |
public static boolean concat(ArrayList<File> cqpFiles, File allcqpFile) throws IOException { |
tmp/org.txm.searchengine.cqp.core/src/org/txm/importer/cwb/BuildCwbEncodeArgsFromCQP.java (revision 804) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2013-08-29 14:23:55 +0200 (Thu, 29 Aug 2013) $ |
|
25 |
// $LastChangedRevision: 2519 $ |
|
26 |
// $LastChangedBy: nilskredens $ |
|
27 |
// |
|
28 |
package org.txm.importer.cwb; |
|
29 |
|
|
30 |
import java.io.BufferedReader; |
|
31 |
import java.io.File; |
|
32 |
import java.io.FileInputStream; |
|
33 |
import java.io.IOException; |
|
34 |
import java.io.InputStreamReader; |
|
35 |
import java.util.ArrayList; |
|
36 |
import java.util.Collection; |
|
37 |
import java.util.HashMap; |
|
38 |
import java.util.HashSet; |
|
39 |
import java.util.LinkedList; |
|
40 |
import java.util.regex.Matcher; |
|
41 |
import java.util.regex.Pattern; |
|
42 |
|
|
43 |
import javax.xml.stream.FactoryConfigurationError; |
|
44 |
import javax.xml.stream.XMLStreamException; |
|
45 |
|
|
46 |
import org.txm.core.messages.TXMCoreMessages; |
|
47 |
|
|
48 |
// TODO: Auto-generated Javadoc |
|
49 |
/** |
|
50 |
* build the sAttributes and pAttributs List from an xmlfile pAttributes |
|
51 |
* (attributes of tei:w tags) and the sAttributes (attributes + max inclusion of |
|
52 |
* tags). |
|
53 |
* |
|
54 |
* @author mdecorde |
|
55 |
*/ |
|
56 |
public class BuildCwbEncodeArgsFromCQP { |
|
57 |
|
|
58 |
/** The stack. */ |
|
59 |
LinkedList<String> stack = new LinkedList<String>(); |
|
60 |
|
|
61 |
/** The tagcount. */ |
|
62 |
HashMap<String, Integer> tagcount = new HashMap<String, Integer>(); |
|
63 |
|
|
64 |
/** The tagmax. */ |
|
65 |
HashMap<String, Integer> tagmax = new HashMap<String, Integer>(); |
|
66 |
|
|
67 |
/** The tagattrs. */ |
|
68 |
HashMap<String, HashSet<String>> tagattrs = new HashMap<String, HashSet<String>>(); |
|
69 |
int nWordProps = 0; |
|
70 |
|
|
71 |
/** |
|
72 |
* process !!!. |
|
73 |
* |
|
74 |
* @param cqpfile the xmlfile |
|
75 |
* @param wordstag the wordstag |
|
76 |
* @return true if successful |
|
77 |
* @throws XMLStreamException the xML stream exception |
|
78 |
* @throws FactoryConfigurationError the factory configuration error |
|
79 |
* @throws IOException Signals that an I/O exception has occurred. |
|
80 |
*/ |
|
81 |
public boolean process(File cqpfile) |
|
82 |
throws XMLStreamException, FactoryConfigurationError, IOException { |
|
83 |
if (cqpfile == null || !cqpfile.exists()) { |
|
84 |
System.out.println(TXMCoreMessages.bind(TXMCoreMessages.BuildCwbEncodeArgs_0, cqpfile)); |
|
85 |
return false; |
|
86 |
} |
|
87 |
|
|
88 |
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(cqpfile), "UTF-8")); |
|
89 |
|
|
90 |
String line = reader.readLine(); |
|
91 |
String localname; |
|
92 |
|
|
93 |
|
|
94 |
Pattern p = Pattern.compile("(([^= ]+)=\"([^\"]+)\")"); |
|
95 |
//for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
|
96 |
for(; line != null ; line = reader.readLine()) { |
|
97 |
|
|
98 |
if (line.startsWith("</") && line.endsWith(">")) { // END_ELEMENT |
|
99 |
int idx = line.indexOf(" "); |
|
100 |
if (idx == -1) idx = line.indexOf(">"); |
|
101 |
localname = line.substring(2, idx); |
|
102 |
|
|
103 |
if (stack.size() > 0) { |
|
104 |
stack.pop(); |
|
105 |
int count = tagcount.get(localname); |
|
106 |
count--; |
|
107 |
tagcount.put(localname, count);// decrement the |
|
108 |
// recursion counter of the tag |
|
109 |
} |
|
110 |
|
|
111 |
} else if (line.startsWith("<") && line.endsWith("/>")) { // EMPTY_ELEMENT |
|
112 |
// ignore milestones |
|
113 |
} else if (line.startsWith("<") && line.endsWith(">")) { // START_ELEMENT |
|
114 |
int idx = line.indexOf(" "); |
|
115 |
if (idx == -1) idx = line.indexOf(">"); // no attributes |
|
116 |
localname = line.substring(1, idx); |
|
117 |
|
|
118 |
stack.push(localname); |
|
119 |
|
|
120 |
if (!tagattrs.containsKey(localname))// initialize |
|
121 |
// HashMaps |
|
122 |
{ |
|
123 |
tagattrs.put(localname, new HashSet<String>()); |
|
124 |
tagcount.put(localname, 0); |
|
125 |
tagmax.put(localname, 0); |
|
126 |
} |
|
127 |
|
|
128 |
Matcher m = p.matcher(line); |
|
129 |
while (m.find()) { |
|
130 |
if (m.groupCount() == 3) { |
|
131 |
tagattrs.get(localname).add(m.group(2)); |
|
132 |
} |
|
133 |
} |
|
134 |
|
|
135 |
int count = tagcount.get(localname); |
|
136 |
count++; |
|
137 |
tagcount.put(localname, count);// increment the |
|
138 |
// recursion counter of the tag |
|
139 |
int max = tagmax.get(localname); |
|
140 |
if (max < count) |
|
141 |
tagmax.put(localname, count);// update max |
|
142 |
// recursion of the tag |
|
143 |
|
|
144 |
} else { // WORD LINE |
|
145 |
if (nWordProps == 0) { |
|
146 |
//System.out.println("Line: "+line); |
|
147 |
String[] split = line.split("\t"); |
|
148 |
nWordProps = split.length; |
|
149 |
//System.out.println("nprops: "+nWordProps); |
|
150 |
} |
|
151 |
|
|
152 |
} |
|
153 |
} // end while |
|
154 |
reader.close(); |
|
155 |
return true; |
|
156 |
} |
|
157 |
|
|
158 |
/** |
|
159 |
* Gets the p attributes. |
|
160 |
* |
|
161 |
* @return the p attributes |
|
162 |
*/ |
|
163 |
public Collection<String> getPAttributes() { |
|
164 |
ArrayList<String> props = new ArrayList<String>(); |
|
165 |
for( int i = 1 ; i < nWordProps ; i++) { // don't count the "word" property |
|
166 |
props.add("p"+i); |
|
167 |
} |
|
168 |
return props; |
|
169 |
} |
|
170 |
|
|
171 |
/** |
|
172 |
* Gets the s attributes. |
|
173 |
* |
|
174 |
* @return the s attributes |
|
175 |
*/ |
|
176 |
public Collection<String> getSAttributes() { |
|
177 |
ArrayList<String> rez = new ArrayList<String>(); |
|
178 |
|
|
179 |
for (String tag : tagattrs.keySet()) { |
|
180 |
String s = tag.toLowerCase(); |
|
181 |
s += ":"; //$NON-NLS-1$ |
|
182 |
|
|
183 |
if (tagmax.get(tag) >= 0) |
|
184 |
s += "" + (tagmax.get(tag) - 1); //$NON-NLS-1$ |
|
185 |
|
|
186 |
for (String attr : tagattrs.get(tag)) |
|
187 |
s += "+" + attr.toLowerCase(); //$NON-NLS-1$ |
|
188 |
rez.add(s); |
|
189 |
} |
|
190 |
return rez; |
|
191 |
} |
|
192 |
|
|
193 |
/** |
|
194 |
* The main method. |
|
195 |
* |
|
196 |
* @param args the arguments |
|
197 |
*/ |
|
198 |
public static void main(String[] args) { |
|
199 |
try { |
|
200 |
BuildCwbEncodeArgsFromCQP argsgetter = new BuildCwbEncodeArgsFromCQP(); |
|
201 |
argsgetter.process(new File( |
|
202 |
"/home/mdecorde/xml/cqp/discours.cqp")); //$NON-NLS-1$ |
|
203 |
|
|
204 |
System.out.println("pAttributes : " + argsgetter.getPAttributes()); //$NON-NLS-1$ |
|
205 |
System.out.println("sAttributes : " + argsgetter.getSAttributes()); //$NON-NLS-1$ |
|
206 |
} catch (XMLStreamException e) { |
|
207 |
// TODO Auto-generated catch block |
|
208 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
209 |
} catch (FactoryConfigurationError e) { |
|
210 |
// TODO Auto-generated catch block |
|
211 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
212 |
} catch (IOException e) { |
|
213 |
// TODO Auto-generated catch block |
|
214 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
215 |
} |
|
216 |
} |
|
217 |
} |
|
0 | 218 |
tmp/org.txm.rcp.feature/build.properties (revision 804) | ||
---|---|---|
3 | 3 |
TXM.sh |
4 | 4 |
|
5 | 5 |
root.linux.gtk.x86_64 = linux/ |
6 |
root.linux.gtk.x86_64.permissions.755 = jre/bin/java |
|
6 |
root.linux.gtk.x86_64.permissions.755 = jre/bin/java,\ |
|
7 |
plugins/org.txm.libs.cqp.linux/res/linux32/,\ |
|
8 |
plugins/org.txm.libs.cqp.linux/res/linux64/ |
|
7 | 9 |
|
8 | 10 |
root.macosx.cocoa.x86_64 = mac/ |
9 |
|
|
11 |
root.macosx.cocoa.x86_64.permissions.755 = jre/bin/java,\ |
|
12 |
plugins/org.txm.libs.cqp.macosx/res/macosx/ |
|
13 |
|
|
10 | 14 |
root.win32.win32.x86_64 = win/,\ |
11 | 15 |
../org.txm.libs.cqp.win32/res/win64/dependencies/ |
12 | 16 |
|
tmp/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/ImportModuleCustomization.java (revision 804) | ||
---|---|---|
66 | 66 |
params.put(TEXTUALPLANS, false); |
67 | 67 |
params.put(OPTIONS, false); |
68 | 68 |
allParams.put("cqpLoader.groovy", params); //$NON-NLS-1$ |
69 |
names.put("cqpLoader.groovy", "WTC"); //$NON-NLS-1$ //$NON-NLS-2$
|
|
69 |
names.put("cqpLoader.groovy", "CQP"); //$NON-NLS-1$ //$NON-NLS-2$
|
|
70 | 70 |
|
71 | 71 |
params = new HashMap<String, Boolean>(); |
72 | 72 |
params.put(PARALLEL, false); |
Formats disponibles : Unified diff