Révision 3407
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 3407) | ||
---|---|---|
172 | 172 |
println "Failed to prepare files - Aborting"; |
173 | 173 |
return; |
174 | 174 |
} |
175 |
if (MONITOR != null) MONITOR.worked(20) |
|
175 |
if (MONITOR != null) MONITOR.worked(20) |
|
176 |
|
|
177 |
// File antractXSL = new File(srcDir, "special.xsl") |
|
178 |
// if (antractXSL.exists()) { |
|
179 |
// println "Applying special XSL: $antractXSL" |
|
180 |
// if (!ApplyXsl2.processImportSources(antractXSL, txmDir, txmDir)) { |
|
181 |
// println "Error while processing XML-TXM files." |
|
182 |
// return false; |
|
183 |
// } |
|
184 |
// } |
|
176 | 185 |
|
177 | 186 |
println "-- Xml Validation" |
178 | 187 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
... | ... | |
187 | 196 |
cpb.done() |
188 | 197 |
|
189 | 198 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
190 |
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE") |
|
199 |
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE") |
|
200 |
|
|
201 |
|
|
191 | 202 |
|
192 | 203 |
boolean annotationSuccess = false; |
193 | 204 |
if (annotate) { |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 3407) | ||
---|---|---|
46 | 46 |
* The Class compiler. |
47 | 47 |
*/ |
48 | 48 |
class compiler { |
49 |
|
|
49 |
|
|
50 | 50 |
boolean ADD_TEXTID_TO_REF = true |
51 | 51 |
|
52 | 52 |
/** The input data. */ |
53 | 53 |
private def inputData; |
54 |
|
|
54 |
|
|
55 | 55 |
/** The factory. */ |
56 | 56 |
private def factory; |
57 |
|
|
57 |
|
|
58 | 58 |
/** The parser. */ |
59 | 59 |
private XMLStreamReader parser; |
60 |
|
|
60 |
|
|
61 | 61 |
/** The output. */ |
62 | 62 |
OutputStreamWriter output; |
63 |
|
|
63 |
|
|
64 | 64 |
/** The basename. */ |
65 | 65 |
String corpusname; |
66 |
|
|
66 |
|
|
67 | 67 |
/** The projectname. */ |
68 | 68 |
String projectname |
69 |
|
|
69 |
|
|
70 | 70 |
/** The outdir. */ |
71 | 71 |
String outdir; |
72 |
|
|
72 |
|
|
73 | 73 |
/** The debug. */ |
74 | 74 |
boolean debug = false; |
75 |
|
|
75 |
|
|
76 | 76 |
/** The indexInterviewer: index interviewer speech if true. */ |
77 | 77 |
boolean indexInterviewer = true; |
78 |
|
|
78 |
|
|
79 | 79 |
/** The trans. */ |
80 | 80 |
HashMap<String, ArrayList<Pair<String, String>>> trans; |
81 |
|
|
81 |
|
|
82 | 82 |
/** The speakers. */ |
83 | 83 |
HashMap<String, ArrayList<Pair<String, String>>> speakers; |
84 |
|
|
84 |
|
|
85 | 85 |
/** The speakersname. */ |
86 | 86 |
HashMap<String, String> speakersname = new HashMap<String, String>(); |
87 |
|
|
87 |
|
|
88 | 88 |
/** The topics. */ |
89 | 89 |
HashMap<String, ArrayList<Pair<String, String>>> topics; |
90 |
|
|
91 |
|
|
90 |
|
|
91 |
|
|
92 | 92 |
/** The interviewers regex */ |
93 | 93 |
def interviewers = null |
94 | 94 |
static LinkedHashSet<String> sectionAttrs; |
95 | 95 |
static LinkedHashSet<String> spAttrs; |
96 | 96 |
static LinkedHashSet<String> uAttrs; |
97 |
|
|
97 |
|
|
98 | 98 |
/** The anatypes. */ |
99 | 99 |
private static anatypes = [] |
100 | 100 |
private static anavalues = [:] |
101 |
|
|
101 |
|
|
102 | 102 |
/** |
103 | 103 |
* Removes the interviewers. |
104 | 104 |
* |
... | ... | |
108 | 108 |
public setIndexInterviewer(boolean value) { |
109 | 109 |
this.indexInterviewer = value; |
110 | 110 |
} |
111 |
|
|
111 |
|
|
112 | 112 |
File cqpFile |
113 |
LinkedHashMap<String, LinkedHashMap<String, String>> projectionsFromValues = new LinkedHashMap<String, LinkedHashMap<String, String>>(); // values of properties to inject |
|
114 |
LinkedHashMap<String, LinkedHashMap<String, ArrayList<ArrayList>>> projectionsToDo = new LinkedHashMap<String, LinkedHashMap<String, ArrayList<ArrayList>>>(); // list of projections to do |
|
115 |
|
|
113 | 116 |
/** |
114 | 117 |
* Run. |
115 | 118 |
* |
... | ... | |
125 | 128 |
this.outdir = binDir; |
126 | 129 |
this.corpusname = corpusname; |
127 | 130 |
this.projectname = projectname; |
128 |
|
|
131 |
|
|
129 | 132 |
anatypes = ["event"] // reset |
130 | 133 |
anavalues = [:] // reset |
131 |
|
|
134 |
|
|
132 | 135 |
sectionAttrs = new LinkedHashSet<String>() // reset section attributs set |
133 | 136 |
spAttrs = new LinkedHashSet<String>() // reset section attributs set |
134 | 137 |
uAttrs = new LinkedHashSet<String>() // reset section attributs set |
135 |
|
|
138 |
|
|
136 | 139 |
CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class); |
137 | 140 |
if (corpus != null) { |
138 | 141 |
if (project.getDoUpdate()) { |
... | ... | |
152 | 155 |
new File(binDir,"cqp").mkdirs() |
153 | 156 |
new File(binDir,"data").mkdirs() |
154 | 157 |
new File(binDir,"registry").mkdirs() |
155 |
|
|
158 |
|
|
156 | 159 |
// get all anatypes |
157 | 160 |
for (File f : xmlfiles) { |
158 | 161 |
getAnaTypes(f) |
159 | 162 |
} |
163 |
|
|
164 |
// Building projections datas to use for each step |
|
165 |
String projectionsParameterValue = project.getTextualPlan("Projections").trim() |
|
166 |
projectionsParameterValue = projectionsParameterValue.replace("\n", "\t") |
|
167 |
def projectionsParameter = projectionsParameterValue.split("\t"); |
|
168 |
if (projectionsParameterValue.length() > 0) { |
|
169 |
for (def projection : projectionsParameter) { |
|
170 |
if (!projection.contains("->")) continue; |
|
171 |
String[] fromTo = projection.split("->", 2) |
|
172 |
String from = fromTo[0].trim() |
|
173 |
String to = fromTo[1].trim() |
|
174 |
if (projection.contains("->") && from.contains("_") && to.contains("_")) { |
|
175 |
String toStructure = to.substring(0, to.indexOf("_")) |
|
176 |
String toStructureProperty = to.substring(to.indexOf("_") + 1) |
|
177 |
String fromStructure = from.substring(0, from.indexOf("_")) |
|
178 |
String fromStructureProperty = from.substring(from.indexOf("_") + 1) |
|
179 |
|
|
180 |
if (!projectionsToDo.containsKey(toStructure)) { |
|
181 |
projectionsToDo[toStructure] = new LinkedHashMap<String, ArrayList<ArrayList>>(); |
|
182 |
} |
|
183 |
if (!projectionsToDo[toStructure].containsKey(fromStructure)) { |
|
184 |
projectionsToDo[toStructure][fromStructure] = new ArrayList<ArrayList>(); |
|
185 |
} |
|
186 |
projectionsToDo[toStructure][fromStructure].add([toStructureProperty, fromStructureProperty]) |
|
187 |
|
|
188 |
if (!projectionsFromValues.containsKey(fromStructure)) projectionsFromValues[fromStructure] = new LinkedHashMap<String, String>(); |
|
189 |
projectionsFromValues[fromStructure][fromStructureProperty] = ""; |
|
190 |
} |
|
191 |
} |
|
192 |
} |
|
193 |
|
|
160 | 194 |
//println "ANATYPES: "+anatypes |
161 | 195 |
if (!createOutput(cqpFile)) return false; |
162 | 196 |
output.write("<txmcorpus lang=\"fr\">\n") |
163 | 197 |
output.close(); |
164 |
|
|
198 |
|
|
165 | 199 |
println("Compiling "+xmlfiles.size()+" files") |
166 | 200 |
ConsoleProgressBar cpb = new ConsoleProgressBar(xmlfiles.size()) |
167 | 201 |
for (File txmFile :xmlfiles) { |
... | ... | |
176 | 210 |
if (!createOutput(cqpFile)) return false; |
177 | 211 |
output.write("</txmcorpus>\n") |
178 | 212 |
output.close(); |
179 |
|
|
213 |
|
|
180 | 214 |
//2- Import into CWB |
181 | 215 |
File registryFile = new File(binDir, "registry/"+corpusname.toLowerCase()) |
182 | 216 |
File dataDir = new File(binDir, "data/$corpusname") |
183 |
|
|
217 |
|
|
184 | 218 |
new File(binDir, "registry").mkdir(); |
185 | 219 |
if (!new File(binDir, "registry").exists()) { |
186 | 220 |
println "Can't create registry directory" |
187 | 221 |
return false; |
188 | 222 |
} |
189 |
|
|
223 |
|
|
190 | 224 |
if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) { |
191 | 225 |
println ("Error: CWB executables not well set.") |
192 | 226 |
return false; |
... | ... | |
195 | 229 |
CwbMakeAll cwbMa = new CwbMakeAll(); |
196 | 230 |
cwbEn.setDebug(debug); |
197 | 231 |
cwbMa.setDebug(debug); |
198 |
|
|
232 |
|
|
199 | 233 |
String uAttr = "u:0"; |
200 | 234 |
for (String attr : uAttrs) { |
201 | 235 |
uAttr += "+"+attr |
... | ... | |
218 | 252 |
if (ignoreTranscriberMetadata) { |
219 | 253 |
String meta =p.getFirst(); |
220 | 254 |
if (meta != "scribe" && meta != "audio_filename" && |
221 |
meta != "version" && meta != "version_date") |
|
255 |
meta != "version" && meta != "version_date")
|
|
222 | 256 |
textAttr+="+"+meta |
223 | 257 |
} else { |
224 | 258 |
textAttr+="+"+p.getFirst() |
... | ... | |
227 | 261 |
break; |
228 | 262 |
} |
229 | 263 |
} |
230 |
|
|
264 |
|
|
231 | 265 |
List<String> pargs = ["spk", "ref", "id", "entitytype", "entityid"] |
232 | 266 |
for (String ana : anatypes) if (!pargs.contains(ana)) pargs.add(ana) |
233 |
|
|
267 |
|
|
234 | 268 |
String[] pAttributes = pargs |
235 |
|
|
269 |
|
|
236 | 270 |
String[] sAttributes = ["txmcorpus:0+lang", uAttr , textAttr, "event:0+id+desc+type+extent", sectionAttr, spAttr]; |
237 |
|
|
271 |
|
|
272 |
// registering the projected structure properties |
|
273 |
//println "registering: $projectionsToDo" |
|
274 |
for (String struct : projectionsToDo.keySet()) { |
|
275 |
for (String struct2 : projectionsToDo[struct].keySet()) { |
|
276 |
for (def couple : projectionsToDo[struct][struct2]) { |
|
277 |
//sattrsListener.getStructs()[struct].add(couple[0]) |
|
278 |
//println "add $struct $couple" |
|
279 |
for (int i = 0 ; i < sAttributes.size() ; i++) { |
|
280 |
if (sAttributes[i].startsWith(struct+":") && !sAttributes[i].contains(couple[0])) { |
|
281 |
sAttributes[i] = sAttributes[i] + "+"+couple[0] |
|
282 |
} |
|
283 |
} |
|
284 |
} |
|
285 |
} |
|
286 |
} |
|
287 |
|
|
238 | 288 |
println "pAttributes: $pAttributes" |
239 | 289 |
println "sAttributes: $sAttributes" |
240 | 290 |
//return; |
... | ... | |
246 | 296 |
return false; |
247 | 297 |
} |
248 | 298 |
cwbMa.run(corpusname, registryFile.getParent()); |
249 |
|
|
299 |
|
|
250 | 300 |
} catch (Exception ex) {System.out.println(ex); return false;} |
251 |
|
|
301 |
|
|
252 | 302 |
if (project.getCleanAfterBuild()) { |
253 | 303 |
new File(project.getProjectDirectory(), "cqp").deleteDir() |
254 | 304 |
} |
255 | 305 |
|
256 | 306 |
return true; |
257 | 307 |
} |
258 |
|
|
308 |
|
|
259 | 309 |
/** |
260 | 310 |
* Creates the output. |
261 | 311 |
* |
... | ... | |
273 | 323 |
return false; |
274 | 324 |
} |
275 | 325 |
} |
276 |
|
|
326 |
|
|
277 | 327 |
/** The text_id. */ |
278 | 328 |
String text_id |
279 |
|
|
329 |
|
|
280 | 330 |
/** The u opened. */ |
281 | 331 |
boolean uOpened = false; |
282 |
|
|
332 |
|
|
283 | 333 |
/** The idturn. */ |
284 | 334 |
int idturn = 1; |
285 |
|
|
335 |
|
|
286 | 336 |
/** The idsection. */ |
287 | 337 |
int idsection = 1; |
288 |
|
|
338 |
|
|
289 | 339 |
/** The idu. */ |
290 | 340 |
int idu = 1; |
291 |
|
|
341 |
|
|
292 | 342 |
/** The idevent. */ |
293 | 343 |
int idevent = 1; |
294 |
|
|
344 |
|
|
295 | 345 |
/** The events. */ |
296 | 346 |
List<String> events = []; |
297 | 347 |
static int vEntityId = 0; |
298 | 348 |
static int vEntityIdCount = 1; |
299 |
|
|
349 |
|
|
350 |
protected void writeProjections(String localname) { |
|
351 |
if (projectionsToDo.containsKey(localname)) { |
|
352 |
for (String from : projectionsToDo[localname].keySet()) { |
|
353 |
for (def couple : projectionsToDo[localname][from]) { |
|
354 |
def o = couple[0] |
|
355 |
def p = couple[1] |
|
356 |
def r = projectionsFromValues[from][p] |
|
357 |
//println "o=$o p=$p r=$r" |
|
358 |
output.write(" "+o+"=\""+r+"\""); |
|
359 |
} |
|
360 |
} |
|
361 |
} |
|
362 |
} |
|
363 |
|
|
300 | 364 |
/** |
301 | 365 |
* Process. |
302 | 366 |
* |
... | ... | |
306 | 370 |
private boolean process(File xmlfile) { |
307 | 371 |
text_id = xmlfile.getName(); |
308 | 372 |
text_id = text_id.substring(0, text_id.length() -4); |
309 |
|
|
373 |
|
|
310 | 374 |
idturn = 1; |
311 | 375 |
idsection = 1; |
312 | 376 |
idu = 1; |
313 |
|
|
377 |
|
|
314 | 378 |
boolean flagAna; |
315 | 379 |
boolean flagForm; |
316 | 380 |
boolean flagWord; |
... | ... | |
324 | 388 |
String anatype = ""; |
325 | 389 |
String anaresp = ""; |
326 | 390 |
String anavalue = ""; |
327 |
|
|
391 |
|
|
328 | 392 |
String formatedTime; |
329 |
|
|
393 |
|
|
330 | 394 |
LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>(); |
331 | 395 |
String currentType; |
332 |
|
|
396 |
|
|
333 | 397 |
URL url = xmlfile.toURI().toURL(); |
334 | 398 |
inputData = url.openStream(); |
335 | 399 |
factory = XMLInputFactory.newInstance(); |
336 | 400 |
parser = factory.createXMLStreamReader(inputData); |
337 | 401 |
String filename = xmlfile.getName() |
338 | 402 |
String textid = filename.substring(0, filename.length() - 4); |
339 |
|
|
403 |
|
|
340 | 404 |
createOutput(cqpFile); |
341 | 405 |
String localname; |
342 |
|
|
406 |
|
|
343 | 407 |
//get all metadatas declared before Episode tag |
344 | 408 |
speakers = new HashMap<String, ArrayList<Pair<String, String>>>(); |
345 | 409 |
trans = new HashMap<String, ArrayList<Pair<String, String>>>(); |
346 | 410 |
topics = new HashMap<String, ArrayList<Pair<String, String>>>(); |
347 | 411 |
//println "parse infos" |
348 | 412 |
parseInfos(); |
349 |
|
|
413 |
|
|
350 | 414 |
// println "Trans: $trans" |
351 | 415 |
// println "Topics: $topics" |
352 | 416 |
// println "Speakers: $speakers" |
... | ... | |
358 | 422 |
// } |
359 | 423 |
// println "Trans properties: "+transproperties |
360 | 424 |
List<String> localspeakers; |
361 |
|
|
362 |
|
|
425 |
|
|
426 |
|
|
363 | 427 |
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
364 | 428 |
//print "event: "+event +" " |
365 | 429 |
switch (event) { |
366 | 430 |
case XMLStreamConstants.START_ELEMENT: |
367 | 431 |
localname = parser.getLocalName(); |
432 |
|
|
433 |
if (projectionsFromValues.containsKey(localname)) { // get projections values |
|
434 |
for (String attr : projectionsFromValues[localname].keySet()) { |
|
435 |
projectionsFromValues[localname][attr] = parser.getAttributeValue(null, attr); |
|
436 |
} |
|
437 |
} |
|
438 |
|
|
368 | 439 |
//println localname |
369 | 440 |
switch(localname) { |
441 |
|
|
370 | 442 |
case "div": |
371 | 443 |
output.write("<div"); |
372 | 444 |
for (int i = 0 ; i < parser.getAttributeCount() ; i ++) { |
... | ... | |
374 | 446 |
output.write(" "+name+"=\""+parser.getAttributeValue(i).replace("\"", "&quot;")+"\""); |
375 | 447 |
sectionAttrs << name |
376 | 448 |
} |
449 |
writeProjections(localname) |
|
377 | 450 |
output.write ">\n" |
378 | 451 |
break; |
379 | 452 |
case "sp": |
... | ... | |
383 | 456 |
output.write(" "+name+"=\""+parser.getAttributeValue(i).replace("\"", "&quot;")+"\""); |
384 | 457 |
spAttrs << name |
385 | 458 |
} |
459 |
writeProjections(localname) |
|
386 | 460 |
output.write ">\n" |
387 | 461 |
break; |
388 | 462 |
case "u": |
... | ... | |
390 | 464 |
for (int i = 0 ; i < parser.getAttributeCount() ; i ++) { |
391 | 465 |
String name = parser.getAttributeLocalName(i).replace("_","").toLowerCase() |
392 | 466 |
output.write(" "+name+"=\""+parser.getAttributeValue(i).replace("\"", "&quot;")+"\""); |
393 |
if (name == "time") {
|
|
394 |
formatedTime = parser.getAttributeValue(i)
|
|
395 |
} else if (name == "who") {
|
|
396 |
u_name = parser.getAttributeValue(i)
|
|
467 |
if (name == "time") { |
|
468 |
formatedTime = parser.getAttributeValue(i) |
|
469 |
} else if (name == "who") { |
|
470 |
u_name = parser.getAttributeValue(i) |
|
397 | 471 |
} |
398 |
|
|
472 |
|
|
399 | 473 |
uAttrs << name |
400 | 474 |
} |
475 |
writeProjections(localname) |
|
401 | 476 |
output.write ">\n" |
402 | 477 |
break; |
403 | 478 |
case "event": |
404 | 479 |
output.write("<event"); |
405 | 480 |
writeAttributes(); |
481 |
writeProjections(localname) |
|
406 | 482 |
output.write ">\n" |
407 |
|
|
483 |
|
|
408 | 484 |
if (parser.getAttributeValue(null, "type") == "entities") { |
409 | 485 |
if (parser.getAttributeValue(null, "extent") == "begin") { |
410 | 486 |
vEntityType = parser.getAttributeValue(null, "desc"); |
... | ... | |
463 | 539 |
break; |
464 | 540 |
case XMLStreamConstants.END_ELEMENT: |
465 | 541 |
localname = parser.getLocalName(); |
466 |
|
|
542 |
|
|
543 |
if (projectionsFromValues.containsKey(localname)) { // reset values |
|
544 |
for (String attr : projectionsFromValues[localname].keySet()) { |
|
545 |
projectionsFromValues[localname][attr] = ""; |
|
546 |
} |
|
547 |
} |
|
548 |
|
|
467 | 549 |
switch (localname) { |
468 | 550 |
case "text": |
469 | 551 |
output.write("</text>\n") |
... | ... | |
507 | 589 |
String ref = (u_name+", "+formatedTime+""+isEnq) |
508 | 590 |
if (ADD_TEXTID_TO_REF) ref = textid+", "+ref |
509 | 591 |
vForm +="\t"+u_name+"\t"+ref |
510 |
|
|
592 |
|
|
511 | 593 |
// concat entity and entity ID |
512 | 594 |
vAna+= "\t"+vEntityType+"\t"+vEntityId; |
513 |
|
|
595 |
|
|
514 | 596 |
//concat ana values |
515 | 597 |
for (String type : anatypes) { |
516 | 598 |
def v = anavalues.get(type); |
517 | 599 |
if (v == null) v = ""; |
518 |
|
|
600 |
|
|
519 | 601 |
if ("event" == type) { |
520 | 602 |
if (v.length() > 0) |
521 | 603 |
vAna+="\t#"+v; |
522 | 604 |
else |
523 | 605 |
vAna+="\t"; |
524 |
|
|
606 |
|
|
525 | 607 |
//concat <Event> values |
526 | 608 |
if (vEvents != null && vEvents.length() > 0 && vEvents != "N/A") |
527 | 609 |
vAna += "#"+vEvents; |
... | ... | |
529 | 611 |
vAna+="\t"+v; |
530 | 612 |
} |
531 | 613 |
} |
532 |
|
|
614 |
|
|
533 | 615 |
vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); |
534 |
|
|
616 |
|
|
535 | 617 |
if (interviewers != null && !indexInterviewer) { // we must remove some words |
536 | 618 |
if (!interviewers.matches(u_name)) { // keep what is now an interviewer |
537 | 619 |
output.write(vForm+"\t"+wordid+vAna+"\n"); |
... | ... | |
539 | 621 |
} else { |
540 | 622 |
output.write(vForm+"\t"+wordid+vAna+"\n"); |
541 | 623 |
} |
542 |
|
|
624 |
|
|
543 | 625 |
vAna = ""; |
544 | 626 |
vForm = ""; |
545 | 627 |
break; |
... | ... | |
554 | 636 |
break; |
555 | 637 |
} |
556 | 638 |
} |
557 |
|
|
639 |
|
|
558 | 640 |
parser.close(); |
559 | 641 |
inputData.close(); |
560 | 642 |
output.close(); |
561 | 643 |
return true; |
562 | 644 |
} |
563 |
|
|
645 |
|
|
564 | 646 |
/** The u_name. */ |
565 | 647 |
String u_name; |
566 |
|
|
648 |
|
|
567 | 649 |
/** |
568 | 650 |
* Write start tag. |
569 | 651 |
*/ |
... | ... | |
572 | 654 |
writeAttributes(); |
573 | 655 |
output.write ">\n" |
574 | 656 |
} |
575 |
|
|
657 |
|
|
576 | 658 |
/** |
577 | 659 |
* Write attributes. |
578 | 660 |
*/ |
... | ... | |
581 | 663 |
output.write(" "+parser.getAttributeLocalName(i).replace("_","").toLowerCase()+"=\""+parser.getAttributeValue(i).replace("\"", "&quot;")+"\""); |
582 | 664 |
} |
583 | 665 |
} |
584 |
|
|
666 |
|
|
585 | 667 |
private void getAnaTypes(File xmlFile) { |
586 | 668 |
inputData = xmlFile.toURI().toURL().openStream(); |
587 | 669 |
factory = XMLInputFactory.newInstance(); |
... | ... | |
609 | 691 |
} |
610 | 692 |
} |
611 | 693 |
} |
612 |
|
|
694 |
|
|
613 | 695 |
/** |
614 | 696 |
* Write start tag. |
615 | 697 |
* |
... | ... | |
621 | 703 |
writeAttributes(); |
622 | 704 |
output.write ">\n" |
623 | 705 |
} |
624 |
|
|
706 |
|
|
625 | 707 |
/** |
626 | 708 |
* Write end tag. |
627 | 709 |
*/ |
628 | 710 |
private void writeEndTag() { |
629 | 711 |
output.write("</"+parser.getLocalName().toLowerCase()+">\n"); |
630 | 712 |
} |
631 |
|
|
713 |
|
|
632 | 714 |
/** The ignore transcriber metadata. */ |
633 | 715 |
boolean ignoreTranscriberMetadata = false; |
634 |
|
|
716 |
|
|
635 | 717 |
/** |
636 | 718 |
* Sets the ignore transcriber metadata. |
637 | 719 |
* |
... | ... | |
640 | 722 |
public void setIgnoreTranscriberMetadata(boolean state) { |
641 | 723 |
this.ignoreTranscriberMetadata = state; |
642 | 724 |
} |
643 |
|
|
725 |
|
|
644 | 726 |
public static int MAXATTRIBUTEVALUELENGTH = 8191; |
645 | 727 |
|
646 | 728 |
/** |
... | ... | |
652 | 734 |
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
653 | 735 |
if (event == XMLStreamConstants.START_ELEMENT) { |
654 | 736 |
localname = parser.getLocalName(); |
737 |
|
|
738 |
if (projectionsFromValues.containsKey(localname)) { // get projections values |
|
739 |
for (String attr : projectionsFromValues[localname].keySet()) { |
|
740 |
projectionsFromValues[localname][attr] = parser.getAttributeValue(null, attr); |
|
741 |
} |
|
742 |
} |
|
743 |
|
|
655 | 744 |
switch (localname) { |
656 | 745 |
case "text": |
657 | 746 |
output.write("<text project=\""+projectname+"\" base=\""+corpusname+"\"") |
... | ... | |
659 | 748 |
|
660 | 749 |
String value = parser.getAttributeValue(i).replace("\"", "&quot;"); |
661 | 750 |
if (value.length() > MAXATTRIBUTEVALUELENGTH) { |
662 |
// value = value.substring(0, MAXATTRIBUTEVALUELENGTH-1) |
|
751 |
// value = value.substring(0, MAXATTRIBUTEVALUELENGTH-1)
|
|
663 | 752 |
println "WARNING: attribute value is too long ( > $MAXATTRIBUTEVALUELENGTH). The value will be truncated to: $value" |
664 | 753 |
} |
665 | 754 |
|
666 | 755 |
if (ignoreTranscriberMetadata) { |
667 | 756 |
if (parser.getAttributeLocalName(i) != "scribe" && |
668 |
parser.getAttributeLocalName(i) != "audio_filename" && |
|
669 |
parser.getAttributeLocalName(i) != "version" && |
|
670 |
parser.getAttributeLocalName(i) != "version_date") { |
|
757 |
parser.getAttributeLocalName(i) != "audio_filename" &&
|
|
758 |
parser.getAttributeLocalName(i) != "version" &&
|
|
759 |
parser.getAttributeLocalName(i) != "version_date") {
|
|
671 | 760 |
output.write(" "+parser.getAttributeLocalName(i).replace("_","").toLowerCase()+"=\""+value+"\""); |
672 | 761 |
} |
673 | 762 |
} else { |
674 | 763 |
output.write(" "+parser.getAttributeLocalName(i).replace("_","").toLowerCase()+"=\""+value+"\""); |
675 | 764 |
} |
676 | 765 |
} |
677 |
|
|
766 |
|
|
678 | 767 |
output.write ">\n" |
679 |
|
|
768 |
|
|
680 | 769 |
ArrayList list = new ArrayList<Pair<String, String>>() |
681 | 770 |
trans.put("trans", list); |
682 |
|
|
771 |
|
|
683 | 772 |
for (int i = 0 ; i < parser.getAttributeCount() ; i ++) { |
684 | 773 |
list.add(new Pair(parser.getAttributeLocalName(i).replace("_","").toLowerCase(), parser.getAttributeValue(i))); |
685 | 774 |
if (parser.getAttributeLocalName(i).equals("interviewer-id-regex")) |
... | ... | |
708 | 797 |
if (id != null) { |
709 | 798 |
ArrayList list = new ArrayList<Pair<String, String>>() |
710 | 799 |
speakers.put(id, list); |
711 |
|
|
800 |
|
|
712 | 801 |
for (int i = 0 ; i < parser.getAttributeCount() ; i ++) { |
713 | 802 |
list.add(new Pair(parser.getAttributeLocalName(i), parser.getAttributeValue(i))); |
714 | 803 |
} |
... | ... | |
721 | 810 |
} |
722 | 811 |
} |
723 | 812 |
} |
724 |
|
|
813 |
|
|
725 | 814 |
/** |
726 | 815 |
* Sets the debug. |
727 | 816 |
*/ |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompilerStep.groovy (revision 3407) | ||
---|---|---|
52 | 52 |
this.normalizeFormValues = n |
53 | 53 |
} |
54 | 54 |
|
55 |
public XTZCompilerStep(File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) { |
|
55 |
public XTZCompilerStep(ImportStep importStep, File xmlFile, File cqpFile, String textname, String corpusname, String projectname, def anatypes, def wtag) { |
|
56 |
super(importStep) |
|
57 |
|
|
56 | 58 |
this.xmlFile = xmlFile |
57 | 59 |
this.cqpFile = cqpFile |
58 | 60 |
this.textname = textname |
... | ... | |
95 | 97 |
* @return true, if successful |
96 | 98 |
*/ |
97 | 99 |
public boolean process() { |
100 |
|
|
98 | 101 |
if (!createOutput(cqpFile)) { |
99 | 102 |
return false |
100 | 103 |
} |
... | ... | |
122 | 125 |
boolean foundtei = false; |
123 | 126 |
boolean foundtext = false; |
124 | 127 |
int nWords = 0; |
128 |
|
|
129 |
|
|
125 | 130 |
try { |
126 | 131 |
String localname; |
127 | 132 |
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
128 | 133 |
switch (event) { |
129 | 134 |
case XMLStreamConstants.START_ELEMENT: |
130 | 135 |
localname = parser.getLocalName().toLowerCase(); |
136 |
|
|
137 |
if (importStep.projectionsFromValues.containsKey(localname)) { |
|
138 |
for (String attr : importStep.projectionsFromValues[localname].keySet()) { |
|
139 |
importStep.projectionsFromValues[localname][attr] = parser.getAttributeValue(null, attr); |
|
140 |
} |
|
141 |
} |
|
142 |
|
|
131 | 143 |
if ("tei".equals(localname)) foundtei = true; |
132 | 144 |
switch (localname) { |
133 | 145 |
case "text": |
... | ... | |
180 | 192 |
if (!foundtei || !foundtext) break; |
181 | 193 |
|
182 | 194 |
output.write("<"+localname) |
183 |
|
|
195 |
def toWrite = new LinkedHashMap() |
|
184 | 196 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
185 | 197 |
String attrname = parser.getAttributeLocalName(i) |
186 | 198 |
|
... | ... | |
188 | 200 |
if (normalizeAttributeValues) { |
189 | 201 |
attrvalue = attrvalue.trim() |
190 | 202 |
} |
191 |
output.write(" "+attrname.toLowerCase()+"=\""+attrvalue.replace("\"", "&quot;")+QUOTE) |
|
203 |
//output.write(" "+attrname.toLowerCase()+"=\""+attrvalue.replace("\"", "&quot;")+QUOTE) |
|
204 |
toWrite[attrname.toLowerCase()] = attrvalue.replace("\"", "&quot;") |
|
192 | 205 |
} |
193 | 206 |
if (parser.getAttributeCount() == 0) { // add the n attribute |
194 | 207 |
if (!ncounts.containsKey(localname)) ncounts.put(localname, 0) |
195 | 208 |
int ncount = ncounts.get(localname) |
196 | 209 |
ncounts.put(localname, ncount+1) |
197 | 210 |
output.write(" n=\""+ncount+QUOTE) |
211 |
toWrite["n"] = ncount |
|
198 | 212 |
} |
213 |
|
|
214 |
if (importStep.projectionsToDo.containsKey(localname)) { |
|
215 |
for (String from : importStep.projectionsToDo[localname].keySet()) { |
|
216 |
for (def couple : importStep.projectionsToDo[localname][from]) { |
|
217 |
def o = couple[0] |
|
218 |
def p = couple[1] |
|
219 |
def r = importStep.projectionsFromValues[from][p] |
|
220 |
//println "o=$o p=$p r=$r" |
|
221 |
toWrite[o] = r |
|
222 |
} |
|
223 |
} |
|
224 |
} |
|
225 |
|
|
226 |
//println "toWrite=$toWrite" |
|
227 |
|
|
228 |
for (String attr : toWrite.keySet()) { |
|
229 |
output.write(" "+attr+"=\""+toWrite[attr]+QUOTE) |
|
230 |
} |
|
199 | 231 |
output.write(">\n") |
200 | 232 |
} |
201 | 233 |
break; |
202 | 234 |
|
203 | 235 |
case XMLStreamConstants.END_ELEMENT: |
204 | 236 |
localname = parser.getLocalName().toLowerCase(); |
237 |
|
|
238 |
if (importStep.projectionsFromValues.containsKey(localname)) { // reset projection values |
|
239 |
for (String attr : importStep.projectionsFromValues[localname].keySet()) { |
|
240 |
importStep.projectionsFromValues[localname][attr] = ""; |
|
241 |
} |
|
242 |
} |
|
243 |
|
|
205 | 244 |
switch (localname) { |
206 | 245 |
case WTAG: |
207 | 246 |
for (String type : anatypes) { |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZFacsPagerStep.groovy (revision 3407) | ||
---|---|---|
37 | 37 |
String wtag; |
38 | 38 |
boolean debug = false; |
39 | 39 |
|
40 |
public XTZFacsPagerStep(File xmlFile, File editionDir, File imageDirectory, String txtname, String corpusname, String tag, String attribute, String wtag, boolean debug) { |
|
40 |
public XTZFacsPagerStep(ImportStep importStep, File xmlFile, File editionDir, File imageDirectory, String txtname, String corpusname, String tag, String attribute, String wtag, boolean debug) { |
|
41 |
super(importStep) |
|
42 |
|
|
41 | 43 |
inputData = xmlFile.toURI().toURL().openStream() |
42 | 44 |
factory = XMLInputFactory.newInstance() |
43 | 45 |
parser = factory.createXMLStreamReader(inputData) |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy (revision 3407) | ||
---|---|---|
18 | 18 |
import org.txm.searchengine.cqp.corpus.* |
19 | 19 |
|
20 | 20 |
class XTZCompiler extends Compiler { |
21 |
|
|
21 |
|
|
22 | 22 |
SAttributesListener sattrsListener; // store scanned structures |
23 |
|
|
23 |
|
|
24 | 24 |
String regPath; |
25 | 25 |
String corpusname; |
26 | 26 |
String wtag; |
27 |
|
|
27 |
|
|
28 | 28 |
boolean doNormalizeAttributeValues = false; |
29 | 29 |
boolean doNormalizeAnaValues = true; |
30 |
|
|
30 |
|
|
31 | 31 |
public XTZCompiler(ImportModule module) { |
32 | 32 |
super(module); |
33 |
|
|
33 |
|
|
34 | 34 |
corpusname = module.getProject().getName(); |
35 | 35 |
regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/"+corpusname.toLowerCase() |
36 |
|
|
36 |
|
|
37 | 37 |
wtag = module.getProject().getTokenizerWordElement(); |
38 |
|
|
38 |
|
|
39 | 39 |
doNormalizeAttributeValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false")) |
40 | 40 |
doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false")) |
41 | 41 |
} |
42 |
|
|
42 |
|
|
43 | 43 |
/** |
44 | 44 |
* the Text list |
45 | 45 |
*/ |
... | ... | |
51 | 51 |
def initialTypesValues; |
52 | 52 |
@Override |
53 | 53 |
public void _process() { |
54 |
|
|
54 |
|
|
55 | 55 |
if (orderedTextIDs == null) orderedTextIDs = module.getProject().getTextsID() ; |
56 |
|
|
56 |
|
|
57 | 57 |
Project project = module.getProject(); |
58 |
|
|
58 |
|
|
59 | 59 |
texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) } |
60 | 60 |
textsToProcess = getTextsToProcess(texts) |
61 |
|
|
61 |
|
|
62 | 62 |
// get all structures |
63 | 63 |
sattrsListener = new SAttributesListener() // will store the structure and properties declaration |
64 | 64 |
sattrsListener.W = wtag |
65 |
|
|
65 |
|
|
66 | 66 |
File registryFile = new File(regPath) |
67 | 67 |
|
68 | 68 |
initialTypesValues = new HashSet<String>() |
... | ... | |
87 | 87 |
corpus.setName(project.getName()); |
88 | 88 |
} |
89 | 89 |
corpus.setDescription("Built with the XTZ import module"); |
90 |
|
|
90 |
|
|
91 | 91 |
if (!doScanStep()) return; |
92 | 92 |
if (!doCQPStep()) return; |
93 | 93 |
if (!doCWBEncodeStep()) return; |
94 | 94 |
if (!doCWBMakeAllStep()) return; |
95 |
|
|
96 |
if (module.getProject().getCleanAfterBuild()
|
|
97 |
&& !module.getProject().getDoUpdate()) { // for optimization purpose, don't clean the CQP files |
|
95 |
|
|
96 |
if (module.getProject().getCleanAfterBuild() |
|
97 |
&& !module.getProject().getDoUpdate()) { // for optimization purpose, don't clean the CQP files
|
|
98 | 98 |
new File(module.getBinaryDirectory(), "cqp").deleteDir() |
99 | 99 |
} |
100 |
|
|
100 |
|
|
101 | 101 |
isSuccessFul = true; |
102 | 102 |
} |
103 |
|
|
103 |
|
|
104 | 104 |
/** |
105 | 105 |
* Scan all XML-TXM files to find out structures and word properties |
106 | 106 |
*/ |
107 | 107 |
public boolean doScanStep() { |
108 |
|
|
108 |
|
|
109 | 109 |
println "-- Scanning structures&properties to create for "+texts.size()+" texts..." |
110 |
|
|
111 |
|
|
112 |
|
|
110 |
|
|
111 |
|
|
112 |
|
|
113 | 113 |
// def initialTypesValues = new HashSet<String>() |
114 | 114 |
// initialTypesValues.addAll(sattrsListener.getAnatypes()) |
115 |
|
|
115 |
|
|
116 | 116 |
// get all word properties |
117 | 117 |
ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size()) |
118 | 118 |
for (Text t : texts) { |
... | ... | |
129 | 129 |
return false; |
130 | 130 |
} |
131 | 131 |
} |
132 |
|
|
132 |
|
|
133 | 133 |
if (initialTypesValues.size() == sattrsListener.getAnatypes().size() |
134 |
&& initialTypesValues.containsAll(sattrsListener.getAnatypes())) { // the word properties changed all CQP files must be recreated |
|
134 |
&& initialTypesValues.containsAll(sattrsListener.getAnatypes())) { // the word properties changed all CQP files must be recreated
|
|
135 | 135 |
// no new property |
136 | 136 |
} else { |
137 | 137 |
if (module.isUpdatingCorpus()) { |
... | ... | |
140 | 140 |
textsToProcess.clear() |
141 | 141 |
textsToProcess.addAll(texts) |
142 | 142 |
} |
143 |
|
|
143 |
|
|
144 | 144 |
println "" |
145 | 145 |
return true; |
146 | 146 |
} |
147 |
|
|
147 |
|
|
148 | 148 |
def getTextsToProcess(def texts) { |
149 |
|
|
149 | 150 |
def textsToProcess = texts.findAll() { text -> |
150 | 151 |
File xmlFile = text.getXMLTXMFile() |
151 | 152 |
String textname = text.getName() |
152 |
|
|
153 |
|
|
153 | 154 |
File cqpFile = new File(cqpDirectory, textname + ".cqp") |
154 | 155 |
cqpFiles << cqpFile // insert cqp files to concat later |
155 | 156 |
// skip step if cqpFile exists AND is more recent than the XML-TXM File |
... | ... | |
157 | 158 |
if (!cqpFile.exists() || xmlFile.lastModified() >= cqpFile.lastModified()) { |
158 | 159 |
return true |
159 | 160 |
} |
160 |
|
|
161 |
|
|
161 | 162 |
if (!text.isDirty() && !mustBuild) { |
162 | 163 |
Log.finer("skipping .cqp step of $text"); |
163 | 164 |
return false |
164 | 165 |
} |
165 |
|
|
166 |
|
|
166 | 167 |
return true |
167 | 168 |
} |
168 |
|
|
169 |
|
|
169 | 170 |
return textsToProcess |
170 | 171 |
} |
171 |
|
|
172 |
|
|
172 | 173 |
def cqpFiles = [] // ordered cqp files to concat before calling cwb-encode |
173 | 174 |
int cqpFilesUpdated = 0; |
175 |
LinkedHashMap<String, LinkedHashMap<String, String>> projectionsFromValues = new LinkedHashMap<String, LinkedHashMap<String, String>>(); // values of properties to inject |
|
176 |
LinkedHashMap<String, LinkedHashMap<String, ArrayList<ArrayList>>> projectionsToDo = new LinkedHashMap<String, LinkedHashMap<String, ArrayList<ArrayList>>>(); // list of projections to do |
|
174 | 177 |
public boolean doCQPStep() { |
175 |
|
|
178 |
|
|
176 | 179 |
cqpDirectory.mkdir(); // if not created |
177 |
|
|
180 |
|
|
178 | 181 |
println "-- Building CQP files ${textsToProcess.size()}/${texts.size()}..." |
179 |
|
|
182 |
|
|
183 |
// Build the projection data used by each step |
|
184 |
String projectionsParameterValue = module.project.getTextualPlan("Projections").trim() |
|
185 |
projectionsParameterValue = projectionsParameterValue.replace("\n", "\t") |
|
186 |
def projectionsParameter = projectionsParameterValue.split("\t"); |
|
187 |
if (projectionsParameterValue.length() > 0) { |
|
188 |
for (def projection : projectionsParameter) { |
|
189 |
if (!projection.contains("->")) continue; |
|
190 |
|
|
191 |
String[] fromTo = projection.split("->", 2) |
|
192 |
String from = fromTo[0].trim() |
|
193 |
String to = fromTo[1].trim() |
|
194 |
if (projection.contains("->") && from.contains("_") && to.contains("_")) { |
|
195 |
String toStructure = to.substring(0, to.indexOf("_")) |
|
196 |
String toStructureProperty = to.substring(to.indexOf("_") + 1) |
|
197 |
String fromStructure = from.substring(0, from.indexOf("_")) |
|
198 |
String fromStructureProperty = from.substring(from.indexOf("_") + 1) |
|
199 |
|
|
200 |
if (!projectionsToDo.containsKey(toStructure)) { |
|
201 |
projectionsToDo[toStructure] = new LinkedHashMap<String, ArrayList<ArrayList>>(); |
|
202 |
} |
|
203 |
if (!projectionsToDo[toStructure].containsKey(fromStructure)) { |
|
204 |
projectionsToDo[toStructure][fromStructure] = new ArrayList<ArrayList>(); |
|
205 |
} |
|
206 |
projectionsToDo[toStructure][fromStructure].add([toStructureProperty, fromStructureProperty]) |
|
207 |
|
|
208 |
if (!projectionsFromValues.containsKey(fromStructure)) projectionsFromValues[fromStructure] = new LinkedHashMap<String, String>(); |
|
209 |
projectionsFromValues[fromStructure][fromStructureProperty] = ""; |
|
210 |
} |
|
211 |
} |
|
212 |
} |
|
213 |
// registering the new structure properties |
|
214 |
for (String struct : projectionsToDo.keySet()) { |
|
215 |
for (String struct2 : projectionsToDo[struct].keySet()) { |
|
216 |
for (def couple : projectionsToDo[struct][struct2]) { |
|
217 |
sattrsListener.getStructs()[struct].add(couple[0]) |
|
218 |
} |
|
219 |
} |
|
220 |
} |
|
221 |
|
|
180 | 222 |
ConsoleProgressBar cpb = new ConsoleProgressBar(textsToProcess.size()) |
181 | 223 |
cqpFilesUpdated = 0; |
182 | 224 |
for (Text text : textsToProcess) { |
183 | 225 |
cpb.tick(); |
184 |
|
|
226 |
|
|
185 | 227 |
File xmlFile = text.getXMLTXMFile() |
186 | 228 |
String textname = text.getName() |
187 |
|
|
229 |
|
|
188 | 230 |
File cqpFile = new File(cqpDirectory, textname + ".cqp") |
189 |
|
|
231 |
|
|
190 | 232 |
cqpFilesUpdated++ |
191 |
|
|
192 |
XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", sattrsListener.getAnatypes(), wtag) |
|
233 |
|
|
234 |
XTZCompilerStep step = new XTZCompilerStep(this, xmlFile, cqpFile, textname, corpusname, "default", sattrsListener.getAnatypes(), wtag)
|
|
193 | 235 |
step.setNormalizeAnaValues(doNormalizeAnaValues) |
194 | 236 |
step.setNormalizeAttributeValues(doNormalizeAttributeValues) |
195 | 237 |
if (!step.process()) { |
... | ... | |
200 | 242 |
println "" |
201 | 243 |
return true; |
202 | 244 |
} |
203 |
|
|
245 |
|
|
204 | 246 |
public boolean doCWBEncodeStep() { |
205 | 247 |
println "-- Running cwb-encode..." |
206 | 248 |
|
... | ... | |
208 | 250 |
DeleteDir.deleteDirectory(outputDirectory); |
209 | 251 |
outputDirectory.mkdirs(); |
210 | 252 |
dataDirectory.mkdirs(); |
211 |
|
|
253 |
|
|
212 | 254 |
DeleteDir.deleteDirectory(registryDirectory); |
213 | 255 |
registryDirectory.mkdirs(); |
214 | 256 |
|
215 | 257 |
CwbEncode cwbEn = new CwbEncode() |
216 | 258 |
cwbEn.setDebug(debug) |
217 |
|
|
259 |
|
|
218 | 260 |
List<String> pargs = ["id"] |
219 | 261 |
for (String ana : sattrsListener.getAnatypes()) { |
220 | 262 |
if (ana == "word") continue; // no need to add it, cwb will declare it automatically |
221 | 263 |
if (ana == "id") continue; // no need to be added, we did it already |
222 | 264 |
pargs.add(ana) |
223 | 265 |
} |
224 |
|
|
266 |
|
|
225 | 267 |
String[] pAttrs = pargs |
226 |
|
|
268 |
|
|
227 | 269 |
def structs = sattrsListener.getStructs() |
228 | 270 |
def structsProf = sattrsListener.getProfs() |
229 |
|
|
271 |
|
|
230 | 272 |
if (debug) { |
231 | 273 |
println structs |
232 | 274 |
println structsProf |
233 | 275 |
} |
234 |
|
|
276 |
|
|
235 | 277 |
List<String> sargs = new ArrayList<String>() |
236 | 278 |
def tmpTextAttrs = [] |
237 | 279 |
for (String name : structs.keySet()) { |
238 | 280 |
if (name == "txmcorpus") continue; |
239 |
|
|
281 |
|
|
240 | 282 |
if (name == "text") { |
241 | 283 |
for (String value : structs.get(name)) // append the attributes |
242 | 284 |
tmpTextAttrs << value // added after |
243 | 285 |
continue; |
244 | 286 |
} |
245 |
|
|
287 |
|
|
246 | 288 |
String concat = name+":"+structsProf.get(name); // append the depth |
247 | 289 |
for (String attributeName : structs.get(name)) { // append the attributes |
248 | 290 |
concat += "+"+attributeName.toLowerCase(); |
249 | 291 |
} |
250 |
|
|
292 |
|
|
251 | 293 |
if (structs.get(name).size() == 0) { |
252 | 294 |
concat += "+n"; |
253 | 295 |
} else { |
... | ... | |
255 | 297 |
concat += "+n" |
256 | 298 |
} |
257 | 299 |
} |
258 |
|
|
300 |
|
|
259 | 301 |
if ((name == "p" || name == "body" || name == "back" || name == "front") |
260 |
&& !concat.contains("+n+") && !concat.endsWith("+n")) { |
|
302 |
&& !concat.contains("+n+") && !concat.endsWith("+n")) {
|
|
261 | 303 |
concat += "+n" |
262 | 304 |
} |
263 | 305 |
sargs.add(concat) |
264 | 306 |
} |
265 |
|
|
307 |
|
|
266 | 308 |
String textSAttributes = "text:0+id+base+project"; |
267 | 309 |
for (String name : tmpTextAttrs) { |
268 | 310 |
if (!("id".equals(name) || "base".equals(name) || "project".equals(name))) { |
269 | 311 |
textSAttributes += "+"+name.toLowerCase() |
270 | 312 |
} |
271 | 313 |
} |
272 |
|
|
314 |
|
|
273 | 315 |
sargs.add(textSAttributes) |
274 | 316 |
sargs.add("txmcorpus:0+lang") |
275 |
|
|
317 |
|
|
276 | 318 |
sargs.sort() |
277 |
|
|
319 |
|
|
278 | 320 |
String[] sAttributes = sargs |
279 | 321 |
String[] pAttributes = pAttrs |
280 | 322 |
println " Word properties: "+pAttributes.join(', ') |
... | ... | |
286 | 328 |
println "Fail to write the master cqp file: "+allcqpFile |
287 | 329 |
return false; |
288 | 330 |
} |
289 |
|
|
331 |
|
|
290 | 332 |
new File(regPath).delete()// ensure the registry file is deleted |
291 |
|
|
333 |
|
|
292 | 334 |
if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname", |
293 |
allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) { |
|
335 |
allcqpFile.getAbsolutePath(), regPath, pAttributes, sAttributes, false)) {
|
|
294 | 336 |
println "** cwb-encode did not ends well. Please activate a finer log level to see more details." |
295 | 337 |
return false; |
296 | 338 |
} |
297 |
|
|
339 |
|
|
298 | 340 |
allcqpFile.delete(); // clean |
299 | 341 |
} catch (Exception e) { |
300 | 342 |
println "Error while running cwb-encode: "+e |
... | ... | |
305 | 347 |
println "" |
306 | 348 |
return true; |
307 | 349 |
} |
308 |
|
|
350 |
|
|
309 | 351 |
public boolean doCWBMakeAllStep() { |
310 | 352 |
println "-- Running cwb-makeall..." |
311 | 353 |
try { |
312 | 354 |
CwbMakeAll cwbMa = new CwbMakeAll(); |
313 | 355 |
cwbMa.setDebug(debug); |
314 |
|
|
356 |
|
|
315 | 357 |
if (!new File(regPath).exists()) { |
316 | 358 |
println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq" |
317 | 359 |
return false; |
... | ... | |
320 | 362 |
println "** cwb-makeall did not ends well. Activate finer logs to see details." |
321 | 363 |
return false; |
322 | 364 |
} |
323 |
|
|
365 |
|
|
324 | 366 |
// remove milestones from CWB registry and data files |
325 | 367 |
FixMilestoneDeclarations fm = new FixMilestoneDeclarations( |
326 | 368 |
new File(regPath), new File(outputDirectory.getAbsolutePath(), corpusname)); |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZPager.groovy (revision 3407) | ||
---|---|---|
236 | 236 |
edition.setIndex(outputDirectory.getAbsolutePath()); |
237 | 237 |
|
238 | 238 |
try { |
239 |
def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug); |
|
239 |
def ed = new XTZFacsPagerStep(this, txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug);
|
|
240 | 240 |
if (!ed.process()) { |
241 | 241 |
println "Fail to build 'facs' edition for text: $txmFile" |
242 | 242 |
continue; |
TXM/trunk/org.txm.core/src/java/org/txm/importer/xtz/Step.java (revision 3407) | ||
---|---|---|
1 | 1 |
package org.txm.importer.xtz; |
2 | 2 |
|
3 | 3 |
public class Step { |
4 |
|
|
5 |
protected ImportStep importStep; |
|
4 | 6 |
|
5 |
public Step() { |
|
6 |
// TODO Auto-generated constructor stub
|
|
7 |
public Step(ImportStep importStep) {
|
|
8 |
this.importStep = importStep;
|
|
7 | 9 |
} |
8 | 10 |
|
9 | 11 |
public boolean process() { |
TXM/trunk/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/TextualPlansSection.java (revision 3407) | ||
---|---|---|
26 | 26 |
private Text noteElementsText; |
27 | 27 |
|
28 | 28 |
private Text milestoneElementsText; |
29 |
|
|
30 |
private Text projectionsText; |
|
29 | 31 |
|
30 | 32 |
public TextualPlansSection(ImportFormEditor editor, FormToolkit toolkit2, ScrolledForm form2, Composite parent, int style) { |
31 | 33 |
super(editor, toolkit2, form2, parent, style, "textualplans"); |
... | ... | |
82 | 84 |
milestoneElementsText = toolkit.createText(sectionClient, "", SWT.BORDER); //$NON-NLS-1$ |
83 | 85 |
gdata = getTextGridData(); |
84 | 86 |
milestoneElementsText.setLayoutData(gdata); |
87 |
|
|
88 |
Label label = toolkit.createLabel(sectionClient, "Projections", SWT.WRAP); |
|
89 |
gdata = getLabelGridData(); |
|
90 |
gdata.colspan = 2; |
|
91 |
label.setLayoutData(gdata); |
|
92 |
|
|
93 |
projectionsText = toolkit.createText(sectionClient, "", SWT.BORDER | SWT.MULTI | SWT.V_SCROLL); //$NON-NLS-1$ |
|
94 |
gdata = getTextGridData(); |
|
95 |
gdata.heightHint = 50; |
|
96 |
gdata.colspan = 2; |
|
97 |
|
|
98 |
projectionsText.setLayoutData(gdata); |
|
85 | 99 |
} |
86 | 100 |
|
87 | 101 |
@Override |
... | ... | |
91 | 105 |
outsideTextElementsToEditText.setText(project.getTextualPlan("OutSideTextTagsAndKeepContent")); |
92 | 106 |
noteElementsText.setText(project.getTextualPlan("Note")); |
93 | 107 |
milestoneElementsText.setText(project.getTextualPlan("MileStones")); |
108 |
projectionsText.setText(project.getTextualPlan("Projections")); |
|
94 | 109 |
} |
95 | 110 |
} |
96 | 111 |
|
... | ... | |
106 | 121 |
|
107 | 122 |
project.setTextualPlan("MileStones", milestoneElementsText.getText().trim()); |
108 | 123 |
|
124 |
project.setTextualPlan("Projections", projectionsText.getText().trim()); |
|
125 |
|
|
109 | 126 |
return true; |
110 | 127 |
} |
111 | 128 |
|
TXM/trunk/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/ImportModuleCustomization.java (revision 3407) | ||
---|---|---|
168 | 168 |
params.put(PREBUILD, true); |
169 | 169 |
params.put(QUERIES, true); |
170 | 170 |
params.put(UI, true); |
171 |
params.put(TEXTUALPLANS, false);
|
|
171 |
params.put(TEXTUALPLANS, true);
|
|
172 | 172 |
params.put(OPTIONS, true); |
173 | 173 |
sectionsPerImportModule.put("transcriberLoader.groovy", params); //$NON-NLS-1$ |
174 | 174 |
names.put("transcriberLoader.groovy", "XML-TRS + CSV"); //$NON-NLS-1$ //$NON-NLS-2$ |
Formats disponibles : Unified diff