Révision 2354
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2354) | ||
---|---|---|
33 | 33 |
import javax.xml.stream.* |
34 | 34 |
|
35 | 35 |
import org.txm.importer.ApplyXsl2 |
36 |
import org.txm.metadatas.MetadataGroup |
|
36 | 37 |
import org.txm.metadatas.Metadatas |
37 | 38 |
import org.txm.utils.io.FileCopy; |
38 | 39 |
|
... | ... | |
229 | 230 |
writer.writeAttribute("http-equiv", "Content-Type"); |
230 | 231 |
writer.writeAttribute("content", "text/html"); |
231 | 232 |
writer.writeAttribute("charset", "UTF-8"); |
232 |
writer.writeEndElement(); |
|
233 |
writer.writeEndElement(); // meta
|
|
233 | 234 |
writer.writeStartElement("head"); |
234 |
|
|
235 | 235 |
//<link rel="stylesheet" type="text/css" href="class.css" /> |
236 | 236 |
writer.writeStartElement("link"); |
237 | 237 |
writer.writeAttribute("rel", "stylesheet"); |
238 | 238 |
writer.writeAttribute("type", "text/css"); |
239 | 239 |
writer.writeAttribute("href", "transcriber.css"); |
240 |
writer.writeEndElement(); |
|
241 |
writer.writeEndElement(); |
|
240 |
writer.writeEndElement(); // link
|
|
241 |
writer.writeEndElement(); // head
|
|
242 | 242 |
|
243 | 243 |
nbBreak++ |
244 | 244 |
writer.writeStartElement("body"); |
... | ... | |
254 | 254 |
switch (localname) { |
255 | 255 |
case "text": |
256 | 256 |
|
257 |
writer.writeStartElement("h2");
|
|
257 |
writer.writeStartElement("h1");
|
|
258 | 258 |
writer.writeAttribute("class","title"); |
259 | 259 |
String title = parser.getAttributeValue(null, "title"); |
260 | 260 |
|
... | ... | |
264 | 264 |
writer.writeCharacters("Transcription "+txmfile.getName().substring(0, txmfile.getName().length() - 4)); |
265 | 265 |
} |
266 | 266 |
|
267 |
writer.writeEmptyElement("br"); |
|
268 |
writer.writeStartElement("a"); |
|
269 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '0.0')"); |
|
270 |
writer.writeAttribute("style", "cursor: pointer;") |
|
271 |
writer.writeAttribute("class", "play-media") |
|
272 |
writer.writeCharacters(" ♪♪"); |
|
273 |
writer.writeEndElement(); // a |
|
267 |
writeMediaAccess("0.0") |
|
274 | 268 |
|
275 |
writer.writeEndElement(); // h2
|
|
269 |
writer.writeEndElement(); // h1
|
|
276 | 270 |
|
277 | 271 |
String subtitle = parser.getAttributeValue(null, "subtitle"); |
278 | 272 |
if (subtitle != null && subtitle.length() > 0) { |
279 |
writer.writeStartElement("h3");
|
|
273 |
writer.writeStartElement("h2");
|
|
280 | 274 |
writer.writeAttribute("class", "subtitle"); |
281 | 275 |
writer.writeCharacters(subtitle); |
282 |
writer.writeEndElement(); // h3
|
|
276 |
writer.writeEndElement(); // h2
|
|
283 | 277 |
} |
284 | 278 |
|
285 |
// println "metadatas != null: "+(metadatas != null) |
|
286 |
// if (metadatas != null) { |
|
287 |
writer.writeStartElement("table"); |
|
288 |
boolean grey = false; |
|
289 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
290 |
String name = parser.getAttributeName(i); |
|
291 |
String value = parser.getAttributeValue(i); |
|
292 |
|
|
293 |
if ("title" == name) { |
|
294 |
continue; // ignore "title" metadata |
|
295 |
} |
|
296 |
|
|
297 |
grey = !grey; |
|
298 |
writer.writeStartElement("tr"); |
|
299 |
if (grey) { |
|
300 |
writer.writeAttribute("style","background-color:lightgrey;") |
|
301 |
} |
|
302 |
|
|
303 |
if (value != null) { |
|
304 |
writer.writeStartElement("td"); |
|
305 |
writer.writeCharacters(name); |
|
306 |
writer.writeEndElement(); |
|
307 |
writer.writeStartElement("td"); |
|
308 |
writer.writeCharacters(value); |
|
309 |
writer.writeEndElement(); |
|
310 |
} |
|
311 |
//get enqueteur to style their names |
|
312 |
if (name.startsWith("enq")) { |
|
313 |
interviewers.add(value) |
|
314 |
} |
|
315 |
writer.writeEndElement(); |
|
279 |
writer.writeStartElement("table"); |
|
280 |
writer.writeAttribute("class", "transcription-table"); |
|
281 |
boolean grey = false; |
|
282 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
283 |
String name = parser.getAttributeName(i); |
|
284 |
String value = parser.getAttributeValue(i); |
|
285 |
|
|
286 |
if ("title" == name) { |
|
287 |
continue; // ignore "title" metadata |
|
316 | 288 |
} |
317 |
writer.writeEndElement(); |
|
318 |
// } |
|
289 |
|
|
290 |
grey = !grey; |
|
291 |
writer.writeStartElement("tr"); |
|
292 |
if (grey) { |
|
293 |
writer.writeAttribute("style","background-color:lightgrey;") |
|
294 |
} |
|
295 |
|
|
296 |
if (value != null) { |
|
297 |
writer.writeStartElement("td"); |
|
298 |
writer.writeCharacters(name); |
|
299 |
writer.writeEndElement(); // td |
|
300 |
writer.writeStartElement("td"); |
|
301 |
writer.writeCharacters(value); |
|
302 |
writer.writeEndElement(); // td |
|
303 |
} |
|
304 |
//get enqueteur to style their names |
|
305 |
if (name.startsWith("enq")) { |
|
306 |
interviewers.add(value) |
|
307 |
} |
|
308 |
writer.writeEndElement(); // tr |
|
309 |
} |
|
310 |
writer.writeEndElement(); // table |
|
311 |
// } |
|
319 | 312 |
break; |
320 | 313 |
case "Topics": |
321 | 314 |
/*writer.writeStartElement("h2"); |
... | ... | |
393 | 386 |
|
394 | 387 |
String type = parser.getAttributeValue(null, "type") |
395 | 388 |
writer.writeAttribute("type", ""+type) |
389 |
|
|
396 | 390 |
String desc = parser.getAttributeValue(null, "topic") |
397 | 391 |
|
398 |
if (type != null || desc != null) {
|
|
399 |
writer.writeStartElement("h2");
|
|
392 |
if (type != null && type.length() > 0) {
|
|
393 |
writer.writeStartElement("h1");
|
|
400 | 394 |
writer.writeAttribute("class", "section-title") |
401 |
if (type != null || type.length() == 0) { |
|
402 |
writer.writeCharacters(type+": "+desc); |
|
403 |
} else { |
|
404 |
writer.writeCharacters(desc) |
|
405 |
} |
|
395 |
writer.writeCharacters(type); |
|
406 | 396 |
|
407 | 397 |
if (parser.getAttributeValue(null,"startTime") != null) { |
408 |
writer.writeEmptyElement("br"); |
|
409 |
writer.writeStartElement("a") |
|
410 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+parser.getAttributeValue(null,"startTime")+"')"); |
|
411 |
writer.writeAttribute("style", "cursor: pointer;") |
|
412 |
writer.writeAttribute("class", "play-media") |
|
413 |
writer.writeCharacters(" ♪♪") |
|
414 |
writer.writeEndElement() // a |
|
398 |
writeMediaAccess(parser.getAttributeValue(null,"startTime")) |
|
415 | 399 |
} |
416 | 400 |
|
401 |
writer.writeEndElement(); // h1 |
|
402 |
} |
|
403 |
|
|
404 |
if (desc != null && desc.length() > 0) { |
|
405 |
writer.writeStartElement("h2"); |
|
406 |
writer.writeAttribute("class", "section-desc") |
|
407 |
writer.writeCharacters(desc) |
|
417 | 408 |
writer.writeEndElement(); // h2 |
418 | 409 |
} |
419 | 410 |
|
420 |
String metadata = parser.getAttributeValue(null, "metadata") |
|
421 |
if (metadata != null && metadata.length() > 0) { // the metadata to show |
|
422 |
writer.writeStartElement("ul") |
|
423 |
//println "metadata=$metadata" |
|
424 |
for (def m : metadata.split("<li>")) { |
|
425 |
writer.writeStartElement("li") |
|
426 |
writer.writeCharacters(m) |
|
427 |
writer.writeEndElement() // li |
|
411 |
def metadata = new LinkedHashMap<String, String>() // temp to store attributes |
|
412 |
def metadataGroups = ["metadata":[]] // default metadata group |
|
413 |
def metadataDeclared = false |
|
414 |
if (parser.getAttributeValue(null, "metadata") != null && parser.getAttributeValue(null, "metadata_groups") != null) { |
|
415 |
def l1 = parser.getAttributeValue(null, "metadata").split("\\|"); |
|
416 |
def l2 = parser.getAttributeValue(null, "metadata_groups").split("\\|"); |
|
417 |
for (int i = 0 ; i < l1.size() ; i++) { |
|
418 |
def m = l1[i] |
|
419 |
def g = l2[i] |
|
420 |
metadata[m] = "" // forcing order of metadata by pre-declaring |
|
421 |
if (!metadataGroups.containsKey(g)) metadataGroups[g] = [] |
|
422 |
metadataGroups[g] << m // declaring a metadata type |
|
428 | 423 |
} |
429 |
writer.writeEndElement() // ul |
|
430 |
} else if (parser.getAttributeCount() > 1) { // process all attributes |
|
431 |
writer.writeStartElement("ul") |
|
432 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
433 |
String name = parser.getAttributeLocalName(i) |
|
434 |
if (!"type".equals(name) |
|
435 |
&& !"topic".equals(name) |
|
436 |
&& !"startTime".equals(name) |
|
437 |
&& !"endTime".equals(name)) { |
|
438 |
writer.writeStartElement("li") |
|
439 |
writer.writeCharacters(""+name+": "+parser.getAttributeValue(i)) |
|
440 |
writer.writeEndElement() // li |
|
424 |
metadataDeclared = true |
|
425 |
} |
|
426 |
|
|
427 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
428 |
String name = parser.getAttributeLocalName(i) |
|
429 |
if (!"type".equals(name) |
|
430 |
&& !"topic".equals(name) |
|
431 |
&& !"startTime".equals(name) |
|
432 |
&& !"endTime".equals(name)) { |
|
433 |
if (metadataDeclared && !metadata.containsKey(name)) { |
|
434 |
continue; // ignoring metadata since not in declared metadata |
|
435 |
} else { |
|
436 |
metadataGroups["metadata"] << name |
|
441 | 437 |
} |
438 |
|
|
439 |
metadata[name] = parser.getAttributeValue(i) |
|
442 | 440 |
} |
443 |
writer.writeEndElement(); // ul |
|
444 | 441 |
} |
445 | 442 |
|
443 |
writer.writeStartElement("p") |
|
444 |
writer.writeAttribute("class", "section-all-metadata"); |
|
445 |
for (String groupName : metadataGroups.keySet()) { |
|
446 |
def group = metadataGroups[groupName] |
|
447 |
if (group.size() > 0) { |
|
448 |
if (groupName.equals("text")) { |
|
449 |
writer.writeStartElement("p") |
|
450 |
writer.writeAttribute("class", "section-"+groupName); |
|
451 |
// For each key of the group: writes <p class=groupName><h4>key</h4>value</p> |
for (String k : group) { |
|
452 |
writer.writeStartElement("p") |
|
453 |
writer.writeAttribute("class", ""+groupName) |
|
454 |
writer.writeStartElement("h4") |
|
455 |
writer.writeCharacters(k) |
|
456 |
writer.writeEndElement() // h4 |
|
457 |
writer.writeCharacters(metadata[k]) |
|
458 |
writer.writeEndElement() // p |
|
459 |
} |
|
460 |
} else { |
|
461 |
writer.writeStartElement("ul") |
|
462 |
writer.writeAttribute("class", "section-"+groupName); |
|
463 |
for (String k : group) { |
|
464 |
writer.writeStartElement("li") |
|
465 |
writer.writeAttribute("class", ""+groupName) |
|
466 |
writer.writeCharacters(""+k+": "+metadata[k]) |
|
467 |
writer.writeEndElement() // li |
|
468 |
} |
|
469 |
} |
|
470 |
|
|
471 |
writer.writeEndElement(); // ul or p |
|
472 |
} |
|
473 |
} |
|
474 |
writer.writeEndElement(); // p |
|
475 |
|
|
446 | 476 |
break; |
447 | 477 |
case "sp": |
448 | 478 |
endBoldIfNeeded() |
... | ... | |
578 | 608 |
} |
579 | 609 |
previousElem = localname; |
580 | 610 |
break; |
611 |
|
|
581 | 612 |
case XMLStreamConstants.END_ELEMENT: |
582 | 613 |
localname = parser.getLocalName(); |
583 |
switch(localname) |
|
584 |
{ |
|
614 |
switch(localname) { |
|
585 | 615 |
case "text": |
586 | 616 |
break; |
587 | 617 |
case "Topics": |
... | ... | |
722 | 752 |
break; |
723 | 753 |
} |
724 | 754 |
} |
725 |
writer.writeEndElement(); |
|
755 |
writer.writeEndElement(); // body
|
|
726 | 756 |
|
727 | 757 |
writer.writeEmptyElement("pb"); |
728 | 758 |
nbBreak++ |
729 | 759 |
writer.writeAttribute("id", ""+nbBreak); |
730 | 760 |
|
731 |
writer.writeEndElement(); |
|
761 |
writer.writeEndElement(); // html
|
|
732 | 762 |
writer.close(); |
733 | 763 |
output.close(); |
734 | 764 |
if (parser != null) parser.close(); |
... | ... | |
767 | 797 |
writer.writeAttribute("class", "sync"); |
768 | 798 |
writer.writeCharacters(currentTime); |
769 | 799 |
|
800 |
writeMediaAccess(currentTime) |
|
801 |
|
|
802 |
writer.writeEndElement(); // span |
|
803 |
} |
|
804 |
|
|
805 |
private void writeMediaAccess(def time) { |
|
770 | 806 |
writer.writeStartElement("a"); |
771 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+currentTime+"')");
|
|
807 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+time+"')");
|
|
772 | 808 |
writer.writeAttribute("style", "cursor: pointer;") |
773 | 809 |
writer.writeAttribute("class", "play-media") |
774 |
writer.writeCharacters(" ???♪");
|
|
810 |
writer.writeCharacters(" ???");
|
|
775 | 811 |
writer.writeEndElement(); // a |
776 |
|
|
777 |
writer.writeEndElement(); |
|
778 | 812 |
} |
779 | 813 |
|
780 | 814 |
private void writeSpeaker(String spk, boolean overlapping) { |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/AddSections.groovy (revision 2354) | ||
---|---|---|
1 | 1 |
import java.nio.charset.Charset |
2 | 2 |
|
3 |
import org.txm.utils.AsciiUtils |
|
4 |
import org.txm.utils.ConsoleProgressBar |
|
5 |
import org.txm.utils.CsvReader |
|
6 | 3 |
import java.time.LocalTime |
7 | 4 |
import java.time.format.DateTimeFormatter |
5 |
import org.txm.utils.* |
|
8 | 6 |
|
9 |
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/metadata.tsv")
|
|
7 |
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/sujets.xlsx")
|
|
10 | 8 |
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs") |
11 |
def idTRSColumn = "Lien notice principale" |
|
12 |
def typeColumns = ["Identifiant de la notice"] |
|
13 |
def topicColumns = ["Titre propre", "Notes du titre"] |
|
9 |
|
|
10 |
def joinTRSColumn = "Lien notice principale" |
|
11 |
def typeColumns = ["Date de diffusion"] |
|
12 |
def topicColumns = ["Titre propre"] |
|
14 | 13 |
def startTimeColumn = "antract_debut" |
15 | 14 |
def endTimeColumn = "antract_fin" |
16 | 15 |
//def metadataColumns = ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"] |
17 |
def metadataColumns = ["Date de diffusion", "Descripteurs (Aff. Col.)", "Durée", "Générique (Aff. Col.)", "Genre", "Identifiant de la notice", "Langue VO / VE ", "Lien notice principale", "Nature de production ", "Nom fichier segmenté (info)", "Notes du titre ", "Producteurs (Aff.)", "Résumé", "Séquences", "Thématique", "Titre propre", "Type de date", "Type de notice"] |
|
16 |
def metadataColumns = [ |
|
17 |
"Titre propre", "Date de diffusion", "Identifiant de la notice", "Notes du titre", "Type de date", "Durée", "Genre", "Langue VO / VE", "Nature de production", "Producteurs (Aff.)", "Thématique", |
|
18 |
"Nom fichier segmenté (info)", "antract_video", "antract_debut","antract_fin","antract_duree","antract_tc_type","antract_tc_date", |
|
19 |
"Résumé", "Séquences", "Descripteurs (Aff. Lig.)", "Générique (Aff. Lig.)"] |
|
20 |
def metadataColumnsGroups = [ |
|
21 |
"metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", |
|
22 |
"secondary", "secondary", "secondary","secondary","secondary","secondary","secondary", |
|
23 |
"text", "text", "text", "text"] |
|
18 | 24 |
|
25 |
if (metadataColumns.size() != metadataColumnsGroups.size()) { |
|
26 |
println "ERROR in metadata declarations&groups:" |
|
27 |
println "COLUMNS: "+metadataColumns |
|
28 |
println "GROUPS : "+metadataColumnsGroups |
|
29 |
return |
|
30 |
} |
|
31 |
|
|
19 | 32 |
if (!trsDirectory.exists()) { |
20 | 33 |
println "$trsDirectory not found" |
21 | 34 |
return |
22 | 35 |
} |
23 | 36 |
|
24 | 37 |
println "Loading data from $metadataFile..." |
25 |
CsvReader reader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
|
|
38 |
TableReader reader = new TableReader(metadataFile)//, "\t".charAt(0), Charset.forName("UTF-8")
|
|
26 | 39 |
reader.readHeaders() |
27 | 40 |
def header = reader.getHeaders() |
28 |
if (!header.contains(idTRSColumn)) {
|
|
29 |
println "No TRS ID $idTRSColumn column found"
|
|
41 |
if (!header.contains(joinTRSColumn)) {
|
|
42 |
println "No TRS ID $joinTRSColumn column found"
|
|
30 | 43 |
return |
31 | 44 |
} |
32 | 45 |
if (!header.contains(startTimeColumn)) { |
... | ... | |
82 | 95 |
def sectionGroupsToInsert = [:] |
83 | 96 |
println "Reading data..." |
84 | 97 |
while (reader.readRecord()) { |
85 |
String id = reader.get(idTRSColumn).trim()
|
|
98 |
String id = reader.get(joinTRSColumn).trim()
|
|
86 | 99 |
if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4) |
87 | 100 |
if (id.length() == 0) continue; |
88 | 101 |
|
... | ... | |
91 | 104 |
} |
92 | 105 |
def section = sectionGroupsToInsert[id] |
93 | 106 |
|
94 |
|
|
95 | 107 |
if (reader.get(startTimeColumn) != null && reader.get(startTimeColumn).length() > 0) { // ignore non timed sections |
96 | 108 |
|
97 | 109 |
def m = [:] |
... | ... | |
100 | 112 |
def data = [] |
101 | 113 |
for (def col : todo.value) { |
102 | 114 |
if (reader.get(col).trim().length() > 0) { |
103 |
data << col+": "+reader.get(col).trim().replace("\n", "")
|
|
115 |
data << reader.get(col).trim().replace("\n", "") |
|
104 | 116 |
} |
105 | 117 |
} |
106 | 118 |
m[todo.key] = data.join("\t") |
107 | 119 |
} |
108 |
for (def col : metadataColumns) { |
|
109 |
m[AsciiUtils.buildAttributeId(col)] = reader.get(col) |
|
120 |
def metadataList = [] |
|
121 |
def metadataGroupList = [] |
|
122 |
for (int i = 0 ; i < metadataColumns.size() ; i++) { |
|
123 |
def col = metadataColumns[i] |
|
124 |
String c = AsciiUtils.buildAttributeId(col) |
|
125 |
m[c] = reader.get(col) |
|
126 |
metadataList << c |
|
127 |
metadataGroupList << metadataColumnsGroups[i] |
|
110 | 128 |
} |
129 |
m["metadata"] = metadataList.join("|") |
|
130 |
m["metadata_groups"] = metadataGroupList.join("|") |
|
111 | 131 |
|
112 | 132 |
m["startTime"] = strTotime(reader.get(startTimeColumn)) |
113 | 133 |
m["endTime"] = strTotime(reader.get(endTimeColumn)) |
... | ... | |
204 | 224 |
} |
205 | 225 |
} |
206 | 226 |
cpb.done() |
227 |
reader.close() |
|
207 | 228 |
println "Done." |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 2354) | ||
---|---|---|
85 | 85 |
int wordsPerPage = project.getEditionDefinition("default").getWordsPerPage() |
86 | 86 |
String page_element = project.getEditionDefinition("default").getPageElement() |
87 | 87 |
boolean build_edition = project.getEditionDefinition("default").getBuildEdition() |
88 |
boolean update = project.getDoUpdate() |
|
88 | 89 |
|
89 | 90 |
File srcDir = new File(rootDir); |
90 | 91 |
File binDir = project.getProjectDirectory(); |
... | ... | |
95 | 96 |
} |
96 | 97 |
|
97 | 98 |
File txmDir = new File(binDir,"txm/$corpusname"); |
98 |
txmDir.deleteDir(); |
|
99 |
if (!update) txmDir.deleteDir();
|
|
99 | 100 |
txmDir.mkdirs(); |
100 | 101 |
|
101 | 102 |
//get metadata values from CSV |
... | ... | |
115 | 116 |
else { |
116 | 117 |
println "no metadata file: "+allMetadataFile |
117 | 118 |
} |
119 |
|
|
120 |
final HashMap<String, String> textordersInfo = new HashMap<String, String>(); |
|
121 |
for (String t : metadatas.keySet()) { |
|
122 |
def ti = metadatas.get(t) |
|
123 |
for (org.txm.metadatas.Entry e : ti) { |
|
124 |
if ("textorder".equals(e.getId())) { |
|
125 |
textordersInfo[t+".trs"] = ti.value() |
|
126 |
} |
|
127 |
} |
|
128 |
} |
|
118 | 129 |
|
119 | 130 |
File propertyFile = new File(srcDir, "import.properties")//default |
120 | 131 |
Properties props = new Properties(); |
... | ... | |
123 | 134 |
FileInputStream input = new FileInputStream(propertyFile); |
124 | 135 |
props.load(input); |
125 | 136 |
input.close(); |
126 |
|
|
137 |
|
|
127 | 138 |
if (props.getProperty("removeInterviewer") != null) |
128 | 139 |
removeInterviewer = Boolean.parseBoolean(props.get("removeInterviewer").toString()); |
129 | 140 |
if (props.getProperty("ignoreTranscriberMetadata") != null) |
... | ... | |
134 | 145 |
csvHeaderNumber = props.get("csvHeaderNumber").toString().split("|"); |
135 | 146 |
//if (props.getProperty("includeComments") != null) |
136 | 147 |
// includeComments = props.get("includeComments").toString(); |
137 |
|
|
148 |
|
|
138 | 149 |
println "import properties: " |
139 | 150 |
println " removeInterviewer: "+removeInterviewer |
140 | 151 |
println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata |
... | ... | |
145 | 156 |
|
146 | 157 |
|
147 | 158 |
|
148 |
// Apply XSL |
|
149 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
150 |
if (MONITOR != null) MONITOR.worked(1, "XSL") |
|
151 |
if (xsl != null && xsl.trim().length() > 0) { |
|
152 |
if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"))) |
|
153 |
srcDir = new File(binDir, "src"); |
|
154 |
println "" |
|
155 |
} |
|
156 | 159 |
|
157 |
try { |
|
158 |
// select only trs files |
|
159 |
String ext = "trs"; |
|
160 |
ArrayList<File> trsfiles = srcDir.listFiles(); //find all trs files |
|
161 |
if (trsfiles == null) { |
|
162 |
println ("No files in "+srcDir.getAbsolutePath()) |
|
163 |
return false; |
|
164 |
} |
|
165 |
for (int i = 0 ; i < trsfiles.size() ; i++) { |
|
166 |
File f = trsfiles.get(i); |
|
167 |
if (!f.getName().endsWith(ext) || !f.canRead() || f.isHidden()) { |
|
168 |
trsfiles.remove(i) |
|
169 |
i--; |
|
170 |
} |
|
171 |
} |
|
172 | 160 |
|
173 |
if (trsfiles.size() == 0) { |
|
174 |
println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.") |
|
175 |
return false; |
|
176 |
} |
|
177 |
|
|
178 |
if (MONITOR != null) MONITOR.worked(1, "IMPORTER") |
|
179 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
180 |
println "-- IMPORTER" |
|
181 |
def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir |
|
182 |
if (!imp.run()) { |
|
183 |
println "Failed to prepare files - Aborting"; |
|
184 |
return; |
|
185 |
} |
|
186 |
if (MONITOR != null) MONITOR.worked(20) |
|
187 |
|
|
188 |
println "-- Xml Validation" |
|
189 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
190 |
for (File infile : txmDir.listFiles()) { |
|
191 |
if (!ValidateXml.test(infile)) { |
|
192 |
println "$infile : Validation failed"; |
|
193 |
infile.delete(); |
|
161 |
try { |
|
162 |
if (!update) { |
|
163 |
// Apply XSL |
|
164 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
165 |
if (MONITOR != null) MONITOR.worked(1, "XSL") |
|
166 |
if (xsl != null && xsl.trim().length() > 0) { |
|
167 |
if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"))) |
|
168 |
srcDir = new File(binDir, "src"); |
|
169 |
println "" |
|
170 |
} |
|
171 |
|
|
172 |
// select only trs files |
|
173 |
String ext = "trs"; |
|
174 |
ArrayList<File> trsfiles = srcDir.listFiles(); //find all trs files |
|
175 |
if (trsfiles == null) { |
|
176 |
println ("No files in "+srcDir.getAbsolutePath()) |
|
177 |
return false; |
|
194 | 178 |
} |
195 |
} |
|
196 |
|
|
197 |
if (MONITOR != null) MONITOR.worked(5) |
|
198 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
199 |
println "-- Remove interviewer: "+removeInterviewer |
|
200 |
if (removeInterviewer) { |
|
201 |
if (metadatas == null) { |
|
202 |
println "Can't remove interviewer without a metadata.csv file defining who are the interviewers." |
|
203 |
} else { |
|
204 |
println "Removing some speakers in "+txmDir.listFiles().length+" file(s)" |
|
205 |
for (File infile : txmDir.listFiles()) { |
|
206 |
String filename = infile.getName(); |
|
207 |
int idx = filename.indexOf(".xml"); |
|
208 |
if (idx > 0) |
|
209 |
filename = filename.substring(0, idx); |
|
210 |
|
|
211 |
ArrayList<Pair<String, String>> metas = metadatas.get(filename) |
|
212 |
//println "filename=$filename metas= $metas" |
|
213 |
for (Pair p : metas) { |
|
214 |
if (p.getFirst().startsWith("enq")) { |
|
215 |
new RemoveSpeaker(infile, infile, p.getFirst()) |
|
179 |
for (int i = 0 ; i < trsfiles.size() ; i++) { |
|
180 |
File f = trsfiles.get(i); |
|
181 |
if (!f.getName().endsWith(ext) || !f.canRead() || f.isHidden()) { |
|
182 |
trsfiles.remove(i) |
|
183 |
i--; |
|
184 |
} |
|
185 |
} |
|
186 |
|
|
187 |
if (trsfiles.size() == 0) { |
|
188 |
println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.") |
|
189 |
return false; |
|
190 |
} |
|
191 |
|
|
192 |
if (MONITOR != null) MONITOR.worked(1, "IMPORTER") |
|
193 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
194 |
println "-- IMPORTER" |
|
195 |
def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir |
|
196 |
if (!imp.run()) { |
|
197 |
println "Failed to prepare files - Aborting"; |
|
198 |
return; |
|
199 |
} |
|
200 |
if (MONITOR != null) MONITOR.worked(20) |
|
201 |
|
|
202 |
println "-- Xml Validation" |
|
203 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
204 |
for (File infile : txmDir.listFiles()) { |
|
205 |
if (!ValidateXml.test(infile)) { |
|
206 |
println "$infile : Validation failed"; |
|
207 |
infile.delete(); |
|
208 |
} |
|
209 |
} |
|
210 |
|
|
211 |
if (MONITOR != null) MONITOR.worked(5) |
|
212 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
213 |
println "-- Remove interviewer: "+removeInterviewer |
|
214 |
if (removeInterviewer) { |
|
215 |
if (metadatas == null) { |
|
216 |
println "Can't remove interviewer without a metadata.csv file defining who are the interviewers." |
|
217 |
} else { |
|
218 |
println "Removing some speakers in "+txmDir.listFiles().length+" file(s)" |
|
219 |
for (File infile : txmDir.listFiles()) { |
|
220 |
String filename = infile.getName(); |
|
221 |
int idx = filename.indexOf(".xml"); |
|
222 |
if (idx > 0) |
|
223 |
filename = filename.substring(0, idx); |
|
224 |
|
|
225 |
ArrayList<Pair<String, String>> metas = metadatas.get(filename) |
|
226 |
//println "filename=$filename metas= $metas" |
|
227 |
for (Pair p : metas) { |
|
228 |
if (p.getFirst().startsWith("enq")) { |
|
229 |
new RemoveSpeaker(infile, infile, p.getFirst()) |
|
230 |
} |
|
216 | 231 |
} |
217 |
} |
|
232 |
} |
|
233 |
} |
|
234 |
} |
|
235 |
|
|
236 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
237 |
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE") |
|
238 |
|
|
239 |
boolean annotationSuccess = false; |
|
240 |
if (annotate) { |
|
241 |
println "-- ANNOTATE - Running NLP tools" |
|
242 |
def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger") |
|
243 |
if (engine.processDirectory(txmDir, binDir, ["lang":model])) { |
|
244 |
annotationSuccess = true; |
|
218 | 245 |
} |
219 | 246 |
} |
220 |
} |
|
221 |
|
|
247 |
} |
|
248 |
trsfiles = new ArrayList<File>(Arrays.asList(txmDir.listFiles())); |
|
249 |
|
|
250 |
if (metadatas != null && metadatas.getPropertyNames().contains("textorder")) { |
|
251 |
// Sorts trsfiles by their declared "textorder" value; files without a value go last, |
// and ties (or two missing values) fall back to the natural File ordering. |
// NOTE(review): the lookup map is filled with "<text id>.trs" keys while trsfiles is |
// listed from txmDir -- confirm the file names really match those keys. |
// NOTE(review): values are compared as Strings, so "10" sorts before "2" -- confirm intended. |
Collections.sort(trsfiles, new Comparator<File>() { |
|
252 |
public int compare(File f1, File f2) { |
|
253 |
String o1 = textordersInfo[f1.getName()]; // was "textorder", which is not declared in the visible script |
|
254 |
String o2 = textordersInfo[f2.getName()]; |
|
255 |
if (o1 == null && o2 == null) { |
|
256 |
return f1.compareTo(f2); |
|
257 |
} else if (o1 == null) { |
|
258 |
return 1 |
|
259 |
} else if (o2 == null) { |
|
260 |
return -1 |
|
261 |
} else { |
|
262 |
int c = o1.compareTo(o2); |
|
263 |
if (c == 0) return f1.compareTo(f2); |
|
264 |
else return c; |
|
265 |
} |
|
266 |
} |
|
267 |
}); |
|
268 |
} else { |
|
269 |
Collections.sort(trsfiles); |
|
270 |
} |
|
271 |
|
|
222 | 272 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
223 |
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE") |
|
224 |
|
|
225 |
boolean annotationSuccess = false; |
|
226 |
if (annotate) { |
|
227 |
println "-- ANNOTATE - Running NLP tools" |
|
228 |
def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger") |
|
229 |
if (engine.processDirectory(txmDir, binDir, ["lang":model])) { |
|
230 |
annotationSuccess = true; |
|
231 |
} |
|
232 |
} |
|
233 |
|
|
234 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
235 | 273 |
if (MONITOR != null) MONITOR.worked(25, "COMPILING") |
236 | 274 |
println "--COMPILING - Building Search Engine indexes" |
237 |
trsfiles = txmDir.listFiles(); |
|
238 |
|
|
275 |
|
|
239 | 276 |
def comp = new compiler() |
240 | 277 |
if(debug) comp.setDebug(); |
241 | 278 |
comp.removeInterviewers(removeInterviewer); |
242 | 279 |
comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata); |
243 |
comp.setAnnotationSucces(annotationSuccess) |
|
244 | 280 |
if (!comp.run(project, trsfiles, corpusname, "default", binDir)) { |
245 | 281 |
println "Failed to compile files"; |
246 | 282 |
return; |
247 | 283 |
} |
248 |
|
|
284 |
|
|
249 | 285 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
250 |
|
|
286 |
|
|
251 | 287 |
File htmlDir = new File(binDir,"HTML/$corpusname"); |
252 | 288 |
htmlDir.deleteDir() |
253 | 289 |
htmlDir.mkdirs(); |
254 | 290 |
if (build_edition) { |
255 |
|
|
291 |
|
|
256 | 292 |
if (MONITOR != null) MONITOR.worked(20, "EDITION") |
257 | 293 |
println "-- EDITION - Building editions" |
258 |
|
|
259 |
List<File> filelist = txmDir.listFiles(); |
|
260 |
Collections.sort(filelist); |
|
294 |
|
|
261 | 295 |
def second = 0 |
262 |
|
|
263 |
println "Paginating "+filelist.size()+" texts"
|
|
264 |
ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
|
|
265 |
for (File txmFile : filelist) {
|
|
296 |
|
|
297 |
println "Paginating "+trsfiles.size()+" texts"
|
|
298 |
ConsoleProgressBar cpb = new ConsoleProgressBar(trsfiles.size());
|
|
299 |
for (File txmFile : trsfiles) {
|
|
266 | 300 |
cpb.tick() |
267 | 301 |
String txtname = txmFile.getName(); |
268 | 302 |
int i = txtname.lastIndexOf("."); |
269 | 303 |
if(i > 0) txtname = txtname.substring(0, i); |
270 |
|
|
304 |
|
|
271 | 305 |
List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang); |
272 | 306 |
List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang); |
273 |
|
|
274 |
Text t = new Text(project); |
|
275 |
t.setName(txtname); |
|
276 |
t.setSourceFile(txmFile) |
|
277 |
t.setTXMFile(txmFile) |
|
278 |
|
|
307 |
|
|
308 |
Text t = project.getText(txtname) |
|
309 |
// No Text came back for this name: create one and initialise it. |
if (t == null) { |
|
310 |
t = new Text(project); // keep the new instance; without the assignment t stays null and the setters below throw |
|
311 |
t.setName(txtname); |
|
312 |
t.setSourceFile(txmFile) |
|
313 |
t.setTXMFile(txmFile) |
|
314 |
} |
|
279 | 315 |
def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas); |
280 | 316 |
Edition edition = new Edition(t); |
281 | 317 |
edition.setName("default"); |
... | ... | |
288 | 324 |
} |
289 | 325 |
} |
290 | 326 |
cpb.done() |
291 |
|
|
327 |
|
|
292 | 328 |
//copy transcriber.css |
293 | 329 |
File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css") |
294 | 330 |
File cssTXMFile = new File(Toolbox.getTxmHomePath(), "css/txm.css") |
... | ... | |
299 | 335 |
} |
300 | 336 |
|
301 | 337 |
//copy media files |
302 |
println "Copying media files if any (mp3, wav, mp4 or avi) "+filelist.size()+" texts"
|
|
303 |
cpb = new ConsoleProgressBar(filelist.size());
|
|
304 |
for (File txmFile : filelist) {
|
|
338 |
println "Copying media files if any (mp3, wav, mp4 or avi) "+trsfiles.size()+" texts"
|
|
339 |
cpb = new ConsoleProgressBar(trsfiles.size());
|
|
340 |
for (File txmFile : trsfiles) {
|
|
305 | 341 |
cpb.tick() |
306 | 342 |
String txtname = txmFile.getName(); |
307 | 343 |
int i = txtname.lastIndexOf("."); |
... | ... | |
310 | 346 |
if (!mediaFile.exists()) mediaFile = new File(project.getSrcdir(), txtname + ".wav") |
311 | 347 |
if (!mediaFile.exists()) mediaFile = new File(project.getSrcdir(), txtname + ".mp4") |
312 | 348 |
if (!mediaFile.exists()) mediaFile = new File(project.getSrcdir(), txtname + ".avi") |
313 |
|
|
349 |
|
|
314 | 350 |
if (mediaFile.exists()) { |
315 | 351 |
File copy = new File(binDir, "media/"+mediaFile.getName()) |
316 | 352 |
copy.getParentFile().mkdirs() |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 2354) | ||
---|---|---|
73 | 73 |
/** The debug. */ |
74 | 74 |
boolean debug = false; |
75 | 75 |
|
76 |
/** The annotation succes. */ |
|
77 |
boolean annotationSucces = false; |
|
78 |
|
|
79 | 76 |
/** The removeinterviewers. */ |
80 | 77 |
boolean removeinterviewers = false; |
81 | 78 |
|
... | ... | |
162 | 159 |
|
163 | 160 |
println("Compiling "+xmlfiles.size()+" files") |
164 | 161 |
ConsoleProgressBar cpb = new ConsoleProgressBar(xmlfiles.size()) |
165 |
for (File f :xmlfiles) {
|
|
166 |
if (f.exists()) {
|
|
162 |
for (File txmFile :xmlfiles) {
|
|
163 |
if (txmFile.exists()) {
|
|
167 | 164 |
cpb.tick() |
168 |
if (!process(f)) {
|
|
169 |
println("Failed to compile "+f)
|
|
165 |
if (!process(txmFile)) {
|
|
166 |
println("Failed to compile "+txmFile)
|
|
170 | 167 |
} |
171 | 168 |
} |
172 | 169 |
} |
... | ... | |
680 | 677 |
debug = true; |
681 | 678 |
} |
682 | 679 |
|
683 |
/** |
|
684 |
* Sets the annotation success. |
|
685 |
* |
|
686 |
* @param val the new annotation success |
|
687 |
*/ |
|
688 |
public void setAnnotationSuccess(boolean val) { |
|
689 |
annotationSucces = val |
|
690 |
} |
|
691 | 680 |
} |
Formats disponibles : Unified diff