Révision 2354
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2354) | ||
|---|---|---|
| 33 | 33 |
import javax.xml.stream.* |
| 34 | 34 |
|
| 35 | 35 |
import org.txm.importer.ApplyXsl2 |
| 36 |
import org.txm.metadatas.MetadataGroup |
|
| 36 | 37 |
import org.txm.metadatas.Metadatas |
| 37 | 38 |
import org.txm.utils.io.FileCopy; |
| 38 | 39 |
|
| ... | ... | |
| 229 | 230 |
writer.writeAttribute("http-equiv", "Content-Type");
|
| 230 | 231 |
writer.writeAttribute("content", "text/html");
|
| 231 | 232 |
writer.writeAttribute("charset", "UTF-8");
|
| 232 |
writer.writeEndElement(); |
|
| 233 |
writer.writeEndElement(); // meta
|
|
| 233 | 234 |
writer.writeStartElement("head");
|
| 234 |
|
|
| 235 | 235 |
//<link rel="stylesheet" type="text/css" href="class.css" /> |
| 236 | 236 |
writer.writeStartElement("link");
|
| 237 | 237 |
writer.writeAttribute("rel", "stylesheet");
|
| 238 | 238 |
writer.writeAttribute("type", "text/css");
|
| 239 | 239 |
writer.writeAttribute("href", "transcriber.css");
|
| 240 |
writer.writeEndElement(); |
|
| 241 |
writer.writeEndElement(); |
|
| 240 |
writer.writeEndElement(); // link
|
|
| 241 |
writer.writeEndElement(); // head
|
|
| 242 | 242 |
|
| 243 | 243 |
nbBreak++ |
| 244 | 244 |
writer.writeStartElement("body");
|
| ... | ... | |
| 254 | 254 |
switch (localname) {
|
| 255 | 255 |
case "text": |
| 256 | 256 |
|
| 257 |
writer.writeStartElement("h2");
|
|
| 257 |
writer.writeStartElement("h1");
|
|
| 258 | 258 |
writer.writeAttribute("class","title");
|
| 259 | 259 |
String title = parser.getAttributeValue(null, "title"); |
| 260 | 260 |
|
| ... | ... | |
| 264 | 264 |
writer.writeCharacters("Transcription "+txmfile.getName().substring(0, txmfile.getName().length() - 4));
|
| 265 | 265 |
} |
| 266 | 266 |
|
| 267 |
writer.writeEmptyElement("br");
|
|
| 268 |
writer.writeStartElement("a");
|
|
| 269 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '0.0')");
|
|
| 270 |
writer.writeAttribute("style", "cursor: pointer;")
|
|
| 271 |
writer.writeAttribute("class", "play-media")
|
|
| 272 |
writer.writeCharacters(" ♪♪");
|
|
| 273 |
writer.writeEndElement(); // a |
|
| 267 |
writeMediaAccess("0.0")
|
|
| 274 | 268 |
|
| 275 |
writer.writeEndElement(); // h2
|
|
| 269 |
writer.writeEndElement(); // h1
|
|
| 276 | 270 |
|
| 277 | 271 |
String subtitle = parser.getAttributeValue(null, "subtitle"); |
| 278 | 272 |
if (subtitle != null && subtitle.length() > 0) {
|
| 279 |
writer.writeStartElement("h3");
|
|
| 273 |
writer.writeStartElement("h2");
|
|
| 280 | 274 |
writer.writeAttribute("class", "subtitle");
|
| 281 | 275 |
writer.writeCharacters(subtitle); |
| 282 |
writer.writeEndElement(); // h3
|
|
| 276 |
writer.writeEndElement(); // h2
|
|
| 283 | 277 |
} |
| 284 | 278 |
|
| 285 |
// println "metadatas != null: "+(metadatas != null) |
|
| 286 |
// if (metadatas != null) {
|
|
| 287 |
writer.writeStartElement("table");
|
|
| 288 |
boolean grey = false; |
|
| 289 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
|
|
| 290 |
String name = parser.getAttributeName(i); |
|
| 291 |
String value = parser.getAttributeValue(i); |
|
| 292 |
|
|
| 293 |
if ("title" == name) {
|
|
| 294 |
continue; // ignore "title" metadata |
|
| 295 |
} |
|
| 296 |
|
|
| 297 |
grey = !grey; |
|
| 298 |
writer.writeStartElement("tr");
|
|
| 299 |
if (grey) {
|
|
| 300 |
writer.writeAttribute("style","background-color:lightgrey;")
|
|
| 301 |
} |
|
| 302 |
|
|
| 303 |
if (value != null) {
|
|
| 304 |
writer.writeStartElement("td");
|
|
| 305 |
writer.writeCharacters(name); |
|
| 306 |
writer.writeEndElement(); |
|
| 307 |
writer.writeStartElement("td");
|
|
| 308 |
writer.writeCharacters(value); |
|
| 309 |
writer.writeEndElement(); |
|
| 310 |
} |
|
| 311 |
//get enqueteur to style their names |
|
| 312 |
if (name.startsWith("enq")) {
|
|
| 313 |
interviewers.add(value) |
|
| 314 |
} |
|
| 315 |
writer.writeEndElement(); |
|
| 279 |
writer.writeStartElement("table");
|
|
| 280 |
writer.writeAttribute("class", "transcription-table");
|
|
| 281 |
boolean grey = false; |
|
| 282 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
|
|
| 283 |
String name = parser.getAttributeName(i); |
|
| 284 |
String value = parser.getAttributeValue(i); |
|
| 285 |
|
|
| 286 |
if ("title" == name) {
|
|
| 287 |
continue; // ignore "title" metadata |
|
| 316 | 288 |
} |
| 317 |
writer.writeEndElement(); |
|
| 318 |
// } |
|
| 289 |
|
|
| 290 |
grey = !grey; |
|
| 291 |
writer.writeStartElement("tr");
|
|
| 292 |
if (grey) {
|
|
| 293 |
writer.writeAttribute("style","background-color:lightgrey;")
|
|
| 294 |
} |
|
| 295 |
|
|
| 296 |
if (value != null) {
|
|
| 297 |
writer.writeStartElement("td");
|
|
| 298 |
writer.writeCharacters(name); |
|
| 299 |
writer.writeEndElement(); // td |
|
| 300 |
writer.writeStartElement("td");
|
|
| 301 |
writer.writeCharacters(value); |
|
| 302 |
writer.writeEndElement(); // td |
|
| 303 |
} |
|
| 304 |
//get enqueteur to style their names |
|
| 305 |
if (name.startsWith("enq")) {
|
|
| 306 |
interviewers.add(value) |
|
| 307 |
} |
|
| 308 |
writer.writeEndElement(); // tr |
|
| 309 |
} |
|
| 310 |
writer.writeEndElement(); // table |
|
| 311 |
// } |
|
| 319 | 312 |
break; |
| 320 | 313 |
case "Topics": |
| 321 | 314 |
/*writer.writeStartElement("h2");
|
| ... | ... | |
| 393 | 386 |
|
| 394 | 387 |
String type = parser.getAttributeValue(null, "type") |
| 395 | 388 |
writer.writeAttribute("type", ""+type)
|
| 389 |
|
|
| 396 | 390 |
String desc = parser.getAttributeValue(null, "topic") |
| 397 | 391 |
|
| 398 |
if (type != null || desc != null) {
|
|
| 399 |
writer.writeStartElement("h2");
|
|
| 392 |
if (type != null && type.length() > 0) {
|
|
| 393 |
writer.writeStartElement("h1");
|
|
| 400 | 394 |
writer.writeAttribute("class", "section-title")
|
| 401 |
if (type != null || type.length() == 0) {
|
|
| 402 |
writer.writeCharacters(type+": "+desc); |
|
| 403 |
} else {
|
|
| 404 |
writer.writeCharacters(desc) |
|
| 405 |
} |
|
| 395 |
writer.writeCharacters(type); |
|
| 406 | 396 |
|
| 407 | 397 |
if (parser.getAttributeValue(null,"startTime") != null) {
|
| 408 |
writer.writeEmptyElement("br");
|
|
| 409 |
writer.writeStartElement("a")
|
|
| 410 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+parser.getAttributeValue(null,"startTime")+"')");
|
|
| 411 |
writer.writeAttribute("style", "cursor: pointer;")
|
|
| 412 |
writer.writeAttribute("class", "play-media")
|
|
| 413 |
writer.writeCharacters(" ♪♪")
|
|
| 414 |
writer.writeEndElement() // a |
|
| 398 |
writeMediaAccess(parser.getAttributeValue(null,"startTime")) |
|
| 415 | 399 |
} |
| 416 | 400 |
|
| 401 |
writer.writeEndElement(); // h1 |
|
| 402 |
} |
|
| 403 |
|
|
| 404 |
if (desc != null && desc.length() > 0) {
|
|
| 405 |
writer.writeStartElement("h2");
|
|
| 406 |
writer.writeAttribute("class", "section-desc")
|
|
| 407 |
writer.writeCharacters(desc) |
|
| 417 | 408 |
writer.writeEndElement(); // h2 |
| 418 | 409 |
} |
| 419 | 410 |
|
| 420 |
String metadata = parser.getAttributeValue(null, "metadata") |
|
| 421 |
if (metadata != null && metadata.length() > 0) { // the metadata to show
|
|
| 422 |
writer.writeStartElement("ul")
|
|
| 423 |
//println "metadata=$metadata" |
|
| 424 |
for (def m : metadata.split("<li>")) {
|
|
| 425 |
writer.writeStartElement("li")
|
|
| 426 |
writer.writeCharacters(m) |
|
| 427 |
writer.writeEndElement() // li |
|
| 411 |
def metadata = new LinkedHashMap<String, String>() // temp to store attributes |
|
| 412 |
def metadataGroups = ["metadata":[]] // default metadata group |
|
| 413 |
def metadataDeclared = false |
|
| 414 |
if (parser.getAttributeValue(null, "metadata") != null && parser.getAttributeValue(null, "metadata_groups") != null) {
|
|
| 415 |
def l1 = parser.getAttributeValue(null, "metadata").split("\\|");
|
|
| 416 |
def l2 = parser.getAttributeValue(null, "metadata_groups").split("\\|");
|
|
| 417 |
// Pre-declare every metadata listed in the "metadata" attribute (l1) and
// register it in the group found at the same position in "metadata_groups" (l2).
for (int i = 0 ; i < l1.size() ; i++) {
	def m = l1[i]
	// l1 and l2 are parallel lists; when the groups list is shorter, fall back to the
	// default "metadata" group instead of failing on an out-of-range array access
	def g = (i < l2.size()) ? l2[i] : "metadata"
	metadata[m] = "" // forcing order of metadata by pre-declaring
	if (!metadataGroups.containsKey(g)) metadataGroups[g] = []
	metadataGroups[g] << m // declaring a metadata type
}
| 429 |
writer.writeEndElement() // ul |
|
| 430 |
} else if (parser.getAttributeCount() > 1) { // process all attributes
|
|
| 431 |
writer.writeStartElement("ul")
|
|
| 432 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
|
|
| 433 |
String name = parser.getAttributeLocalName(i) |
|
| 434 |
if (!"type".equals(name) |
|
| 435 |
&& !"topic".equals(name) |
|
| 436 |
&& !"startTime".equals(name) |
|
| 437 |
&& !"endTime".equals(name)) {
|
|
| 438 |
writer.writeStartElement("li")
|
|
| 439 |
writer.writeCharacters(""+name+": "+parser.getAttributeValue(i))
|
|
| 440 |
writer.writeEndElement() // li |
|
| 424 |
metadataDeclared = true |
|
| 425 |
} |
|
| 426 |
|
|
| 427 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
|
|
| 428 |
String name = parser.getAttributeLocalName(i) |
|
| 429 |
if (!"type".equals(name) |
|
| 430 |
&& !"topic".equals(name) |
|
| 431 |
&& !"startTime".equals(name) |
|
| 432 |
&& !"endTime".equals(name)) {
|
|
| 433 |
// With a metadata declaration, keep only the declared attributes; without one,
// collect every remaining attribute in the default "metadata" group.
if (metadataDeclared) {
	if (!metadata.containsKey(name)) {
		continue; // ignoring metadata since not in declared metadata
	}
	// declared metadata were already registered in their own group while pre-declaring:
	// adding them to the default group as well would render them twice
} else {
	metadataGroups["metadata"] << name
}
| 438 |
|
|
| 439 |
metadata[name] = parser.getAttributeValue(i) |
|
| 442 | 440 |
} |
| 443 |
writer.writeEndElement(); // ul |
|
| 444 | 441 |
} |
| 445 | 442 |
|
| 443 |
// Write all the section metadata inside one container paragraph, one child element per
// non-empty group: the "text" group as a <p> of titled paragraphs, any other group as
// a <ul> of "name: value" items.
writer.writeStartElement("p")
writer.writeAttribute("class", "section-all-metadata");
for (def groupEntry : metadataGroups.entrySet()) {
	String groupName = groupEntry.getKey()
	def members = groupEntry.getValue()
	if (members.isEmpty()) {
		continue // nothing is written for an empty group
	}

	boolean asParagraphs = "text".equals(groupName)
	writer.writeStartElement(asParagraphs ? "p" : "ul")
	writer.writeAttribute("class", "section-"+groupName);

	for (String key : members) {
		if (asParagraphs) {
			// one paragraph per metadata: the name in a <h4>, then the value
			writer.writeStartElement("p")
			writer.writeAttribute("class", ""+groupName)
			writer.writeStartElement("h4")
			writer.writeCharacters(key)
			writer.writeEndElement() // h4
			writer.writeCharacters(metadata[key])
			writer.writeEndElement() // p
		} else {
			// one list item per metadata
			writer.writeStartElement("li")
			writer.writeAttribute("class", ""+groupName)
			writer.writeCharacters(""+key+": "+metadata[key])
			writer.writeEndElement() // li
		}
	}

	writer.writeEndElement(); // ul or p
}
writer.writeEndElement(); // p (section-all-metadata)
|
| 475 |
|
|
| 446 | 476 |
break; |
| 447 | 477 |
case "sp": |
| 448 | 478 |
endBoldIfNeeded() |
| ... | ... | |
| 578 | 608 |
} |
| 579 | 609 |
previousElem = localname; |
| 580 | 610 |
break; |
| 611 |
|
|
| 581 | 612 |
case XMLStreamConstants.END_ELEMENT: |
| 582 | 613 |
localname = parser.getLocalName(); |
| 583 |
switch(localname) |
|
| 584 |
{
|
|
| 614 |
switch(localname) {
|
|
| 585 | 615 |
case "text": |
| 586 | 616 |
break; |
| 587 | 617 |
case "Topics": |
| ... | ... | |
| 722 | 752 |
break; |
| 723 | 753 |
} |
| 724 | 754 |
} |
| 725 |
writer.writeEndElement(); |
|
| 755 |
writer.writeEndElement(); // body
|
|
| 726 | 756 |
|
| 727 | 757 |
writer.writeEmptyElement("pb");
|
| 728 | 758 |
nbBreak++ |
| 729 | 759 |
writer.writeAttribute("id", ""+nbBreak);
|
| 730 | 760 |
|
| 731 |
writer.writeEndElement(); |
|
| 761 |
writer.writeEndElement(); // html
|
|
| 732 | 762 |
writer.close(); |
| 733 | 763 |
output.close(); |
| 734 | 764 |
if (parser != null) parser.close(); |
| ... | ... | |
| 767 | 797 |
writer.writeAttribute("class", "sync");
|
| 768 | 798 |
writer.writeCharacters(currentTime); |
| 769 | 799 |
|
| 800 |
writeMediaAccess(currentTime) |
|
| 801 |
|
|
| 802 |
writer.writeEndElement(); // span |
|
| 803 |
} |
|
| 804 |
|
|
| 805 |
private void writeMediaAccess(def time) {
|
|
| 770 | 806 |
writer.writeStartElement("a");
|
| 771 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+currentTime+"')");
|
|
| 807 |
writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+time+"')");
|
|
| 772 | 808 |
writer.writeAttribute("style", "cursor: pointer;")
|
| 773 | 809 |
writer.writeAttribute("class", "play-media")
|
| 774 |
writer.writeCharacters(" ???♪");
|
|
| 810 |
writer.writeCharacters(" ???");
|
|
| 775 | 811 |
writer.writeEndElement(); // a |
| 776 |
|
|
| 777 |
writer.writeEndElement(); |
|
| 778 | 812 |
} |
| 779 | 813 |
|
| 780 | 814 |
private void writeSpeaker(String spk, boolean overlapping) {
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/AddSections.groovy (revision 2354) | ||
|---|---|---|
| 1 | 1 |
import java.nio.charset.Charset |
| 2 | 2 |
|
| 3 |
import org.txm.utils.AsciiUtils |
|
| 4 |
import org.txm.utils.ConsoleProgressBar |
|
| 5 |
import org.txm.utils.CsvReader |
|
| 6 | 3 |
import java.time.LocalTime |
| 7 | 4 |
import java.time.format.DateTimeFormatter |
| 5 |
import org.txm.utils.* |
|
| 8 | 6 |
|
| 9 |
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/metadata.tsv")
|
|
| 7 |
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/sujets.xlsx")
|
|
| 10 | 8 |
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")
|
| 11 |
def idTRSColumn = "Lien notice principale" |
|
| 12 |
def typeColumns = ["Identifiant de la notice"] |
|
| 13 |
def topicColumns = ["Titre propre", "Notes du titre"] |
|
| 9 |
|
|
| 10 |
def joinTRSColumn = "Lien notice principale" |
|
| 11 |
def typeColumns = ["Date de diffusion"] |
|
| 12 |
def topicColumns = ["Titre propre"] |
|
| 14 | 13 |
def startTimeColumn = "antract_debut" |
| 15 | 14 |
def endTimeColumn = "antract_fin" |
| 16 | 15 |
//def metadataColumns = ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"] |
| 17 |
def metadataColumns = ["Date de diffusion", "Descripteurs (Aff. Col.)", "Durée", "Générique (Aff. Col.)", "Genre", "Identifiant de la notice", "Langue VO / VE ", "Lien notice principale", "Nature de production ", "Nom fichier segmenté (info)", "Notes du titre ", "Producteurs (Aff.)", "Résumé", "Séquences", "Thématique", "Titre propre", "Type de date", "Type de notice"] |
|
| 16 |
def metadataColumns = [ |
|
| 17 |
"Titre propre", "Date de diffusion", "Identifiant de la notice", "Notes du titre", "Type de date", "Durée", "Genre", "Langue VO / VE", "Nature de production", "Producteurs (Aff.)", "Thématique", |
|
| 18 |
"Nom fichier segmenté (info)", "antract_video", "antract_debut","antract_fin","antract_duree","antract_tc_type","antract_tc_date", |
|
| 19 |
"Résumé", "Séquences", "Descripteurs (Aff. Lig.)", "Générique (Aff. Lig.)"] |
|
| 20 |
def metadataColumnsGroups = [ |
|
| 21 |
"metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", "metadata", |
|
| 22 |
"secondary", "secondary", "secondary","secondary","secondary","secondary","secondary", |
|
| 23 |
"text", "text", "text", "text"] |
|
| 18 | 24 |
|
| 25 |
if (metadataColumns.size() != metadataColumnsGroups.size()) {
|
|
| 26 |
println "ERROR in metadata declarations&groups:" |
|
| 27 |
println "COLUMNS: "+metadataColumns |
|
| 28 |
println "GROUPS : "+metadataColumnsGroups |
|
| 29 |
return |
|
| 30 |
} |
|
| 31 |
|
|
| 19 | 32 |
if (!trsDirectory.exists()) {
|
| 20 | 33 |
println "$trsDirectory not found" |
| 21 | 34 |
return |
| 22 | 35 |
} |
| 23 | 36 |
|
| 24 | 37 |
println "Loading data from $metadataFile..." |
| 25 |
CsvReader reader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
|
|
| 38 |
TableReader reader = new TableReader(metadataFile)//, "\t".charAt(0), Charset.forName("UTF-8")
|
|
| 26 | 39 |
reader.readHeaders() |
| 27 | 40 |
def header = reader.getHeaders() |
| 28 |
if (!header.contains(idTRSColumn)) {
|
|
| 29 |
println "No TRS ID $idTRSColumn column found"
|
|
| 41 |
if (!header.contains(joinTRSColumn)) {
|
|
| 42 |
println "No TRS ID $joinTRSColumn column found"
|
|
| 30 | 43 |
return |
| 31 | 44 |
} |
| 32 | 45 |
if (!header.contains(startTimeColumn)) {
|
| ... | ... | |
| 82 | 95 |
def sectionGroupsToInsert = [:] |
| 83 | 96 |
println "Reading data..." |
| 84 | 97 |
while (reader.readRecord()) {
|
| 85 |
String id = reader.get(idTRSColumn).trim()
|
|
| 98 |
String id = reader.get(joinTRSColumn).trim()
|
|
| 86 | 99 |
if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
|
| 87 | 100 |
if (id.length() == 0) continue; |
| 88 | 101 |
|
| ... | ... | |
| 91 | 104 |
} |
| 92 | 105 |
def section = sectionGroupsToInsert[id] |
| 93 | 106 |
|
| 94 |
|
|
| 95 | 107 |
if (reader.get(startTimeColumn) != null && reader.get(startTimeColumn).length() > 0) { // ignore non timed sections
|
| 96 | 108 |
|
| 97 | 109 |
def m = [:] |
| ... | ... | |
| 100 | 112 |
def data = [] |
| 101 | 113 |
for (def col : todo.value) {
|
| 102 | 114 |
if (reader.get(col).trim().length() > 0) {
|
| 103 |
data << col+": "+reader.get(col).trim().replace("\n", "")
|
|
| 115 |
data << reader.get(col).trim().replace("\n", "")
|
|
| 104 | 116 |
} |
| 105 | 117 |
} |
| 106 | 118 |
m[todo.key] = data.join("\t")
|
| 107 | 119 |
} |
| 108 |
for (def col : metadataColumns) {
|
|
| 109 |
m[AsciiUtils.buildAttributeId(col)] = reader.get(col) |
|
| 120 |
def metadataList = [] |
|
| 121 |
def metadataGroupList = [] |
|
| 122 |
for (int i = 0 ; i < metadataColumns.size() ; i++) {
|
|
| 123 |
def col = metadataColumns[i] |
|
| 124 |
String c = AsciiUtils.buildAttributeId(col) |
|
| 125 |
m[c] = reader.get(col) |
|
| 126 |
metadataList << c |
|
| 127 |
metadataGroupList << metadataColumnsGroups[i] |
|
| 110 | 128 |
} |
| 129 |
m["metadata"] = metadataList.join("|")
|
|
| 130 |
m["metadata_groups"] = metadataGroupList.join("|")
|
|
| 111 | 131 |
|
| 112 | 132 |
m["startTime"] = strTotime(reader.get(startTimeColumn)) |
| 113 | 133 |
m["endTime"] = strTotime(reader.get(endTimeColumn)) |
| ... | ... | |
| 204 | 224 |
} |
| 205 | 225 |
} |
| 206 | 226 |
cpb.done() |
| 227 |
reader.close() |
|
| 207 | 228 |
println "Done." |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 2354) | ||
|---|---|---|
| 85 | 85 |
int wordsPerPage = project.getEditionDefinition("default").getWordsPerPage()
|
| 86 | 86 |
String page_element = project.getEditionDefinition("default").getPageElement()
|
| 87 | 87 |
boolean build_edition = project.getEditionDefinition("default").getBuildEdition()
|
| 88 |
boolean update = project.getDoUpdate() |
|
| 88 | 89 |
|
| 89 | 90 |
File srcDir = new File(rootDir); |
| 90 | 91 |
File binDir = project.getProjectDirectory(); |
| ... | ... | |
| 95 | 96 |
} |
| 96 | 97 |
|
| 97 | 98 |
File txmDir = new File(binDir,"txm/$corpusname"); |
| 98 |
txmDir.deleteDir(); |
|
| 99 |
if (!update) txmDir.deleteDir();
|
|
| 99 | 100 |
txmDir.mkdirs(); |
| 100 | 101 |
|
| 101 | 102 |
//get metadata values from CSV |
| ... | ... | |
| 115 | 116 |
else {
|
| 116 | 117 |
println "no metadata file: "+allMetadataFile |
| 117 | 118 |
} |
| 119 |
|
|
| 120 |
// Collect the "textorder" metadata of each text, keyed by "<text id>.trs".
final HashMap<String, String> textordersInfo = new HashMap<String, String>();
if (metadatas != null) { // other parts of this script treat metadatas as possibly null
	for (String t : metadatas.keySet()) {
		def ti = metadatas.get(t)
		for (org.txm.metadatas.Entry e : ti) {
			if ("textorder".equals(e.getId())) {
				// the value belongs to the entry, not to the list being iterated
				// NOTE(review): assumes Entry exposes getValue() next to getId() - confirm
				textordersInfo[t+".trs"] = e.getValue()
			}
		}
	}
}
|
| 118 | 129 |
|
| 119 | 130 |
File propertyFile = new File(srcDir, "import.properties")//default |
| 120 | 131 |
Properties props = new Properties(); |
| ... | ... | |
| 123 | 134 |
FileInputStream input = new FileInputStream(propertyFile); |
| 124 | 135 |
props.load(input); |
| 125 | 136 |
input.close(); |
| 126 |
|
|
| 137 |
|
|
| 127 | 138 |
if (props.getProperty("removeInterviewer") != null)
|
| 128 | 139 |
removeInterviewer = Boolean.parseBoolean(props.get("removeInterviewer").toString());
|
| 129 | 140 |
if (props.getProperty("ignoreTranscriberMetadata") != null)
|
| ... | ... | |
| 134 | 145 |
csvHeaderNumber = props.get("csvHeaderNumber").toString().split("\\|"); // split() takes a regex: the pipe must be escaped to split on a literal "|"
|
| 135 | 146 |
//if (props.getProperty("includeComments") != null)
|
| 136 | 147 |
// includeComments = props.get("includeComments").toString();
|
| 137 |
|
|
| 148 |
|
|
| 138 | 149 |
println "import properties: " |
| 139 | 150 |
println " removeInterviewer: "+removeInterviewer |
| 140 | 151 |
println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata |
| ... | ... | |
| 145 | 156 |
|
| 146 | 157 |
|
| 147 | 158 |
|
| 148 |
// Apply XSL |
|
| 149 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 150 |
if (MONITOR != null) MONITOR.worked(1, "XSL") |
|
| 151 |
if (xsl != null && xsl.trim().length() > 0) {
|
|
| 152 |
if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"))) |
|
| 153 |
srcDir = new File(binDir, "src"); |
|
| 154 |
println "" |
|
| 155 |
} |
|
| 156 | 159 |
|
| 157 |
try {
|
|
| 158 |
// select only trs files |
|
| 159 |
String ext = "trs"; |
|
| 160 |
ArrayList<File> trsfiles = srcDir.listFiles(); //find all trs files |
|
| 161 |
if (trsfiles == null) {
|
|
| 162 |
println ("No files in "+srcDir.getAbsolutePath())
|
|
| 163 |
return false; |
|
| 164 |
} |
|
| 165 |
for (int i = 0 ; i < trsfiles.size() ; i++) {
|
|
| 166 |
File f = trsfiles.get(i); |
|
| 167 |
if (!f.getName().endsWith(ext) || !f.canRead() || f.isHidden()) {
|
|
| 168 |
trsfiles.remove(i) |
|
| 169 |
i--; |
|
| 170 |
} |
|
| 171 |
} |
|
| 172 | 160 |
|
| 173 |
if (trsfiles.size() == 0) {
|
|
| 174 |
println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
|
|
| 175 |
return false; |
|
| 176 |
} |
|
| 177 |
|
|
| 178 |
if (MONITOR != null) MONITOR.worked(1, "IMPORTER") |
|
| 179 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 180 |
println "-- IMPORTER" |
|
| 181 |
def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir |
|
| 182 |
if (!imp.run()) {
|
|
| 183 |
println "Failed to prepare files - Aborting"; |
|
| 184 |
return; |
|
| 185 |
} |
|
| 186 |
if (MONITOR != null) MONITOR.worked(20) |
|
| 187 |
|
|
| 188 |
println "-- Xml Validation" |
|
| 189 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 190 |
for (File infile : txmDir.listFiles()) {
|
|
| 191 |
if (!ValidateXml.test(infile)) {
|
|
| 192 |
println "$infile : Validation failed"; |
|
| 193 |
infile.delete(); |
|
| 161 |
try {
|
|
| 162 |
if (!update) {
|
|
| 163 |
// Apply XSL |
|
| 164 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 165 |
if (MONITOR != null) MONITOR.worked(1, "XSL") |
|
| 166 |
if (xsl != null && xsl.trim().length() > 0) {
|
|
| 167 |
if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"))) |
|
| 168 |
srcDir = new File(binDir, "src"); |
|
| 169 |
println "" |
|
| 170 |
} |
|
| 171 |
|
|
| 172 |
// select only trs files |
|
| 173 |
String ext = "trs"; |
|
| 174 |
ArrayList<File> trsfiles = srcDir.listFiles(); //find all trs files |
|
| 175 |
if (trsfiles == null) {
|
|
| 176 |
println ("No files in "+srcDir.getAbsolutePath())
|
|
| 177 |
return false; |
|
| 194 | 178 |
} |
| 195 |
} |
|
| 196 |
|
|
| 197 |
if (MONITOR != null) MONITOR.worked(5) |
|
| 198 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 199 |
println "-- Remove interviewer: "+removeInterviewer |
|
| 200 |
if (removeInterviewer) {
|
|
| 201 |
if (metadatas == null) {
|
|
| 202 |
println "Can't remove interviewer without a metadata.csv file defining who are the interviewers." |
|
| 203 |
} else {
|
|
| 204 |
println "Removing some speakers in "+txmDir.listFiles().length+" file(s)" |
|
| 205 |
for (File infile : txmDir.listFiles()) {
|
|
| 206 |
String filename = infile.getName(); |
|
| 207 |
int idx = filename.indexOf(".xml");
|
|
| 208 |
if (idx > 0) |
|
| 209 |
filename = filename.substring(0, idx); |
|
| 210 |
|
|
| 211 |
ArrayList<Pair<String, String>> metas = metadatas.get(filename) |
|
| 212 |
//println "filename=$filename metas= $metas" |
|
| 213 |
for (Pair p : metas) {
|
|
| 214 |
if (p.getFirst().startsWith("enq")) {
|
|
| 215 |
new RemoveSpeaker(infile, infile, p.getFirst()) |
|
| 179 |
for (int i = 0 ; i < trsfiles.size() ; i++) {
|
|
| 180 |
File f = trsfiles.get(i); |
|
| 181 |
if (!f.getName().endsWith(ext) || !f.canRead() || f.isHidden()) {
|
|
| 182 |
trsfiles.remove(i) |
|
| 183 |
i--; |
|
| 184 |
} |
|
| 185 |
} |
|
| 186 |
|
|
| 187 |
if (trsfiles.size() == 0) {
|
|
| 188 |
println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
|
|
| 189 |
return false; |
|
| 190 |
} |
|
| 191 |
|
|
| 192 |
if (MONITOR != null) MONITOR.worked(1, "IMPORTER") |
|
| 193 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 194 |
println "-- IMPORTER" |
|
| 195 |
def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir |
|
| 196 |
if (!imp.run()) {
|
|
| 197 |
println "Failed to prepare files - Aborting"; |
|
| 198 |
return; |
|
| 199 |
} |
|
| 200 |
if (MONITOR != null) MONITOR.worked(20) |
|
| 201 |
|
|
| 202 |
println "-- Xml Validation" |
|
| 203 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 204 |
for (File infile : txmDir.listFiles()) {
|
|
| 205 |
if (!ValidateXml.test(infile)) {
|
|
| 206 |
println "$infile : Validation failed"; |
|
| 207 |
infile.delete(); |
|
| 208 |
} |
|
| 209 |
} |
|
| 210 |
|
|
| 211 |
if (MONITOR != null) MONITOR.worked(5) |
|
| 212 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 213 |
println "-- Remove interviewer: "+removeInterviewer |
|
| 214 |
if (removeInterviewer) {
|
|
| 215 |
if (metadatas == null) {
|
|
| 216 |
println "Can't remove interviewer without a metadata.csv file defining who are the interviewers." |
|
| 217 |
} else {
|
|
| 218 |
println "Removing some speakers in "+txmDir.listFiles().length+" file(s)" |
|
| 219 |
for (File infile : txmDir.listFiles()) {
|
|
| 220 |
String filename = infile.getName(); |
|
| 221 |
int idx = filename.indexOf(".xml");
|
|
| 222 |
if (idx > 0) |
|
| 223 |
filename = filename.substring(0, idx); |
|
| 224 |
|
|
| 225 |
ArrayList<Pair<String, String>> metas = metadatas.get(filename) |
|
| 226 |
//println "filename=$filename metas= $metas" |
|
| 227 |
for (Pair p : metas) {
|
|
| 228 |
if (p.getFirst().startsWith("enq")) {
|
|
| 229 |
new RemoveSpeaker(infile, infile, p.getFirst()) |
|
| 230 |
} |
|
| 216 | 231 |
} |
| 217 |
} |
|
| 232 |
} |
|
| 233 |
} |
|
| 234 |
} |
|
| 235 |
|
|
| 236 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 237 |
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE") |
|
| 238 |
|
|
| 239 |
boolean annotationSuccess = false; |
|
| 240 |
if (annotate) {
|
|
| 241 |
println "-- ANNOTATE - Running NLP tools" |
|
| 242 |
def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
|
|
| 243 |
if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
|
|
| 244 |
annotationSuccess = true; |
|
| 218 | 245 |
} |
| 219 | 246 |
} |
| 220 |
} |
|
| 221 |
|
|
| 247 |
} |
|
| 248 |
trsfiles = new ArrayList<File>(Arrays.asList(txmDir.listFiles())); |
|
| 249 |
|
|
| 250 |
// Order the files to process: by their "textorder" metadata when that property is
// declared (files without a value go last, ties are broken by file order), otherwise
// by plain file order.
if (metadatas != null && metadatas.getPropertyNames().contains("textorder")) {
	Collections.sort(trsfiles, new Comparator<File>() {

		// Returns the "textorder" value recorded for a file, or null if there is none.
		// The map is keyed by "<text id>.trs", so when the exact file name is unknown
		// the lookup is retried with the file extension replaced by ".trs".
		// NOTE(review): assumes the files of txmDir are named "<text id>.<ext>" - confirm
		private String orderOf(File f) {
			String name = f.getName()
			String order = textordersInfo[name]
			if (order == null) {
				int dot = name.lastIndexOf(".")
				if (dot > 0) order = textordersInfo[name.substring(0, dot) + ".trs"]
			}
			return order
		}

		public int compare(File f1, File f2) {
			String o1 = orderOf(f1);
			String o2 = orderOf(f2);
			if (o1 == null && o2 == null) {
				return f1.compareTo(f2);
			} else if (o1 == null) {
				return 1
			} else if (o2 == null) {
				return -1
			} else {
				int c = o1.compareTo(o2); // NOTE(review): string comparison, "10" sorts before "2"
				if (c == 0) return f1.compareTo(f2);
				else return c;
			}
		}
	});
} else {
	Collections.sort(trsfiles);
}
|
| 271 |
|
|
| 222 | 272 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
| 223 |
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE") |
|
| 224 |
|
|
| 225 |
boolean annotationSuccess = false; |
|
| 226 |
if (annotate) {
|
|
| 227 |
println "-- ANNOTATE - Running NLP tools" |
|
| 228 |
def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
|
|
| 229 |
if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
|
|
| 230 |
annotationSuccess = true; |
|
| 231 |
} |
|
| 232 |
} |
|
| 233 |
|
|
| 234 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 235 | 273 |
if (MONITOR != null) MONITOR.worked(25, "COMPILING") |
| 236 | 274 |
println "--COMPILING - Building Search Engine indexes" |
| 237 |
trsfiles = txmDir.listFiles(); |
|
| 238 |
|
|
| 275 |
|
|
| 239 | 276 |
def comp = new compiler() |
| 240 | 277 |
if(debug) comp.setDebug(); |
| 241 | 278 |
comp.removeInterviewers(removeInterviewer); |
| 242 | 279 |
comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata); |
| 243 |
comp.setAnnotationSucces(annotationSuccess) |
|
| 244 | 280 |
if (!comp.run(project, trsfiles, corpusname, "default", binDir)) {
|
| 245 | 281 |
println "Failed to compile files"; |
| 246 | 282 |
return; |
| 247 | 283 |
} |
| 248 |
|
|
| 284 |
|
|
| 249 | 285 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
| 250 |
|
|
| 286 |
|
|
| 251 | 287 |
File htmlDir = new File(binDir,"HTML/$corpusname"); |
| 252 | 288 |
htmlDir.deleteDir() |
| 253 | 289 |
htmlDir.mkdirs(); |
| 254 | 290 |
if (build_edition) {
|
| 255 |
|
|
| 291 |
|
|
| 256 | 292 |
if (MONITOR != null) MONITOR.worked(20, "EDITION") |
| 257 | 293 |
println "-- EDITION - Building editions" |
| 258 |
|
|
| 259 |
List<File> filelist = txmDir.listFiles(); |
|
| 260 |
Collections.sort(filelist); |
|
| 294 |
|
|
| 261 | 295 |
def second = 0 |
| 262 |
|
|
| 263 |
println "Paginating "+filelist.size()+" texts"
|
|
| 264 |
ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
|
|
| 265 |
for (File txmFile : filelist) {
|
|
| 296 |
|
|
| 297 |
println "Paginating "+trsfiles.size()+" texts"
|
|
| 298 |
ConsoleProgressBar cpb = new ConsoleProgressBar(trsfiles.size());
|
|
| 299 |
for (File txmFile : trsfiles) {
|
|
| 266 | 300 |
cpb.tick() |
| 267 | 301 |
String txtname = txmFile.getName(); |
| 268 | 302 |
int i = txtname.lastIndexOf(".");
|
| 269 | 303 |
if(i > 0) txtname = txtname.substring(0, i); |
| 270 |
|
|
| 304 |
|
|
| 271 | 305 |
List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang); |
| 272 | 306 |
List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang); |
| 273 |
|
|
| 274 |
Text t = new Text(project); |
|
| 275 |
t.setName(txtname); |
|
| 276 |
t.setSourceFile(txmFile) |
|
| 277 |
t.setTXMFile(txmFile) |
|
| 278 |
|
|
| 307 |
|
|
| 308 |
// Reuse the Text already registered in the project, or create and initialise it
Text t = project.getText(txtname)
if (t == null) {
	t = new Text(project); // keep the new instance: the setters below would otherwise run on null
	t.setName(txtname);
	t.setSourceFile(txmFile)
	t.setTXMFile(txmFile)
}
|
| 279 | 315 |
def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas); |
| 280 | 316 |
Edition edition = new Edition(t); |
| 281 | 317 |
edition.setName("default");
|
| ... | ... | |
| 288 | 324 |
} |
| 289 | 325 |
} |
| 290 | 326 |
cpb.done() |
| 291 |
|
|
| 327 |
|
|
| 292 | 328 |
//copy transcriber.css |
| 293 | 329 |
File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css") |
| 294 | 330 |
File cssTXMFile = new File(Toolbox.getTxmHomePath(), "css/txm.css") |
| ... | ... | |
| 299 | 335 |
} |
| 300 | 336 |
|
| 301 | 337 |
//copy media files |
| 302 |
println "Copying media files if any (mp3, wav, mp4 or avi) "+filelist.size()+" texts"
|
|
| 303 |
cpb = new ConsoleProgressBar(filelist.size());
|
|
| 304 |
for (File txmFile : filelist) {
|
|
| 338 |
println "Copying media files if any (mp3, wav, mp4 or avi) "+trsfiles.size()+" texts"
|
|
| 339 |
cpb = new ConsoleProgressBar(trsfiles.size());
|
|
| 340 |
for (File txmFile : trsfiles) {
|
|
| 305 | 341 |
cpb.tick() |
| 306 | 342 |
String txtname = txmFile.getName(); |
| 307 | 343 |
int i = txtname.lastIndexOf(".");
|
| ... | ... | |
| 310 | 346 |
if (!mediaFile.exists()) mediaFile = new File(project.getSrcdir(), txtname + ".wav") |
| 311 | 347 |
if (!mediaFile.exists()) mediaFile = new File(project.getSrcdir(), txtname + ".mp4") |
| 312 | 348 |
if (!mediaFile.exists()) mediaFile = new File(project.getSrcdir(), txtname + ".avi") |
| 313 |
|
|
| 349 |
|
|
| 314 | 350 |
if (mediaFile.exists()) {
|
| 315 | 351 |
File copy = new File(binDir, "media/"+mediaFile.getName()) |
| 316 | 352 |
copy.getParentFile().mkdirs() |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 2354) | ||
|---|---|---|
| 73 | 73 |
/** The debug. */ |
| 74 | 74 |
boolean debug = false; |
| 75 | 75 |
|
| 76 |
/** The annotation succes. */ |
|
| 77 |
boolean annotationSucces = false; |
|
| 78 |
|
|
| 79 | 76 |
/** The removeinterviewers. */ |
| 80 | 77 |
boolean removeinterviewers = false; |
| 81 | 78 |
|
| ... | ... | |
| 162 | 159 |
|
| 163 | 160 |
println("Compiling "+xmlfiles.size()+" files")
|
| 164 | 161 |
ConsoleProgressBar cpb = new ConsoleProgressBar(xmlfiles.size()) |
| 165 |
for (File f :xmlfiles) {
|
|
| 166 |
if (f.exists()) {
|
|
| 162 |
for (File txmFile :xmlfiles) {
|
|
| 163 |
if (txmFile.exists()) {
|
|
| 167 | 164 |
cpb.tick() |
| 168 |
if (!process(f)) {
|
|
| 169 |
println("Failed to compile "+f)
|
|
| 165 |
if (!process(txmFile)) {
|
|
| 166 |
println("Failed to compile "+txmFile)
|
|
| 170 | 167 |
} |
| 171 | 168 |
} |
| 172 | 169 |
} |
| ... | ... | |
| 680 | 677 |
debug = true; |
| 681 | 678 |
} |
| 682 | 679 |
|
| 683 |
/** |
|
| 684 |
* Sets the annotation success. |
|
| 685 |
* |
|
| 686 |
* @param val the new annotation success |
|
| 687 |
*/ |
|
| 688 |
public void setAnnotationSuccess(boolean val) {
|
|
| 689 |
annotationSucces = val |
|
| 690 |
} |
|
| 691 | 680 |
} |
Formats disponibles : Unified diff