Révision 2999
| tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 2999) | ||
|---|---|---|
| 37 | 37 |
|
| 38 | 38 |
boolean flagWord = false |
| 39 | 39 |
def winfos = [:] |
| 40 |
def turninfos = [:] |
|
| 40 | 41 |
boolean other = false; |
| 41 | 42 |
String word = "" |
| 42 | 43 |
try {
|
| ... | ... | |
| 92 | 93 |
|
| 93 | 94 |
case "SpeechSegment": // <SpeechSegment ch="1" sconf="1.00" stime="9.94" etime="43.81" spkid="Enquêtrice" lang="fre" lconf="1.00" trs="1"> |
| 94 | 95 |
writer.writeStartElement("Turn")
|
| 95 |
writer.writeAttribute("speaker", parser.getAttributeValue(null, "spkid"))
|
|
| 96 |
writer.writeAttribute("startTime", parser.getAttributeValue(null, "stime"))
|
|
| 97 |
writer.writeAttribute("endTime", parser.getAttributeValue(null, "etime"))
|
|
| 96 |
|
|
| 97 |
turninfos = ["speaker":parser.getAttributeValue(null, "spkid"), |
|
| 98 |
"startTime":parser.getAttributeValue(null, "stime"), |
|
| 99 |
"endTime":parser.getAttributeValue(null, "etime"), |
|
| 100 |
] |
|
| 101 |
for (String attr : turninfos.keySet()) {
|
|
| 102 |
writer.writeAttribute(attr, turninfos[attr]) |
|
| 103 |
} |
|
| 104 |
|
|
| 98 | 105 |
writer.writeCharacters("\n")
|
| 99 | 106 |
writer.writeStartElement("Sync")
|
| 100 | 107 |
writer.writeAttribute("time", parser.getAttributeValue(null, "stime"))
|
| ... | ... | |
| 148 | 155 |
flagWord = false |
| 149 | 156 |
word = word.trim() |
| 150 | 157 |
if (word.startsWith("*")) {
|
| 158 |
//close the current Turn and start an 'other' Turn |
|
| 159 |
writer.writeEndElement() // current Turn |
|
| 160 |
writer.writeStartElement("Turn")
|
|
| 161 |
writer.writeAttribute("speaker", "other")
|
|
| 162 |
writer.writeAttribute("startTime", winfos["time"])
|
|
| 163 |
writer.writeAttribute("orig-speaker", turninfos["speaker"])
|
|
| 164 |
writer.writeCharacters("\n")
|
|
| 151 | 165 |
other = true |
| 152 | 166 |
word = word.substring(1) |
| 153 | 167 |
} |
| ... | ... | |
| 155 | 169 |
String otherAttributeValue = Boolean.toString(other) // set now |
| 156 | 170 |
|
| 157 | 171 |
if (other && word.endsWith("*")) {
|
| 172 |
|
|
| 173 |
//close the current 'other' Turn and restart the actual Turn |
|
| 174 |
writer.writeEndElement() // current 'other' Turn |
|
| 175 |
|
|
| 176 |
writer.writeStartElement("Turn") // rebuild the orig Turn with its infos
|
|
| 177 |
turninfos["startTime"] = winfos["end"] // fix the startTime using the current word end time |
|
| 178 |
for (String attr : turninfos.keySet()) {
|
|
| 179 |
writer.writeAttribute(attr, turninfos[attr]) |
|
| 180 |
} |
|
| 181 |
writer.writeCharacters("\n")
|
|
| 182 |
|
|
| 158 | 183 |
word = word.substring(0, word.length()-1) |
| 159 | 184 |
other = false |
| 160 | 185 |
} |
| ... | ... | |
| 178 | 203 |
|
| 179 | 204 |
for (def punct : puncts) { // pre-retokenize if any
|
| 180 | 205 |
writer.writeStartElement("w")
|
| 181 |
writer.writeAttribute("time", winfos["time"])
|
|
| 182 |
writer.writeAttribute("start", winfos["start"])
|
|
| 183 |
writer.writeAttribute("end", winfos["start"])
|
|
| 206 |
for (String attr : winfos.keySet()) {
|
|
| 207 |
writer.writeAttribute(attr, winfos[attr])
|
|
| 208 |
}
|
|
| 184 | 209 |
writer.writeCharacters(punct) |
| 185 | 210 |
writer.writeEndElement() // w |
| 186 | 211 |
writer.writeCharacters("\n")
|
| 187 |
|
|
| 188 |
|
|
| 189 | 212 |
} |
| 190 | 213 |
|
| 191 | 214 |
puncts = [] |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2999) | ||
|---|---|---|
| 35 | 35 |
import org.txm.importer.ApplyXsl2 |
| 36 | 36 |
import org.txm.metadatas.MetadataGroup |
| 37 | 37 |
import org.txm.metadatas.Metadatas |
| 38 |
import org.txm.utils.TimeFormatter |
|
| 38 | 39 |
import org.txm.utils.io.FileCopy; |
| 39 | 40 |
|
| 40 | 41 |
|
| ... | ... | |
| 55 | 56 |
List<String> NoSpaceAfter; |
| 56 | 57 |
|
| 57 | 58 |
/** The pages. */ |
| 58 |
def pages = [];
|
|
| 59 |
def indexes = [];
|
|
| 59 |
def pages = [] |
|
| 60 |
def indexes = [] |
|
| 60 | 61 |
|
| 61 | 62 |
/** The wordcount. */ |
| 62 |
int wordcount = 0;
|
|
| 63 |
int wordcount = 0 |
|
| 63 | 64 |
|
| 64 | 65 |
/** The pagecount. */ |
| 65 |
int pagecount = 0;
|
|
| 66 |
int pagecount = 0 |
|
| 66 | 67 |
|
| 67 | 68 |
/** The wordmax. */ |
| 68 |
int wordmax = 10;
|
|
| 69 |
int wordmax = 10 |
|
| 69 | 70 |
|
| 70 | 71 |
/** The wordid. */ |
| 71 | 72 |
String wordid; |
| 72 | 73 |
|
| 73 | 74 |
/** The first word. */ |
| 74 |
boolean firstWord = true;
|
|
| 75 |
boolean firstWord = true |
|
| 75 | 76 |
|
| 76 | 77 |
/** The wordvalue. */ |
| 77 | 78 |
String wordvalue; |
| ... | ... | |
| 113 | 114 |
|
| 114 | 115 |
File outfile; |
| 115 | 116 |
|
| 116 |
String corpusname ="";
|
|
| 117 |
String corpusname ="" |
|
| 117 | 118 |
String cuttingTag = "pb" |
| 118 |
String txtname;
|
|
| 119 |
File htmlDir;
|
|
| 120 |
File defaultDir;
|
|
| 121 |
Metadatas metadatas;
|
|
| 119 |
String txtname |
|
| 120 |
File htmlDir |
|
| 121 |
File defaultDir |
|
| 122 |
Metadatas metadatas |
|
| 122 | 123 |
|
| 123 |
def interviewers = [];
|
|
| 124 |
def interviewers = null
|
|
| 124 | 125 |
def eventTranslations = ["^^":"mot inconnu", "?":"orthographe incertaine", |
| 125 | 126 |
"()":"rupture de syntaxe", "b":"bruit indéterminé", |
| 126 | 127 |
"*":"mot corrigé", |
| ... | ... | |
| 133 | 134 |
"pif":"inaudible", "r":"respiration", |
| 134 | 135 |
"rire":"rire du locuteur", "shh":"soufle électrique", |
| 135 | 136 |
"sif":"sifflement du locuteur", "tx":"toux"]; |
| 136 |
String currentTime = ""; |
|
| 137 |
boolean bold = false; |
|
| 138 |
int writenLength = 0; |
|
| 139 |
boolean spokenTurn = false; |
|
| 140 |
boolean firstSync = false; |
|
| 141 |
boolean firstWho = false; |
|
| 137 |
String currentUTime = "" |
|
| 138 |
String startTimeSp = "" |
|
| 139 |
String endTimeSp = "" |
|
| 140 |
String startTimeU = "0" |
|
| 141 |
String previousStartTimeU = "0" |
|
| 142 |
boolean bold = false |
|
| 143 |
int writenLength = 0 |
|
| 144 |
boolean spokenTurn = false |
|
| 145 |
boolean firstSync = false |
|
| 146 |
boolean firstWho = false |
|
| 147 |
int nSilence = 0 |
|
| 142 | 148 |
/** |
| 143 | 149 |
* Instantiates a new pager. |
| 144 | 150 |
* |
| ... | ... | |
| 307 | 313 |
writer.writeEndElement(); // td |
| 308 | 314 |
} |
| 309 | 315 |
//get enqueteur to style their names |
| 310 |
if (name.startsWith("enq")) {
|
|
| 311 |
interviewers.add(value)
|
|
| 316 |
if (name.equals("out-of-text-to-edit-locutor")) {
|
|
| 317 |
interviewers = /$value/
|
|
| 312 | 318 |
} |
| 313 | 319 |
writer.writeEndElement(); // tr |
| 314 | 320 |
} |
| ... | ... | |
| 493 | 499 |
firstWho = true; |
| 494 | 500 |
spokenTurn = false; |
| 495 | 501 |
overlapping = false |
| 502 |
nSilence = 0 // will count the number of silences written to avoid writing [silence] at the sp end |
|
| 496 | 503 |
|
| 497 | 504 |
writer.writeStartElement("p");
|
| 498 | 505 |
writer.writeAttribute("class", "turn");
|
| 506 |
writer.writeCharacters("\n");
|
|
| 499 | 507 |
|
| 508 |
this.startTimeSp = parser.getAttributeValue(null, "start") |
|
| 509 |
this.endTimeSp = parser.getAttributeValue(null, "end") |
|
| 510 |
|
|
| 500 | 511 |
overlapping = ("true" == parser.getAttributeValue(null,"overlap"))
|
| 501 | 512 |
String spid = parser.getAttributeValue(null, "who"); |
| 502 | 513 |
|
| ... | ... | |
| 512 | 523 |
break; |
| 513 | 524 |
case "u": |
| 514 | 525 |
writer.writeCharacters("\n");
|
| 515 |
this.currentTime = parser.getAttributeValue(null,"time"); |
|
| 526 |
this.previousStartTimeU = this.startTimeU |
|
| 527 |
this.startTimeU = parser.getAttributeValue(null, "start"); |
|
| 528 |
this.currentUTime = parser.getAttributeValue(null, "time"); |
|
| 516 | 529 |
|
| 517 | 530 |
if (previousElem == "u" && writenLength == 0) { // if previous u had no words, it was a silence
|
| 531 |
def duration = "" |
|
| 532 |
try {
|
|
| 533 |
def d = Float.parseFloat(this.startTimeU) - Float.parseFloat(this.previousStartTimeU) |
|
| 534 |
duration = " "+TimeFormatter.formatTime(d); |
|
| 535 |
} catch(Exception e) { e.printStackTrace()}
|
|
| 518 | 536 |
writer.writeStartElement("span");
|
| 519 | 537 |
writer.writeAttribute("class", "event");
|
| 520 |
writer.writeCharacters("[silence]");
|
|
| 538 |
writer.writeCharacters("[silence$duration]");
|
|
| 521 | 539 |
writer.writeEndElement(); // span |
| 522 |
writer.writeEmptyElement("br");
|
|
| 540 |
nSilence++ |
|
| 541 |
//writer.writeEmptyElement("br");
|
|
| 523 | 542 |
} |
| 524 | 543 |
|
| 525 | 544 |
String spk = parser.getAttributeValue(null, "who") |
| ... | ... | |
| 534 | 553 |
previousSPK = spk |
| 535 | 554 |
if (overlapping) previousSPK = null |
| 536 | 555 |
|
| 556 |
writenLength = 0; |
|
| 537 | 557 |
// writenLength = 0; |
| 538 | 558 |
/*writer.writeStartElement("span");
|
| 539 | 559 |
writer.writeAttribute("class", "sync");
|
| ... | ... | |
| 652 | 672 |
case "sp": |
| 653 | 673 |
//println "CLOSING: "+parser.getLocalName() |
| 654 | 674 |
endBoldIfNeeded() |
| 655 |
if (!spokenTurn) {
|
|
| 675 |
if (!spokenTurn && nSilence == 0) {
|
|
| 656 | 676 |
writer.writeStartElement("span");
|
| 657 | 677 |
writer.writeAttribute("class", "event");
|
| 658 |
writer.writeCharacters("[silence]");
|
|
| 678 |
String duration = "" |
|
| 679 |
try {
|
|
| 680 |
def d = Float.parseFloat(endTimeSp)-Float.parseFloat(startTimeSp) |
|
| 681 |
duration = " "+TimeFormatter.formatTime(d); |
|
| 682 |
} catch (Exception e) {e.printStackTrace()}
|
|
| 683 |
writer.writeCharacters("[silence$duration]");
|
|
| 659 | 684 |
writer.writeEndElement(); |
| 660 | 685 |
writer.writeEmptyElement("br");
|
| 661 | 686 |
} |
| ... | ... | |
| 680 | 705 |
//writer.writeEndElement() // span@class=u |
| 681 | 706 |
//writer.writeEmptyElement("br");
|
| 682 | 707 |
//if (overlapping) writer.writeEndElement(); // b |
| 708 |
writer.writeCharacters("\n");
|
|
| 683 | 709 |
break; |
| 684 | 710 |
case "event": |
| 685 | 711 |
break; |
| ... | ... | |
| 694 | 720 |
spokenTurn = true; |
| 695 | 721 |
int l = lastword.length(); |
| 696 | 722 |
String endOfLastWord = ""; |
| 697 |
if(l > 0)
|
|
| 723 |
if (l > 0) {
|
|
| 698 | 724 |
endOfLastWord = lastword.subSequence(l-1, l); |
| 699 |
|
|
| 725 |
} |
|
| 700 | 726 |
if (interpvalue != null) {
|
| 701 | 727 |
interpvalue = interpvalue.replace("\"",""");
|
| 702 | 728 |
} |
| ... | ... | |
| 726 | 752 |
|
| 727 | 753 |
if (interpvalue.contains("rapp1")) {
|
| 728 | 754 |
writer.writeCharacters(" «");
|
| 729 |
} else if (wordvalue == "\"") {
|
|
| 730 |
// don't write this char |
|
| 731 |
} else {
|
|
| 732 |
writer.writeStartElement("span");
|
|
| 733 |
writer.writeAttribute("class", "word");
|
|
| 734 |
writer.writeAttribute("title", interpvalue);
|
|
| 735 |
writer.writeAttribute("id", wordid);
|
|
| 736 |
writer.writeCharacters(wordvalue); |
|
| 737 |
writer.writeEndElement(); |
|
| 738 |
} |
|
| 755 |
} |
|
| 756 |
|
|
| 757 |
writer.writeStartElement("span");
|
|
| 758 |
writer.writeAttribute("class", "word");
|
|
| 759 |
writer.writeAttribute("title", interpvalue);
|
|
| 760 |
writer.writeAttribute("id", wordid);
|
|
| 761 |
writer.writeCharacters(wordvalue); |
|
| 762 |
writer.writeEndElement(); |
|
| 763 |
|
|
| 739 | 764 |
if (interpvalue.contains("orth")) {
|
| 740 | 765 |
writer.writeStartElement("span");
|
| 741 | 766 |
writer.writeAttribute("class", "event");
|
| ... | ... | |
| 748 | 773 |
writer.writeCharacters("_[!]");
|
| 749 | 774 |
writer.writeEndElement(); |
| 750 | 775 |
} |
| 751 |
|
|
| 752 | 776 |
if (interpvalue.contains("rapp2")) {
|
| 753 |
writer.writeCharacters(" » ");
|
|
| 777 |
writer.writeCharacters("» ");
|
|
| 754 | 778 |
} |
| 755 | 779 |
|
| 756 | 780 |
lastword=wordvalue; |
| ... | ... | |
| 817 | 841 |
private void writeCurrentTime() {
|
| 818 | 842 |
writer.writeStartElement("span");
|
| 819 | 843 |
writer.writeAttribute("class", "sync");
|
| 820 |
writer.writeCharacters(currentTime); |
|
| 844 |
writer.writeCharacters(currentUTime);
|
|
| 821 | 845 |
|
| 822 |
writeMediaAccess(currentTime) |
|
| 846 |
writeMediaAccess(currentUTime)
|
|
| 823 | 847 |
|
| 824 | 848 |
writer.writeEndElement() // span |
| 825 | 849 |
} |
| ... | ... | |
| 838 | 862 |
|
| 839 | 863 |
writer.writeStartElement("span");
|
| 840 | 864 |
writer.writeAttribute("class", "spk");
|
| 841 |
if(interviewers.contains(spk)) {
|
|
| 842 |
bold = true; |
|
| 843 |
} else {
|
|
| 844 |
bold = false; |
|
| 845 |
} |
|
| 865 |
bold = interviewers != null && interviewers.matches(spk) |
|
| 846 | 866 |
spk = spk.replaceAll('^([^0-9]*)([0-9]+)$', '$1 $2');
|
| 847 | 867 |
if (overlapping) {
|
| 848 | 868 |
writer.writeCharacters("// ")
|
| ... | ... | |
| 876 | 896 |
} |
| 877 | 897 |
} |
| 878 | 898 |
|
| 879 |
// private String formatTime(float time, boolean doshort) |
|
| 880 |
// {
|
|
| 881 |
// String rez = " "; |
|
| 882 |
// // if(time >= 3600) // >= 1h |
|
| 883 |
// // {
|
|
| 884 |
// float h = time / 3600; |
|
| 885 |
// time = time%3600; |
|
| 886 |
// float min = (time%3600) / 60; |
|
| 887 |
// int sec = (int)time%60; |
|
| 888 |
// |
|
| 889 |
// if(min < 10) |
|
| 890 |
// rez = ""+(int)h+":0"+(int)min;//+":"+time%60; |
|
| 891 |
// else |
|
| 892 |
// rez = ""+(int)h+":"+(int)min;//+":"+time%60; |
|
| 893 |
// //if (!doshort) |
|
| 894 |
// if (sec > 9) |
|
| 895 |
// rez += ":"+(int)time%60; |
|
| 896 |
// else |
|
| 897 |
// rez += ":0"+(int)time%60; |
|
| 898 |
// // } |
|
| 899 |
// // else if(time >= 60) // >= 1min |
|
| 900 |
// // {
|
|
| 901 |
// // int min = time/60; |
|
| 902 |
// // if(min < 10) |
|
| 903 |
// // rez = "00:0"+min;//+":"+time%60; |
|
| 904 |
// // else |
|
| 905 |
// // rez = "00:"+min;//+":"+time%60; |
|
| 906 |
// // if(!doshort) |
|
| 907 |
// // rez += ":"+(int)time%60; |
|
| 908 |
// // } |
|
| 909 |
// // else // < 60 |
|
| 910 |
// // {
|
|
| 911 |
// // if(time < 10) |
|
| 912 |
// // return " 0:0"+time; |
|
| 913 |
// // else |
|
| 914 |
// // return " 0:"+time; |
|
| 915 |
// // } |
|
| 916 |
// return rez; |
|
| 917 |
// } |
|
| 918 |
|
|
| 919 | 899 |
/** |
| 920 | 900 |
* Gets the page files. |
| 921 | 901 |
* |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 2999) | ||
|---|---|---|
| 225 | 225 |
ArrayList<Pair<String, String>> metas = metadatas.get(filename) |
| 226 | 226 |
//println "filename=$filename metas= $metas" |
| 227 | 227 |
for (Pair p : metas) {
|
| 228 |
if (p.getFirst().startsWith("enq")) {
|
|
| 229 |
new RemoveSpeaker(infile, infile, p.getFirst())
|
|
| 228 |
if (p.getFirst().startsWith("out-of-text-to-edit-locutor")) {
|
|
| 229 |
new RemoveSpeaker(infile, infile, p.getSecond())
|
|
| 230 | 230 |
} |
| 231 | 231 |
} |
| 232 | 232 |
} |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/RemoveSpeaker.groovy (revision 2999) | ||
|---|---|---|
| 31 | 31 |
import org.txm.scripts.importer.*; |
| 32 | 32 |
import org.txm.scripts.importer.graal.PersonalNamespaceContext |
| 33 | 33 |
import org.txm.utils.*; |
| 34 |
import org.txm.utils.xml.DomUtils |
|
| 34 | 35 |
import org.txm.metadatas.*; |
| 35 | 36 |
|
| 36 | 37 |
import java.io.File; |
| ... | ... | |
| 65 | 66 |
* @param outfile the outfile |
| 66 | 67 |
* @param id the id |
| 67 | 68 |
*/ |
| 68 |
public RemoveSpeaker(File transcriptionfile, File outfile, String id) |
|
| 69 |
{
|
|
| 69 |
public RemoveSpeaker(File transcriptionfile, File outfile, String idRegex) {
|
|
| 70 | 70 |
System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");
|
| 71 | 71 |
|
| 72 | 72 |
this.outfile = outfile; |
| 73 |
String xpathString = "//tei:u[@spk='"+id+"']";
|
|
| 73 |
// String xpathString = "//u";
|
|
| 74 | 74 |
//println "removing $xpathString in $transcriptionfile" |
| 75 |
DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); |
|
| 76 |
//println "domFactory: $domFactory" |
|
| 77 |
domFactory.setNamespaceAware(true); // never forget this! |
|
| 78 |
domFactory.setXIncludeAware(true); |
|
| 79 |
DocumentBuilder builder = domFactory.newDocumentBuilder(); |
|
| 75 |
// DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
|
|
| 76 |
// //println "domFactory: $domFactory"
|
|
| 77 |
// domFactory.setNamespaceAware(true); // never forget this!
|
|
| 78 |
// domFactory.setXIncludeAware(true);
|
|
| 79 |
// DocumentBuilder builder = domFactory.newDocumentBuilder();
|
|
| 80 | 80 |
//println "builder $builder" |
| 81 |
doc = builder.parse(transcriptionfile); |
|
| 81 |
//doc = builder.parse(transcriptionfile);
|
|
| 82 | 82 |
//println "doc $doc" |
| 83 |
doc = DomUtils.load(transcriptionfile) |
|
| 84 |
//def xpath = XPathFactory.newInstance().newXPath() |
|
| 85 |
// xpath.setNamespaceContext(new PersonalNamespaceContext()); |
|
| 86 |
//def expr = xpath.compile(xpathString); |
|
| 87 |
def nodes = doc.getDocumentElement().getElementsByTagName("u")
|
|
| 83 | 88 |
|
| 84 |
def xpath = XPathFactory.newInstance().newXPath() |
|
| 85 |
xpath.setNamespaceContext(new PersonalNamespaceContext()); |
|
| 86 |
def expr = xpath.compile(xpathString); |
|
| 87 |
def nodes = expr.evaluate(doc, XPathConstants.NODESET); |
|
| 89 |
def reg = /$idRegex/ |
|
| 88 | 90 |
|
| 89 | 91 |
for (def node : nodes) {
|
| 90 |
//println "remove node "+node |
|
| 91 |
Element elem = (Element)node; |
|
| 92 |
elem.getParentNode().removeChild(node); |
|
| 92 |
|
|
| 93 |
//Element elem = (Element)node; |
|
| 94 |
|
|
| 95 |
String who = node.getAttribute("who") // [@who='"+idRegex+"']
|
|
| 96 |
if (reg.matches(who)) {
|
|
| 97 |
node.getParentNode().removeChild(node); |
|
| 98 |
} |
|
| 93 | 99 |
} |
| 94 | 100 |
save() |
| 95 | 101 |
} |
| ... | ... | |
| 99 | 105 |
* |
| 100 | 106 |
* @return true, if successful |
| 101 | 107 |
*/ |
| 102 |
private boolean save() |
|
| 103 |
{
|
|
| 108 |
private boolean save() {
|
|
| 104 | 109 |
try {
|
| 105 | 110 |
// Création de la source DOM |
| 106 | 111 |
Source source = new DOMSource(doc); |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/TranscriberTokenizer.groovy (revision 2999) | ||
|---|---|---|
| 70 | 70 |
* @param infile the infile |
| 71 | 71 |
* @param outfile the outfile |
| 72 | 72 |
*/ |
| 73 |
public TranscriberTokenizer(File infile, File outfile, String lang) |
|
| 74 |
{
|
|
| 73 |
public TranscriberTokenizer(File infile, File outfile, String lang) {
|
|
| 75 | 74 |
super(infile, outfile, lang) |
| 76 | 75 |
txtname = infile.getName(); |
| 77 | 76 |
int idx = txtname.lastIndexOf(".")
|
| ... | ... | |
| 96 | 95 |
audio = "present" |
| 97 | 96 |
notation = s; |
| 98 | 97 |
event = ""; |
| 99 |
|
|
| 100 |
if (s.startsWith("\"")) {
|
|
| 101 |
rapp = true; |
|
| 102 |
event += "#rapp1"; |
|
| 103 |
} else if(s.endsWith("\"")) {
|
|
| 104 |
rapp = false; |
|
| 105 |
event += "#rapp2"; |
|
| 106 |
} |
|
| 107 | 98 |
|
| 99 |
//TODO does not work (eg ' "word" '). This step should be done after the tokenizer step is done |
|
| 100 |
// if (s.startsWith("\"") && s.endsWith("\"")) {
|
|
| 101 |
// // not rapp1 or rapp2 |
|
| 102 |
// } else if (s.startsWith("\"")) {
|
|
| 103 |
// rapp = true; |
|
| 104 |
// event += "#rapp1"; |
|
| 105 |
// } else if(s.endsWith("\"")) {
|
|
| 106 |
// rapp = false; |
|
| 107 |
// event += "#rapp2"; |
|
| 108 |
// } |
|
| 109 |
|
|
| 108 | 110 |
//test events |
| 109 | 111 |
if (s.startsWith("^^")) {
|
| 110 | 112 |
event += "#orth"; |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 2999) | ||
|---|---|---|
| 90 | 90 |
|
| 91 | 91 |
|
| 92 | 92 |
/** The interviewers. */ |
| 93 |
ArrayList<String> interviewers = [];
|
|
| 93 |
def interviewers = null
|
|
| 94 | 94 |
static HashSet<String> sectionAttrs; |
| 95 | 95 |
|
| 96 | 96 |
/** The anatypes. */ |
| ... | ... | |
| 481 | 481 |
break; |
| 482 | 482 |
case "w": |
| 483 | 483 |
// concat spk id and ref |
| 484 |
String isEnq = (interviewers.contains(u_name))?"*":"";
|
|
| 484 |
String isEnq = (interviewers != null && interviewers.matches(u_name))?"*":"";
|
|
| 485 | 485 |
String ref = (u_name+", "+formatedTime+""+isEnq) |
| 486 | 486 |
if (ADD_TEXTID_TO_REF) ref = textid+", "+ref |
| 487 | 487 |
vForm +="\t"+u_name+"\t"+ref |
| ... | ... | |
| 508 | 508 |
} |
| 509 | 509 |
} |
| 510 | 510 |
|
| 511 |
|
|
| 512 | 511 |
vForm = vForm.replaceAll("\n", "").replaceAll("&", "&").replaceAll("<", "<");
|
| 513 | 512 |
|
| 514 | 513 |
if (removeinterviewers) {
|
| 515 |
if (!interviewers.contains(u_name))
|
|
| 514 |
if (!interviewers.matches(u_name))
|
|
| 516 | 515 |
output.write(vForm+"\t"+wordid+vAna+"\n"); |
| 517 | 516 |
} else {
|
| 518 | 517 |
output.write(vForm+"\t"+wordid+vAna+"\n"); |
| ... | ... | |
| 660 | 659 |
|
| 661 | 660 |
for (int i = 0 ; i < parser.getAttributeCount() ; i ++) {
|
| 662 | 661 |
list.add(new Pair(parser.getAttributeLocalName(i).replace("_","").toLowerCase(), parser.getAttributeValue(i)));
|
| 663 |
if (parser.getAttributeLocalName(i).startsWith("enq"))
|
|
| 664 |
interviewers.add(parser.getAttributeValue(i));
|
|
| 662 |
if (parser.getAttributeLocalName(i).equals("out-of-text-to-edit-locutor"))
|
|
| 663 |
interviewers = /${parser.getAttributeValue(i)}/;
|
|
| 665 | 664 |
} |
| 666 | 665 |
return |
| 667 | 666 |
case "Topic": |
Formats disponibles : Unified diff