Revision 3003
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 3003)

@@ -187,6 +187,7 @@
         File tfile = new File(tokenizedDir, pfile.getName())
         try {
             TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
+            tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
             if (!tokenizer.process()) {
                 println("Failed to tokenize "+pfile)
             }
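Note on the added line: it threads the new `doRetokenizeStep` parameter through to the Transcriber tokenizer. A minimal sketch of the flag-reading idiom in plain Groovy (`parseFlag` is a hypothetical stand-in for `project.getTokenizerParameter(...)`, not TXM API); writing the constant on the left makes the comparison null-safe, so a missing parameter yields false instead of a NullPointerException:

    // Hypothetical helper illustrating the '"true" == value' idiom only.
    def parseFlag = { String value -> "true" == value }

    assert parseFlag("true")    // parameter explicitly enabled
    assert !parseFlag("false")  // explicitly disabled
    assert !parseFlag(null)     // parameter absent: null-safe, evaluates to false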
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/TranscriberTokenizer.groovy (revision 3003)

@@ -108,11 +108,11 @@
         // }

         //test events
-        if (s.startsWith("^^")) {
+        if (s.startsWith("^^") && s.length() > 2) {
             event += "#orth";
             s = s.substring(2);
         }
-        if (s.startsWith("*")) {
+        if (s.startsWith("*") && s.length() > 1) {
             event += "#corr";
             s = s.substring(1);
         }
@@ -121,7 +121,7 @@
         event = event.substring(1);//remove '|'

         // test audio
-        if (s.contains("(") || s.contains(")")) // contains ( or )
+        if ((s.contains("(") || s.contains(")")) && s.length() > 2) // contains ( or )
         {
             if (s.contains("(") ^ s.contains(")")) // ert(ert XOR ert)ert
             {
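Note on the added length guards: in Java and Groovy `"^^".substring(2)` returns an empty string rather than throwing, so without the checks a bare marker would be turned into an empty word token. A quick check of that behaviour, assuming the markers are exactly the tested prefixes:

    String s = "^^"                                  // a bare event marker with no word after it
    assert s.substring(2) == ""                      // old code: an empty token form would be emitted
    assert !(s.startsWith("^^") && s.length() > 2)   // new guard: the bare marker is skipped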
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3003)

@@ -331,6 +331,7 @@
         new File(module.getBinaryDirectory(),"tokenized").deleteDir()
         new File(module.getBinaryDirectory(),"tokenized").mkdir()

+        boolean retokenize = "true" == module.getProject().getTokenizerParameter("doRetokenizeStep", "false")
         String outSideTextTagsRegex = "";
         String outSideTextTagsAndKeepContentRegex = "";
         String noteRegex = "";
@@ -380,7 +381,7 @@
             File infile = f;
             File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
             SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
-
+            tokenizer.setRetokenize(retokenize)
             if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
                 tokenizer.setOutSideTextTags(outSideTextTagsRegex)
             }
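Note on the first hunk: the `tokenized` working directory is reset before each import run. `deleteDir()` is the Groovy GDK recursive delete on `java.io.File` and does not complain when the directory is absent. A standalone sketch using a temp directory instead of `module.getBinaryDirectory()`:

    // Standalone sketch of the reset idiom; the real code targets the module's binary directory.
    File tok = new File(System.getProperty("java.io.tmpdir"), "tokenized-demo")
    tok.deleteDir()    // recursive delete, safe when the directory does not exist
    assert tok.mkdir() // every run starts from an empty directory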
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3003)

@@ -51,71 +51,74 @@

 import javax.xml.stream.*

-import org.txm.scripts.importer.graal.PersonalNamespaceContext
+import org.txm.importer.PersonalNamespaceContext
 import org.txm.tokenizer.TokenizerClasses

 @CompileStatic
 public class SimpleTokenizerXml {

+    boolean retokenize = false
+    LinkedHashMap<String, String> retokenizedWordProperties = new LinkedHashMap()
+
     /** The word_tags. */
     String word_tags = TokenizerClasses.word_tags;
     String word_element_to_create = TokenizerClasses.word_element_to_create;
     Pattern reg_word_tags;
     /** The intraword_tags. */
     String intraword_tags = TokenizerClasses.intraword_tags;

     /** The punct_strong. */
     String punct_strong = TokenizerClasses.punct_strong;

     /** The punct_all. */
     String punct_all = TokenizerClasses.punct_all;

     /** The word_chars. */
     String word_chars = TokenizerClasses.word_chars;

     String fclitics = null; // default behavior don't manage clitics
     String pclitics = null; // default behavior don't manage clitics

     /** The outside_text_tags_ignore_content. */
     String note_content = null;
     String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
     String outside_text_tags = null // tag and content removed
     String startTag = null
     Pattern reg_note_content;
     Pattern reg_outside_text_tags_keep_content;
     Pattern reg_outside_text_tags;
     Pattern reg_startTag;

     Pattern reg_punct_other = Pattern.compile("\\p{P}")

     /** The DEBUG. */
     public boolean DEBUG = false;

     /** The outfile. */
     File outfile;

     /** The infile. */
     File infile;

     String lang;

     /** The buffer. */
     StringBuffer buffer;

     /** The writer. */
     XMLStreamWriter writer;
     BufferedOutputStream output;

     /** The parser. */
     XMLStreamReader parser

     /** The localname. */
     String localname;

     /** The prefix. */
     String prefix;
     String filename;

     def regElision = null;
     def reg3pts = null;
     def regPunct;
@@ -126,11 +129,11 @@
     Pattern regLN;
     Pattern regCTRL;
     Pattern regSplitWhiteSpaces;

     public SimpleTokenizerXml(File infile, File outfile) {
         this(infile, outfile, "");
     }

     /**
      * Instantiates a new simple tokenizer xml.
      *
@@ -150,7 +153,7 @@
         } else if (lang.startsWith("it")) {
             pclitics = TokenizerClasses.PClitic_it
         }

         word_tags = TokenizerClasses.word_tags;
         reg_word_tags = Pattern.compile(word_tags);

@@ -158,13 +161,13 @@
         punct_strong = TokenizerClasses.punct_strong;
         punct_all = TokenizerClasses.punct_all;
         word_chars = TokenizerClasses.word_chars;

         this.outfile = outfile;
         this.infile = infile;
         this.filename = infile.getName();
         int index = filename.lastIndexOf(".");
         if (index > 0) filename = filename.substring(0, index);

         String strRegElision = TokenizerClasses.regElision;
         if (strRegElision != null && strRegElision.length() > 0)
             regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string ^
@@ -176,20 +179,20 @@

         if (fclitics != null && fclitics.length() > 0)
             regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $

         if (pclitics != null && pclitics.length() > 0)
             regPClitics = /^($pclitics)(.*)/ // the test must start with the start of string ^

         if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0)
             regWhitespaces = ~TokenizerClasses.whitespaces;

         if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0)
             regSplitWhiteSpaces = Pattern.compile(TokenizerClasses.whitespaces);

         regLN = Pattern.compile("/\n/");
         regCTRL = Pattern.compile("/\\p{C}/");
     }

     /**
      * Fill infos.
      *
@@ -202,7 +205,7 @@
             prefix = parser.getPrefix();
         }
     }

     /**
      * Donothing: just write what is read
      *
@@ -225,12 +228,12 @@
             //     writer.writeStartElement(namespace, localname);
             // else
             writer.writeStartElement(localname);

             // if(parser.getNamespaceCount() > 0)
             //     writer.writeDefaultNamespace(parser.getNamespaceURI(0))
             // for(int i = 1 ; i < parser.getNamespaceCount() ; i++)
             //     writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));

             String namespace_prefix;
             for (int i = 0 ; i< parser.getNamespaceCount() ; i++) {
                 namespace_prefix = parser.getNamespacePrefix(i);
@@ -240,7 +243,7 @@
                     writer.writeDefaultNamespace(parser.getNamespaceURI(i));
                 }
             }

             String attrprefix, attname;
             boolean hasId = false;
             //boolean hasType = false
@@ -251,20 +254,22 @@
                 if ("id".equals(attname)) hasId = true;
                 //if ("type".equals(attname)) hasType = true;
                 if ("n".equals(attname)) hasN = true;

                 if (attrprefix != null && attrprefix.length() > 0)
                     writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
                 else
                     writer.writeAttribute(attname, parser.getAttributeValue(i))
             }

             if (wordid != null && !hasId && localname == word_element_to_create)
                 writer.writeAttribute("id", "w_"+filename+"_"+wordcount);

             if (!hasN && localname == word_element_to_create)
                 writer.writeAttribute("n", ""+wordcount);

-            writer.writeCharacters("\n");
+            if (!reg_word_tags.matcher(localname).matches()) {
+                writer.writeCharacters("\n");
+            }
         }
         else if(event == XMLStreamConstants.END_ELEMENT)
         {
@@ -275,8 +280,8 @@
         {
             //println parser.getText();
             //writer.writeCharacters("𦟛");


             // checks if the token starts with an high surrogate
             // if(isHighSurrogate(parser.getText().charAt(0))) {
             //     println "warning: invalid UTF-8 XML range, token " + parser.getText() + " has been replaced.";
@@ -292,8 +297,8 @@
             //         writer.writeCharacters("&#" + scalar + ";");
             //     }
             // else


             if (insideword) { // ensure there is not \t or \n in the word form value
                 writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
             } else {
@@ -301,8 +306,8 @@
             }
         }
     }


     /**
      * Converts the specified surrogates pair to scalar.
      * @param highSurrogate
@@ -312,7 +317,7 @@
     public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate) {
         return ((highSurrogate - 0xD800) * 0x400) + (lowSurrogate - 0xDC00) + 0x10000;
     }

     /**
      * Checks if the specified character is an high/leading surrogate.
      * @param character
@@ -321,12 +326,12 @@
     public boolean isHighSurrogate(char character) {
         return (character >= 0xD800 && character <= 0xDBFF);
     }



     /** The wordcount. */
     int wordcount = 0;

     /** The ignorecontent. */
     boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
     boolean insideword = false;
@@ -345,16 +350,17 @@
         output = new BufferedOutputStream(new FileOutputStream(outfile))
         writer = factory.createXMLStreamWriter(output, "UTF-8")
         writer.setNamespaceContext(new PersonalNamespaceContext());

         def inputData = infile.toURI().toURL().openStream();
         def inputfactory = XMLInputFactory.newInstance();
         //inputfactory.setProperty("http://apache.org/xml/properties/input-buffer-size", new Integer(2048));
         //inputfactory.setExpandEntityReferences(false);
-        parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
+        XMLInputFactory.newInstance();
+        parser = inputfactory.createXMLStreamReader(inputData);
         //println "PARSER: "+parser.getClass()
         writer.writeStartDocument("UTF-8","1.0");
         writer.writeCharacters("\n");

         int previousEvent = 0;
         boolean startProcess = false;
         if (startTag == null) // if no startTag specified we process from the start
@@ -363,9 +369,9 @@

         buffer = new StringBuffer();
         //println "process - start start tag: "+startTag+" startProcess: $startProcess"
         // println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
         // println "reg_outside_text_tags=$reg_outside_text_tags"
         // println "reg_note_content=$reg_note_content"
         try {
             for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                 if (!startProcess) {
@@ -380,7 +386,7 @@
                         continue;
                     }
                 }

                 if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
                     processWord(); // tokenize now!
                     buffer.setLength(0);
@@ -391,10 +397,18 @@
                     localname = parser.getLocalName()
                     if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
                         //println "Found pretagged word";
-                        wordcount++;
-                        donothing(event, wordcount);
-                        //ignorecontent = true;
-                        insideword = true;
+
+                        if (retokenize) {
+                            retokenizedWordProperties.clear()
+                            for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+                                retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
+                            }
+                        } else {
+                            wordcount++;
+                            donothing(event, wordcount);
+                            //ignorecontent = true;
+                            insideword = true;
+                        }
                     } else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
                         // ignore the tag only
                         donothing(event, null); // write the tag
@@ -414,10 +428,14 @@
                     //println "Close: "+localname;
                     localname = parser.getLocalName()
                     if (reg_word_tags.matcher(localname).matches()) {
-                        //ignorecontent = false;
-                        insideword = false;
-                        writer.writeEndElement();
-                        writer.writeCharacters("\n");
+                        if (retokenize) {
+                            retokenizedWordProperties.clear()
+                        } else {
+                            //ignorecontent = false;
+                            insideword = false;
+                            writer.writeEndElement();
+                            writer.writeCharacters("\n");
+                        }
                     } else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
                         ignorecontent = false;
                         donothing(event, null);
@@ -448,7 +466,7 @@
                 }
                 previousEvent = event;
             }

             parser.close()
             writer.close();
             output.close();
@@ -464,7 +482,11 @@
         }
         return true;
     }

+    public void setRetokenize(boolean retokenize) {
+        this.retokenize = retokenize
+    }
+
     /**
      * Set the element and content to ignore
      *
@@ -526,13 +548,13 @@
         if (regSplitWhiteSpaces != null) {
             for (String s : regSplitWhiteSpaces.split(text)) { // separate with unicode white spaces
                 // if (DEBUG){println "process $s"}
                 iterate(s);
             }
         } else {
             iterate(text);
         }
     }

     /**
      * Iterate. a String, should be called when a word is found in a String
      *
@@ -541,11 +563,11 @@
      */
     protected iterate(String s) {
         while (s != null && s.length() > 0) {
             // if (DEBUG){println " > $s"}
             s = standardChecks(s);
         }
     }

     /**
      * Standard checks.
      *
@@ -556,7 +578,7 @@
     public String standardChecks(String s) {
         Matcher m;
         if (fclitics != null && (m = s =~ regFClitics) ) {
             // if (DEBUG) println "CLITIC found: $s ->"+ m
             iterate(m.group(1))

             wordcount++;
@@ -583,15 +605,15 @@

             return "";
         } else if (regElision != null && (m = s =~ regElision) ) {
             // if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
             //iterate(m.group(1))

             // int sep = s.indexOf("'");
             // if (sep < 0)
             //     sep = s.indexOf("’");
             // if (sep < 0)
             //     sep = s.indexOf("‘");

             wordcount++;
             writer.writeStartElement(word_element_to_create);
             writeWordAttributes();// id
@@ -599,14 +621,14 @@
             writer.writeCharacters(m.group(1));
             writer.writeEndElement();
             writer.writeCharacters("\n");

             iterate(m.group(2))

             return "";
         } else if (reg3pts != null && (m = s =~ reg3pts) ) {
             // if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
             iterate(m.group(1))

             wordcount++;
             writer.writeStartElement(word_element_to_create);
             writeWordAttributes();// id
@@ -614,12 +636,12 @@
             writer.writeCharacters("...");
             writer.writeEndElement();
             writer.writeCharacters("\n");

             return m.group(3);
         } else if (regPunct != null && (m = s =~ regPunct) ) {
             if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
             iterate(m.group(1));

             wordcount++;
             writer.writeStartElement(word_element_to_create);
             writeWordAttributes();// id
@@ -627,21 +649,21 @@
             writer.writeCharacters(m.group(2));
             writer.writeEndElement();
             writer.writeCharacters("\n");

             return m.group(3);
         } else {
             // if(DEBUG){println "Other found: "+s}
             wordcount++;
             writer.writeStartElement(word_element_to_create);
             writeWordAttributes();// id
             // if (reg_punct_other.matcher(s).matches())
             //     writer.writeAttribute("type","pon");
             // else
             //     writer.writeAttribute("type","w");
             writer.writeCharacters(s);
             writer.writeEndElement();
             writer.writeCharacters("\n");

             return "";
         }
     }
@@ -654,14 +676,23 @@
     protected writeWordAttributes() {
         writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
         writer.writeAttribute("n",""+wordcount);
+        for (String attr : retokenizedWordProperties.keySet()) {
+            if ("id" == attr) {
+                writer.writeAttribute("previous-id", retokenizedWordProperties[attr]);
+            } else if ("n" == attr) {
+                writer.writeAttribute("previous-n", retokenizedWordProperties[attr]);
+            } else {
+                writer.writeAttribute(attr, retokenizedWordProperties[attr]);
+            }
+        }
     }

     public void setStartTag(String tag)
     {
         this.startTag = tag;
         this.reg_startTag = Pattern.compile(startTag);
     }

     /**
      * Tokenize.
      *
@@ -672,7 +703,7 @@
     {
         return str.tokenize() // cut by whitespace
     }

     /**
      * The main method.
      *
@@ -680,58 +711,58 @@
      */
     public static void main(String[] args)
     {
-        File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
-        Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
-        File inputFile = new File(SVNDIR, "baye1.xml")
-        File outputFile = new File(SVNDIR, "baye1-t.xml")
-        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
-        tokenizer.DEBUG = true
-        tokenizer.setNote("note")
-        //tokenizer.setOutSideTextTags("teiHeader")
-        tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
-        println outputFile.toString() + " : "+tokenizer.process();
+        // File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
+        // Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
+        // File inputFile = new File(SVNDIR, "baye1.xml")
+        // File outputFile = new File(SVNDIR, "baye1-t.xml")
+        // SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
+        // tokenizer.DEBUG = true
+        // tokenizer.setNote("note")
+        // //tokenizer.setOutSideTextTags("teiHeader")
+        // tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
+        // println outputFile.toString() + " : "+tokenizer.process();
+
+
         // FIXME: tests UTF-8 XML ranges
         // File inputDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")
         // File inputFile = new File(inputDir, "problem2.xml");

         /*
         File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
         File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
         File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
         File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
         File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
         outputDir.deleteDir()
         outputDir2.deleteDir()
         outputDir3.deleteDir()
         outputDir.mkdir()
         outputDir2.mkdir()
         outputDir3.mkdir()
         long time;
         for (String lang : ["none"]) {
             println "LANG: $lang"
-
             time = System.currentTimeMillis();
             File outputFile = new File(outputDir, "${lang}.xml")
             SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
             tokenizer.process();
             println "FAST $lang: "+(System.currentTimeMillis()-time)
-
             // time = System.currentTimeMillis();
             // File outputFile3 = new File(outputDir3, "${lang}.xml")
             // FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
             // tokenizer3.process();
             // println "FAST2 $lang: "+(System.currentTimeMillis()-time)
             //
             // time = System.currentTimeMillis();
             // File outputFile2 = new File(outputDir2, "${lang}.xml")
             // OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
             // tokenizer2.process();
             // println "SLOW $lang: "+(System.currentTimeMillis()-time)
         }
         */
         println ""
         println "Done"

         // String lang = "en"
         // File inDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant")
         // File outDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant-t")
@@ -750,6 +781,19 @@
         // }
         // println ""
         // println "Done"

+        String lang = "fr"
+        File inFile = new File(System.getProperty("user.home"), "xml/vocapia/test.trs")
+        File outFile = new File(System.getProperty("user.home"), "xml/vocapia/test-retokenized.trs")
+
+        println "processing "+inFile
+
+        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inFile, outFile, lang)
+        tokenizer.setRetokenize(true)
+        //tokenizer.setDEBUG false
+        tokenizer.process();
+
+        println "Done"
+
     }
 }
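Note on the retokenize changes above: when `retokenize` is set, the attributes of each pre-existing `<w>` are captured into `retokenizedWordProperties` and re-emitted by `writeWordAttributes()` on every token cut from that word, with `id` and `n` renamed so they cannot collide with the freshly generated ones. A sketch of that remapping rule alone, on plain Groovy maps:

    // Remapping applied per re-emitted token: keep every attribute,
    // but preserve the original id/n as previous-id/previous-n.
    def remap = { Map old ->
        old.collectEntries { k, v ->
            if (k == "id") ["previous-id": v]
            else if (k == "n") ["previous-n": v]
            else [(k): v]
        }
    }

    assert remap([id: "w_text_12", n: "12", time: "9.94"]) ==
           ["previous-id": "w_text_12", "previous-n": "12", time: "9.94"]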
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/13nov/FixTranscription.groovy (revision 3003)

@@ -0,0 +1,216 @@
+package org.txm.macro.transcription
+
+import javax.xml.stream.*
+
+import org.txm.importer.PersonalNamespaceContext
+import org.txm.xml.IdentityHook
+import org.txm.xml.*
+
+import java.io.BufferedOutputStream
+import java.io.FileOutputStream
+import java.io.IOException
+import java.net.URL
+import java.util.*
+import java.util.Map.Entry
+import java.util.regex.Pattern
+
+class FixTranscription extends XMLProcessor {
+
+    LocalNamesHookActivator activator;
+    IdentityHook hook;
+    boolean retokenizeWords
+
+    public FixTranscription(File xmlfile, boolean retokenizeWords) {
+        super(xmlfile)
+        this.retokenizeWords = retokenizeWords
+
+        activator = new LocalNamesHookActivator<>(hook, ["w", "Turn", "Sync"]);
+
+        hook = new IdentityHook("word_hook", activator, this) {
+
+            boolean inTurn = false;
+
+            boolean inW = false;
+            StringBuilder wordBuffer = new StringBuilder();
+
+            String currentTime;
+            LinkedHashMap turnInfos = new LinkedHashMap()
+            LinkedHashMap wInfos = new LinkedHashMap()
+            boolean other
+
+            @Override
+            public boolean deactivate() {
+                return true;
+            }
+
+            @Override
+            public boolean _activate() {
+                return true;
+            }
+
+            @Override
+            protected void processStartElement() throws XMLStreamException, IOException {
+                if (localname.equals("Turn")) {
+                    // store values
+                    inTurn = true;
+                    turnInfos.clear()
+                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+                        turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
+                    }
+                    currentTime = turnInfos["startTime"]
+                    super.processStartElement();
+                } else if (localname.equals("Sync")) {
+                    currentTime = parser.getAttributeValue(null, "time")
+                    super.processStartElement();
+                } else if (localname.equals("w")) {
+                    // store values
+                    inW = true;
+                    wInfos.clear()
+                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+                        wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
+                    }
+                    String time = parser.getAttributeValue(null, "time")
+                    if (time != null && time.length() > 0) {
+                        currentTime = time
+                    }
+                    wordBuffer.setLength(0);
+                    return; // write w later
+                }
+                else {
+                    super.processStartElement();
+                }
+            }
+
+            @Override
+            protected void processCharacters() throws XMLStreamException {
+                if (inW) {
+                    wordBuffer.append(parser.getText())
+                }
+                else {
+                    super.processCharacters();
+                }
+            }
+
+            @Override
+            protected void processEndElement() throws XMLStreamException {
+                if (localname.equals("w")) {
+
+                    inW = false
+                    String word = wordBuffer.toString().trim()
+                    if (!other && word.startsWith("*")) {
+                        //close current Turn and start a 'other' Turn
+                        writer.writeEndElement() // current Turn
+                        writer.writeCharacters("\n")
+
+                        def tmpInfos = new LinkedHashMap()
+                        for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr]
+                        tmpInfos["orig-speaker"] = turnInfos["speaker"]
+                        tmpInfos["speaker"] = "other"
+                        tmpInfos["startTime"] = currentTime
+                        writer.writeStartElement("Turn")
+                        for (String attr : tmpInfos.keySet()) {
+                            writer.writeAttribute(attr, tmpInfos[attr])
+                        }
+
+                        other = true
+                        word = word.substring(1)
+                    }
+
+                    boolean shouldCloseOtherTurn = false;
+                    if (other && word.endsWith("*")) {
+                        shouldCloseOtherTurn = true;
+
+                        word = word.substring(0, word.length()-1)
+                        other = false
+                    }
+
+                    // split before re-writing the word
+                    def puncts = []
+                    if (retokenizeWords) {
+                        // while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
+                        //     puncts << word.substring(0, 1)
+                        //     word = word.substring(1, word.length())
+                        // }
+                        // //vocapia fix
+                        // Pattern reg = Pattern.compile("([^']+')(.+)")
+                        // def m = reg.matcher(word)
+                        // while (word.length() > 0 && m.matches()) {
+                        //     puncts << m.group(1)
+                        //     word = m.group(2)
+                        //     m = reg.matcher(word)
+                        // }
+                    }
+
+                    for (def punct : puncts) { // pre-retokenize if any
+                        writer.writeStartElement("w")
+                        for (String attr : wInfos.keySet()) {
+                            writer.writeAttribute(attr, wInfos[attr])
+                        }
+                        writer.writeCharacters(punct)
+                        writer.writeEndElement() // w
+                        writer.writeCharacters("\n")
+                    }
+
+                    puncts = []
+                    if (retokenizeWords) {
+                        // while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
+                        //     puncts << word.substring(word.length()-1, word.length())
+                        //     word = word.substring(0, word.length()-1)
+                        // }
+                    }
+
+                    if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
+                        writer.writeStartElement("event") // start the initial word
+                        writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
+                        writer.writeAttribute("type", "unknown")
+                        writer.writeAttribute("extent", "instantaneous")
+                        writer.writeEndElement() // event
+                        word = "" // don't write the word
+                    }
+
+                    if (word.length() > 0) {
+
+                        writer.writeStartElement("w") // start the initial word
+                        for (String attr : wInfos.keySet() ) {
+                            writer.writeAttribute(attr, wInfos[attr])
+                        }
+                        writer.writeCharacters(word)
+                        writer.writeEndElement() // w
+                    }
+
+                    for (String punct : puncts) { // post-retokenize if any
+                        writer.writeStartElement("w")
+                        for (String attr : wInfos.keySet() ) {
+                            writer.writeAttribute(attr, wInfos[attr])
+                        }
+                        writer.writeCharacters(punct)
+                        writer.writeEndElement() // w
+                    }
+
+                    if (shouldCloseOtherTurn) {
+                        shouldCloseOtherTurn = false;
+                        //close the current 'other' Turn and restart the actual Turn
+                        writer.writeEndElement() // current 'other' Turn
+
+                        writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
+                        turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
+                        for (String attr : turnInfos.keySet()) {
+                            writer.writeAttribute(attr, turnInfos[attr])
+                        }
+
+                        other = false
+                    }
+                } else {
+                    super.processEndElement();
+                }
+            }
+        }
+    }
+
+    public static void main(String[] args) {
+        File infile = new File("/home/mdecorde/xml/vocapia","test.trs")
+        File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs")
+        def processor = new FixTranscription(infile, true)
+        println processor.process(outfile)
+    }
+}
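Note on `processEndElement()` above: it implements the transcription convention where `*` brackets speech from another speaker; a leading `*` closes the current `Turn` and opens one with `speaker="other"`, and a trailing `*` closes that and restores the original `Turn`. A reduced sketch of just this state machine, with the XML writing replaced by list markers:

    boolean other = false
    def out = []
    for (String w in ["bonjour", "*ouais", "hum", "d'accord*", "merci"]) {
        if (!other && w.startsWith("*")) { out << "<Turn speaker=other>"; other = true; w = w.substring(1) }
        boolean closes = other && w.endsWith("*")
        if (closes) w = w.substring(0, w.length() - 1)
        out << w
        if (closes) { out << "</Turn>"; other = false }
    }
    assert out == ["bonjour", "<Turn speaker=other>", "ouais", "hum", "d'accord", "</Turn>", "merci"]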
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/13nov/FixTranscriptionsMacro.groovy (revision 3003)

@@ -0,0 +1,59 @@
+package org.txm.macro.transcription
+
+import java.time.LocalTime
+import java.time.format.DateTimeFormatter
+import org.txm.utils.*
+import org.txm.utils.logger.*
+
+@Field @Option(name="trsFile", usage="A single vocapia XML file", widget="FileOpen", required=false, def="")
+File trsFile;
+
+@Field @Option(name="trsDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="")
+File trsDirectory;
+
+@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
+File resultDirectory;
+
+if (!ParametersDialog.open(this)) return;
+
+if (trsDirectory.equals(resultDirectory) || trsFile.getParentFile().equals(resultDirectory)) {
+
+    return false;
+}
+
+resultDirectory.mkdirs();
+
+def trsFiles = []
+if (trsDirectory != null && trsDirectory.exists()) {
+
+    println "Processing TRS directory: $trsDirectory"
+    for (File file : trsDirectory.listFiles()) {
+        if (file.getName().toLowerCase().endsWith(".trs")) {
+            trsFiles << file
+        }
+    }
+} else if (trsFile != null && trsFile.exists()) {
+    println "Processing TRS file: $trsFile"
+    trsFiles << trsFile
+}
+
+if (trsFiles.size() == 0) {
+    println "No XML file found for parameters trsFile=$trsFile and trsDirectory=$trsDirectory"
+    return false
+}
+
+ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
+for (File trsFile : trsFiles) {
+    cpb.tick()
+    FixTranscription fixer = new FixTranscription(trsFile)
+    String name = FileUtils.stripExtension(trsFile)
+    File outFile = new File(resultDirectory, name+".trs")
+
+    if (!fixer.process(outFile)) {
+        println "WARNING: ERROR WHILE PROCESSING: "+trsFile
+        return false
+    }
+}
+cpb.done()
+
+println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3003)

@@ -36,8 +36,8 @@
         XMLStreamReader parser = factory.createXMLStreamReader(inputData);

         boolean flagWord = false
-        def winfos = [:]
-        def turninfos = [:]
+        def winfos = new LinkedHashMap()
+        def turninfos = new LinkedHashMap()
         boolean other = false;
         String word = ""
         try {
@@ -89,19 +89,22 @@
                     writer.writeStartElement("Episode")
                     //<Section type="report" startTime="0" endTime="3617.593">
                     writer.writeStartElement("Section")
+                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+                        writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
+                    }
                     break;

                 case "SpeechSegment": // <SpeechSegment ch="1" sconf="1.00" stime="9.94" etime="43.81" spkid="Enquêtrice" lang="fre" lconf="1.00" trs="1">
                     writer.writeStartElement("Turn")
-
-                    turninfos = ["speaker":parser.getAttributeValue(null, "spkid"),
-                        "startTime":parser.getAttributeValue(null, "stime"),
-                        "endTime":parser.getAttributeValue(null, "etime"),
-                    ]
-                    for (String attr : turninfos.keySet()) {
-                        writer.writeAttribute(attr, turninfos[attr])
+                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+                        String name = parser.getAttributeLocalName(i)
+                        if (name == "stime") name = "startTime"
+                        else if (name == "etime") name = "endTime"
+                        else if (name == "spkid") name = "speaker"
+
+                        writer.writeAttribute(name, parser.getAttributeValue(i))
                     }

                     writer.writeCharacters("\n")
                     writer.writeStartElement("Sync")
                     writer.writeAttribute("time", parser.getAttributeValue(null, "stime"))
@@ -114,7 +117,19 @@

                     // store w infos in case the word must be splited
                     def endValue = String.format(Locale.US, "%.2f", (Double.parseDouble(parser.getAttributeValue(null, "stime")) + Double.parseDouble(parser.getAttributeValue(null, "dur"))))
-                    winfos = ["time":parser.getAttributeValue(null, "stime"), "start": parser.getAttributeValue(null, "stime"), "end":endValue]
+                    def startValue = parser.getAttributeValue(null, "stime");
+                    winfos.clear()
+                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+                        String name = parser.getAttributeLocalName(i)
+                        if (name == "stime") continue
+
+                        winfos[name] = parser.getAttributeValue(i)
+                    }
+                    winfos["time"] = startValue
+                    winfos["start"] = startValue
+                    winfos["end"] = endValue
+
+                    // write <w> at the end of <Word> tag
                     break
                 }
                 break;
@@ -154,44 +169,17 @@
                 case "Word":
                     flagWord = false
                     word = word.trim()
-                    if (word.startsWith("*")) {
-                        //close current Turn and start a 'other' Turn
-                        writer.writeEndElement() // current Turn
-                        writer.writeStartElement("Turn")
-                        writer.writeAttribute("speaker", "other")
-                        writer.writeAttribute("startTime", winfos["time"])
-                        writer.writeAttribute("orig-speaker", turninfos["speaker"])
-                        writer.writeCharacters("\n")
-                        other = true
-                        word = word.substring(1)
-                    }

-                    String otherAttributeValue = Boolean.toString(other) // set now
-
-                    if (other && word.endsWith("*")) {
-
-                        //close the current 'other' Turn and restart the actual Turn
-                        writer.writeEndElement() // current 'other' Turn
-
-                        writer.writeStartElement("Turn") // rebuild the orig Turn with its infos
-                        turninfos["startTime"] = winfos["end"] // fix the startTime using the current word end time
-                        for (String attr : turninfos.keySet()) {
-                            writer.writeAttribute(attr, turninfos[attr])
-                        }
-                        writer.writeCharacters("\n")
-
-                        word = word.substring(0, word.length()-1)
-                        other = false
-                    }
-
                     // split before the word
                     def puncts = []
                     if (retokenizeWords) {
+
                         while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
-                            puncts << word.substring(0, 1)
-                            word = word.substring(1, word.length())
+                            puncts << word.substring(0 ,1)
+                            word = word.substring(1)
                         }

+                        // fix "d'abord" like words
                         Pattern reg = Pattern.compile("([^']+')(.+)")
                         def m = reg.matcher(word)
                         while (word.length() > 0 && m.matches()) {
@@ -200,7 +188,7 @@
                             m = reg.matcher(word)
                         }
                     }

                     for (def punct : puncts) { // pre-retokenize if any
                         writer.writeStartElement("w")
                         for (String attr : winfos.keySet()) {
@@ -214,26 +202,26 @@
                     puncts = []
                     if (retokenizeWords) {
                         while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
-                            puncts << word.substring(word.length()-1, word.length())
+                            puncts.add(0, word.substring(word.length()-1, word.length()))
                             word = word.substring(0, word.length()-1)
                         }
                     }

-                    writer.writeStartElement("w") // start the initial word
-                    writer.writeAttribute("time", winfos["time"])
-                    writer.writeAttribute("start", winfos["start"])
-                    writer.writeAttribute("end", winfos["end"])
-                    writer.writeAttribute("other", otherAttributeValue)
+                    if (word.length() > 0) {
+                        writer.writeStartElement("w") // start the initial word
+                        for (String attr : winfos.keySet()) {
+                            writer.writeAttribute(attr, winfos[attr])
+                        }
+                        writer.writeCharacters(word)
+                        writer.writeEndElement() // w
+                        writer.writeCharacters("\n")
+                    }

-                    writer.writeCharacters(word)
-                    writer.writeEndElement() // w
-                    writer.writeCharacters("\n")
-
-                    for (String punct : puncts) { // post-retokenize if any
+                    for (String punct : puncts) { // write post-retokenize if any
                         writer.writeStartElement("w")
-                        writer.writeAttribute("time", winfos["time"])
-                        writer.writeAttribute("start", winfos["end"])
-                        writer.writeAttribute("end", winfos["end"])
+                        for (String attr : winfos.keySet()) {
+                            writer.writeAttribute(attr, winfos[attr])
+                        }
                         writer.writeCharacters(punct)
                         writer.writeEndElement() // w
                         writer.writeCharacters("\n")
@@ -269,6 +257,6 @@
         File infile = new File("/home/mdecorde/xml/vocapia","test.xml")
         File outfile = new File("/home/mdecorde/xml/vocapia","test.trs")
         def processor = new Vocapia2Transcriber(infile)
-        println processor.process(outfile)
+        println processor.process(outfile, true)
     }
 }
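Note on the trailing-punctuation hunk: the loop peels characters off the end of the word right to left, so appending with `<<` (the old code) emits them in reversed order; the new `puncts.add(0, ...)` prepends and restores text order. A check of that behaviour:

    String word = "fin)."
    def puncts = []
    while (word.length() > 0 && word.matches(/.+\p{Punct}/)) {
        puncts.add(0, word.substring(word.length() - 1)) // prepend keeps text order
        word = word.substring(0, word.length() - 1)
    }
    assert word == "fin"
    assert puncts == [")", "."] // with '<<' this would be [".", ")"]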
Available formats: Unified diff