Révision 3344
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 3344) | ||
---|---|---|
188 | 188 |
try { |
189 | 189 |
TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang) |
190 | 190 |
tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false")) |
191 |
tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true")) |
|
191 | 192 |
if (!tokenizer.process()) { |
192 | 193 |
println("Failed to tokenize "+pfile) |
193 | 194 |
} |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3344) | ||
---|---|---|
340 | 340 |
new File(module.getBinaryDirectory(),"tokenized").deleteDir() |
341 | 341 |
new File(module.getBinaryDirectory(),"tokenized").mkdir() |
342 | 342 |
|
343 |
boolean retokenize = "true" == module.getProject().getTokenizerParameter("doRetokenizeStep", "false") |
|
344 | 343 |
String outSideTextTagsRegex = ""; |
345 | 344 |
String outSideTextTagsAndKeepContentRegex = ""; |
346 | 345 |
String noteRegex = ""; |
... | ... | |
398 | 397 |
tokenizer.setStringTokenizer(stringTokenizer) |
399 | 398 |
} |
400 | 399 |
} |
401 |
tokenizer.setRetokenize(retokenize) |
|
400 |
tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false")) |
|
401 |
tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true")) |
|
402 | 402 |
if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) { |
403 | 403 |
tokenizer.setOutSideTextTags(outSideTextTagsRegex) |
404 | 404 |
} |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xml/importer.groovy (revision 3344) | ||
---|---|---|
164 | 164 |
File infile = f |
165 | 165 |
File outfile = new File(binDir, "tokenized/"+f.getName()) |
166 | 166 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang)) |
167 |
boolean retokenize = "true" == project.getTokenizerParameter("doRetokenizeStep", "false")
|
|
168 |
tokenizer.setRetokenize(retokenize)
|
|
167 |
tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
|
|
168 |
tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
|
|
169 | 169 |
if (ignoredElements != null && ignoredElements.trim().length() > 0) { |
170 | 170 |
tokenizer.setOutSideTextTagsAndKeepContent(ignoredElements) |
171 | 171 |
} |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3344) | ||
---|---|---|
61 | 61 |
|
62 | 62 |
StringTokenizer stringTokenizer; |
63 | 63 |
boolean retokenize = false |
64 |
boolean doBuildWordIDs = true |
|
65 |
ArrayList noIdWrittenErrors = new ArrayList() |
|
64 | 66 |
LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap() |
65 | 67 |
|
66 | 68 |
/** The word_tags. */ |
... | ... | |
183 | 185 |
if (event == XMLStreamConstants.START_ELEMENT ) { |
184 | 186 |
|
185 | 187 |
localname = parser.getLocalName(); |
186 |
if (wordid != null) { |
|
188 |
if (wordid != null) { // normalizing word elements
|
|
187 | 189 |
localname = word_element_to_create; |
188 | 190 |
} |
189 | 191 |
|
... | ... | |
210 | 212 |
} |
211 | 213 |
} |
212 | 214 |
|
213 |
String attrprefix, attname; |
|
214 |
boolean hasId = false; |
|
215 |
//boolean hasType = false |
|
216 |
boolean hasN = false |
|
217 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
218 |
attname = parser.getAttributeLocalName(i); |
|
219 |
attrprefix = parser.getAttributePrefix(i); |
|
220 |
if ("id".equals(attname)) hasId = true; |
|
221 |
//if ("type".equals(attname)) hasType = true; |
|
222 |
if ("n".equals(attname)) hasN = true; |
|
223 |
|
|
224 |
if (attrprefix != null && attrprefix.length() > 0) { |
|
225 |
writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i)) |
|
226 |
} else { |
|
227 |
writer.writeAttribute(attname, parser.getAttributeValue(i)) |
|
215 |
if (word_element_to_create.equals(localname)) { // the localname has been normalized to word_element_to_create |
|
216 |
writeWordAttributes(true) |
|
217 |
} else { |
|
218 |
String attrprefix, attname; |
|
219 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
220 |
attname = parser.getAttributeLocalName(i); |
|
221 |
attrprefix = parser.getAttributePrefix(i); |
|
222 |
|
|
223 |
if (attrprefix != null && attrprefix.length() > 0) { |
|
224 |
writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i)) |
|
225 |
} else { |
|
226 |
writer.writeAttribute(attname, parser.getAttributeValue(i)) |
|
227 |
} |
|
228 | 228 |
} |
229 | 229 |
} |
230 | 230 |
|
231 |
if (wordid != null && !hasId && localname == word_element_to_create) { |
|
232 |
writer.writeAttribute("id", "w_"+filename+"_"+wordcount); |
|
233 |
} |
|
234 |
if (!hasN && localname == word_element_to_create) { |
|
235 |
writer.writeAttribute("n", ""+wordcount); |
|
236 |
} |
|
237 | 231 |
if (!reg_word_tags.matcher(localname).matches()) { |
238 | 232 |
writer.writeCharacters("\n"); |
239 | 233 |
} |
... | ... | |
433 | 427 |
previousEvent = event; |
434 | 428 |
} |
435 | 429 |
|
430 |
if (noIdWrittenErrors.size() > 0) { |
|
431 |
println "Missing word IDs in source XML: "+noIdWrittenErrors.join(", ") |
|
432 |
} |
|
433 |
|
|
436 | 434 |
parser.close() |
437 | 435 |
writer.close(); |
438 | 436 |
output.close(); |
... | ... | |
453 | 451 |
this.retokenize = retokenize |
454 | 452 |
} |
455 | 453 |
|
454 |
public void setDoBuildWordIDs(boolean doBuildWordIDs) { |
|
455 |
this.doBuildWordIDs = doBuildWordIDs |
|
456 |
} |
|
457 |
|
|
456 | 458 |
/** |
457 | 459 |
* Set the element and content to ignore |
458 | 460 |
* |
... | ... | |
517 | 519 |
for (def word : words) { |
518 | 520 |
wordcount++; |
519 | 521 |
writer.writeStartElement(word_element_to_create); |
520 |
writeWordAttributes();// id |
|
522 |
writeWordAttributes(false);// id
|
|
521 | 523 |
writer.writeCharacters(word); |
522 | 524 |
writer.writeEndElement(); |
523 | 525 |
writer.writeCharacters("\n"); |
... | ... | |
533 | 535 |
* |
534 | 536 |
* @return the java.lang. object |
535 | 537 |
*/ |
536 |
protected writeWordAttributes() { |
|
537 |
writer.writeAttribute("id", "w_"+filename+"_"+wordcount); |
|
538 |
writer.writeAttribute("n",""+wordcount); |
|
538 |
protected writeWordAttributes(boolean fromParser) { |
|
539 |
boolean idWritten = doBuildWordIDs; |
|
540 |
if (doBuildWordIDs) { |
|
541 |
writer.writeAttribute("id", "w_"+filename+"_"+wordcount); |
|
542 |
writer.writeAttribute("n",""+wordcount); |
|
543 |
} |
|
544 |
|
|
545 |
// rewrite the properties of the retokenized word |
|
539 | 546 |
for (String attr : retokenizedWordProperties.keySet()) { |
540 |
if ("id" == attr) { |
|
541 |
writer.writeAttribute("old-id", retokenizedWordProperties[attr]);
|
|
542 |
} else if ("n" == attr) { |
|
543 |
writer.writeAttribute("old-n", retokenizedWordProperties[attr]);
|
|
547 |
if (doBuildWordIDs && "id" == attr) {
|
|
548 |
writer.writeAttribute("foreign-id", retokenizedWordProperties[attr]);
|
|
549 |
} else if (doBuildWordIDs && "n" == attr) {
|
|
550 |
writer.writeAttribute("foreign-n", retokenizedWordProperties[attr]);
|
|
544 | 551 |
} else { |
545 | 552 |
writer.writeAttribute(attr, retokenizedWordProperties[attr]); |
553 |
if ("id" == attr) idWritten = true; |
|
546 | 554 |
} |
547 | 555 |
} |
556 |
if (fromParser) { |
|
557 |
String attrprefix, attname; |
|
558 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
559 |
attname = parser.getAttributeLocalName(i); |
|
560 |
attrprefix = parser.getAttributePrefix(i); |
|
561 |
|
|
562 |
if (doBuildWordIDs && "id" == attname) { |
|
563 |
writer.writeAttribute("foreign-id", parser.getAttributeValue(i)); |
|
564 |
} else if (doBuildWordIDs && "n" == attname) { |
|
565 |
writer.writeAttribute("foreign-n", parser.getAttributeValue(i)); |
|
566 |
} else { |
|
567 |
if (attrprefix != null && attrprefix.length() > 0) { |
|
568 |
writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i)) |
|
569 |
} else { |
|
570 |
writer.writeAttribute(attname, parser.getAttributeValue(i)) |
|
571 |
} |
|
572 |
if ("id" == attname) idWritten = true; |
|
573 |
} |
|
574 |
} |
|
575 |
} |
|
576 |
if (!idWritten && noIdWrittenErrors.size() < 20) { |
|
577 |
noIdWrittenErrors << parser.getLocation().toString(); |
|
578 |
if (noIdWrittenErrors.size() == 20) { |
|
579 |
noIdWrittenErrors << "..."; |
|
580 |
} |
|
581 |
} |
|
582 |
|
|
583 |
|
|
548 | 584 |
} |
549 | 585 |
|
550 | 586 |
public void setStartTag(String tag) |
TXM/trunk/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/TokenizerSection.java (revision 3344) | ||
---|---|---|
34 | 34 |
|
35 | 35 |
Button doRetokenizeStepButton; |
36 | 36 |
|
37 |
Button doBuildWordIDsButton; |
|
38 |
|
|
37 | 39 |
Text textElisions; |
38 | 40 |
|
39 | 41 |
Text textPuncts; |
... | ... | |
111 | 113 |
gdata.colspan = 3; |
112 | 114 |
doRetokenizeStepButton.setLayoutData(gdata); |
113 | 115 |
|
116 |
doBuildWordIDsButton = toolkit.createButton(sectionClient, "Build word identifiers", SWT.CHECK); |
|
117 |
doBuildWordIDsButton.setToolTipText(TXMUIMessages.performWordSegmentationWithinWord); |
|
118 |
doBuildWordIDsButton.setSelection(true); |
|
119 |
doBuildWordIDsButton.setVisible(moduleParams.get(ImportModuleCustomization.ADVANCEDTOKENIZER)); |
|
120 |
gdata = new TableWrapData(TableWrapData.LEFT, TableWrapData.FILL); |
|
121 |
gdata.colspan = 3; |
|
122 |
doBuildWordIDsButton.setLayoutData(gdata); |
|
114 | 123 |
|
115 | 124 |
Label l0 = toolkit.createLabel(sectionClient, TXMUIMessages.separatorCharacters, SWT.WRAP); |
116 | 125 |
gdata = new TableWrapData(TableWrapData.LEFT, TableWrapData.FILL); |
... | ... | |
303 | 312 |
protected void updateFieldsEnableState() { |
304 | 313 |
boolean enabled = doTokenizeStepButton.getSelection(); |
305 | 314 |
doRetokenizeStepButton.setEnabled(enabled); |
315 |
doBuildWordIDsButton.setEnabled(enabled); |
|
306 | 316 |
textWhiteSpaces.setEnabled(enabled); |
307 | 317 |
textPuncts.setEnabled(enabled); |
308 | 318 |
textElisions.setEnabled(enabled); |
... | ... | |
329 | 339 |
doTokenizeStepButton.setSelection(doTokenizeStep); |
330 | 340 |
boolean doRetokenizeStep = "true".equals(project.getTokenizerParameter("doRetokenizeStep", "false")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ |
331 | 341 |
doRetokenizeStepButton.setSelection(doRetokenizeStep); |
342 |
boolean doBuildWordIds = "true".equals(project.getTokenizerParameter("doBuildWordIds", "true")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ |
|
343 |
doBuildWordIDsButton.setSelection(doBuildWordIds); |
|
332 | 344 |
updateFieldsEnableState(); |
333 | 345 |
|
334 | 346 |
// HashMap<String, TTest> additionalTests = project.getTokenizerAdditionalTests(); |
... | ... | |
351 | 363 |
project.addTokenizerParameter("word_tags", wordElementText.getText()); //$NON-NLS-1$ |
352 | 364 |
project.addTokenizerParameter("doTokenizeStep", "" + doTokenizeStepButton.getSelection()); //$NON-NLS-1$ //$NON-NLS-2$ |
353 | 365 |
project.addTokenizerParameter("doRetokenizeStep", "" + doRetokenizeStepButton.getSelection()); //$NON-NLS-1$ //$NON-NLS-2$ |
366 |
project.addTokenizerParameter("doBuildWordIds", "" + doBuildWordIDsButton.getSelection()); //$NON-NLS-1$ //$NON-NLS-2$ |
|
354 | 367 |
project.addTokenizerParameter("onlyThoseTests", "false"); //$NON-NLS-1$ //$NON-NLS-2$ |
355 | 368 |
// project.setTokenizerAdditionalTests(additionalTokenizerTests); //$NON-NLS-1$ |
356 | 369 |
|
Formats disponibles : Unified diff