Révision 3706
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3706) | ||
---|---|---|
408 | 408 |
if (outSideTextTagsAndKeepContentRegex != null && outSideTextTagsAndKeepContentRegex.trim().length() > 0) { |
409 | 409 |
tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex) |
410 | 410 |
} |
411 |
|
|
412 |
if (noteRegex != null && noteRegex.trim().length() > 0) { |
|
413 |
tokenizer.setNote(noteRegex) |
|
414 |
} |
|
415 |
|
|
411 |
|
|
416 | 412 |
// tokenize ! |
417 | 413 |
if (!tokenizer.process()) { |
418 | 414 |
println("Failed to process "+f) |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/ChunkTokenizerXml.groovy (revision 3706) | ||
---|---|---|
580 | 580 |
} |
581 | 581 |
|
582 | 582 |
/** |
583 |
* Set element content to NOT tokenize |
|
584 |
* |
|
585 |
* @param regexp |
|
586 |
*/ |
|
587 |
public void setNote(String regexp) { |
|
588 |
this.note_content = regexp; |
|
589 |
this.reg_note_content = Pattern.compile(note_content); |
|
590 |
} |
|
591 |
|
|
592 |
/** |
|
593 | 583 |
* Set the element to ignore but not their content |
594 | 584 |
* |
595 | 585 |
* @param regexp |
... | ... | |
695 | 685 |
|
696 | 686 |
ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(inFile, lang) |
697 | 687 |
tokenizer.setRetokenize(false) |
698 |
tokenizer.setNote("note") |
|
699 | 688 |
//tokenizer.setOutSideTextTags("outsideToEdit") |
700 | 689 |
tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit") |
701 | 690 |
//tokenizer.setDEBUG false |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3706) | ||
---|---|---|
464 | 464 |
this.outside_text_tags = regexp; |
465 | 465 |
this.reg_outside_text_tags = Pattern.compile(outside_text_tags); |
466 | 466 |
} |
467 |
|
|
467 |
|
|
468 | 468 |
/** |
469 |
* Set element content to NOT tokenize |
|
470 |
* |
|
471 |
* @param regexp |
|
472 |
*/ |
|
473 |
public void setNote(String regexp) { |
|
474 |
this.note_content = regexp; |
|
475 |
this.reg_note_content = Pattern.compile(note_content); |
|
476 |
} |
|
477 |
|
|
478 |
/** |
|
479 | 469 |
* Set the element to ignore but not their content |
480 | 470 |
* |
481 | 471 |
* @param regexp |
... | ... | |
595 | 585 |
* @param str the str |
596 | 586 |
* @return the list |
597 | 587 |
*/ |
598 |
public List<String> tokenize(String str) |
|
599 |
{ |
|
588 |
public List<String> tokenize(String str) { |
|
600 | 589 |
return str.tokenize() // cut by whitespace |
601 | 590 |
} |
602 | 591 |
|
... | ... | |
605 | 594 |
* |
606 | 595 |
* @param args the arguments |
607 | 596 |
*/ |
608 |
public static void main(String[] args) |
|
609 |
{ |
|
597 |
public static void main(String[] args) {
|
|
598 |
|
|
610 | 599 |
// File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum"); |
611 | 600 |
// Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath()); |
612 | 601 |
// File inputFile = new File(SVNDIR, "baye1.xml") |
613 | 602 |
// File outputFile = new File(SVNDIR, "baye1-t.xml") |
614 | 603 |
// SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr") |
615 | 604 |
// tokenizer.DEBUG = true |
616 |
// tokenizer.setNote("note") |
|
617 | 605 |
// //tokenizer.setOutSideTextTags("teiHeader") |
618 | 606 |
// tokenizer.setOutSideTextTagsAndKeepContent("teiHeader") |
619 | 607 |
// println outputFile.toString() + " : "+tokenizer.process(); |
... | ... | |
686 | 674 |
|
687 | 675 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inFile, outFile, lang) |
688 | 676 |
tokenizer.setRetokenize(false) |
689 |
tokenizer.setNote("note") |
|
690 | 677 |
//tokenizer.setOutSideTextTags("outsideToEdit") |
691 | 678 |
tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit") |
692 | 679 |
//tokenizer.setDEBUG false |
Formats disponibles : Unified diff