Révision 1177
| tmp/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/TokenizerSection.java (revision 1177) | ||
|---|---|---|
| 303 | 303 |
textPunctsStrong.setText(project.getTokenizerParameter("punct_strong", TokenizerClasses.punct_strong));
|
| 304 | 304 |
textElisions.setText(project.getTokenizerParameter("regElision", TokenizerClasses.regElision));
|
| 305 | 305 |
|
| 306 |
String wtagElement = project.getTokenizerParameter("wordelement", "w");
|
|
| 306 |
String wtagElement = project.getTokenizerWordElement();
|
|
| 307 | 307 |
wordElementText.setText(wtagElement); |
| 308 | 308 |
|
| 309 | 309 |
boolean doTokenizeStep = project.getDoTokenizerStep(); |
| ... | ... | |
| 327 | 327 |
project.addTokenizerParameter("regPunct", textPuncts.getText()); //$NON-NLS-1$
|
| 328 | 328 |
project.addTokenizerParameter("punct_strong", textPunctsStrong.getText()); //$NON-NLS-1$
|
| 329 | 329 |
project.addTokenizerParameter("regElision", textElisions.getText()); //$NON-NLS-1$
|
| 330 |
project.addTokenizerParameter("wordelement", wordElementText.getText()); //$NON-NLS-1$
|
|
| 330 |
project.addTokenizerParameter("word_tags", wordElementText.getText()); //$NON-NLS-1$
|
|
| 331 | 331 |
project.addTokenizerParameter("doTokenizeStep", ""+doTokenizeStepButton.getSelection()); //$NON-NLS-1$
|
| 332 | 332 |
project.addTokenizerParameter("onlyThoseTests", "false"); //$NON-NLS-1$
|
| 333 | 333 |
//project.setTokenizerAdditionalTests(additionalTokenizerTests); //$NON-NLS-1$ |
| tmp/org.txm.core/src/java/org/txm/objects/Project.java (revision 1177) | ||
|---|---|---|
| 802 | 802 |
// } |
| 803 | 803 |
|
| 804 | 804 |
if (!rcpProject.getName().equals(pName)) { // project directory and project name must stay in sync
|
| 805 |
IPath path = rcpProject.getProjectRelativePath(); |
|
| 806 |
String name = rcpProject.getName(); |
|
| 805 | 807 |
IPath destination = new Path(this.getName()); |
| 806 |
if (destination.toFile().exists()) {
|
|
| 807 |
System.out.println("Error: could not rename project since one already existing with the same new name="+this.getName());
|
|
| 808 |
return false; |
|
| 808 |
|
|
| 809 |
IWorkspace rcpWorkspace = ResourcesPlugin.getWorkspace(); |
|
| 810 |
IProject p2 = rcpWorkspace.getRoot().getProject(this.getName()); |
|
| 811 |
if (p2.exists()) {
|
|
| 812 |
//System.out.println("Error: could not rename project since one already existing with the same new name="+this.getName());
|
|
| 813 |
//return false; |
|
| 814 |
rcpProject = p2; |
|
| 815 |
} else {
|
|
| 816 |
rcpProject.move(destination, true, null); |
|
| 809 | 817 |
} |
| 810 |
rcpProject.move(destination, true, null); |
|
| 811 | 818 |
} |
| 812 | 819 |
|
| 813 | 820 |
ImportEngines engines = Toolbox.getImportEngines(); |
| ... | ... | |
| 1007 | 1014 |
|
| 1008 | 1015 |
public String getTokenizerWordElement() {
|
| 1009 | 1016 |
IEclipsePreferences params = this.getPreferencesScope().getNode("Tokenizer");
|
| 1010 |
return params.get("wordelement", "w");
|
|
| 1017 |
return params.get("word_tags", "w");
|
|
| 1011 | 1018 |
} |
| 1012 | 1019 |
|
| 1013 | 1020 |
public boolean getDoTokenizerStep() {
|
| tmp/org.txm.core/src/java/org/txm/scripts/importer/MileStoneProjection.groovy (revision 1177) | ||
|---|---|---|
| 60 | 60 |
switch (event) {
|
| 61 | 61 |
case XMLStreamConstants.START_ELEMENT: |
| 62 | 62 |
localname = parser.getLocalName() |
| 63 |
if (start && localname == mileStoneTag) {
|
|
| 63 |
if (start && localname.equals(mileStoneTag)) {
|
|
| 64 | 64 |
milestonesLength << mileStoneDistance; |
| 65 | 65 |
mileStoneDistance = 0 |
| 66 | 66 |
milestonesCounter++; |
| 67 |
} else if (start && localname == wordTag) {
|
|
| 67 |
} else if (start && localname.equals(wordTag)) {
|
|
| 68 | 68 |
mileStoneDistance++ |
| 69 |
} else if (localname == startTag) {
|
|
| 69 |
} else if (localname.equals(startTag)) {
|
|
| 70 | 70 |
start = true |
| 71 | 71 |
} |
| 72 | 72 |
break; |
| ... | ... | |
| 75 | 75 |
milestonesLength[milestonesCounter] = mileStoneDistance; |
| 76 | 76 |
parser.close() |
| 77 | 77 |
milestonesCounter = 0; |
| 78 |
//println milestonesLength |
|
| 78 | 79 |
} |
| 79 | 80 |
|
| 80 | 81 |
|
| ... | ... | |
| 82 | 83 |
public void processStartElement() {
|
| 83 | 84 |
super.processStartElement(); |
| 84 | 85 |
|
| 85 |
if (start && localname == mileStoneTag) {
|
|
| 86 |
if (start && localname.equals(mileStoneTag)) {
|
|
| 86 | 87 |
mileStoneDistance = 0 |
| 87 | 88 |
mileStoneID = parser.getAttributeValue(null, "id") |
| 88 | 89 |
if (mileStoneID == null) mileStoneID = parser.getAttributeValue("xml", "id");
|
| 89 | 90 |
if (mileStoneID == null) mileStoneID = "0"; |
| 90 | 91 |
|
| 91 | 92 |
milestonesCounter++; |
| 92 |
} else if (start && localname == wordTag) {
|
|
| 93 |
} else if (start && localname.equals(wordTag)) {
|
|
| 93 | 94 |
// println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance |
| 94 | 95 |
writer.writeAttribute(msEndAttributeName, Integer.toString((milestonesLength[milestonesCounter] - mileStoneDistance - 1))) |
| 95 | 96 |
writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance)) |
| 96 | 97 |
writer.writeAttribute(msIdAttributeName, mileStoneID) |
| 97 | 98 |
|
| 98 | 99 |
mileStoneDistance++ |
| 99 |
} else if (localname == startTag) {
|
|
| 100 |
} else if (localname.equals(startTag)) {
|
|
| 100 | 101 |
start = true |
| 101 | 102 |
} |
| 102 | 103 |
} |
| 103 | 104 |
|
| 104 | 105 |
public static void main(String[] args) {
|
| 105 |
File inputFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/tokenized/1538_MarotAdole.xml")
|
|
| 106 |
File outputFile = new File("/home/mdecorde/TEMP/tmp.xml")
|
|
| 106 |
File inputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/test.xml")
|
|
| 107 |
File outputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/result.xml")
|
|
| 107 | 108 |
|
| 108 |
MileStoneProjection msp = new MileStoneProjection(inputFile, "body", "w", "lb");
|
|
| 109 |
MileStoneProjection msp = new MileStoneProjection(inputFile, "text", "w", "lb");
|
|
| 109 | 110 |
println "Sucess: "+msp.process(outputFile) |
| 110 | 111 |
} |
| 111 | 112 |
} |
| tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 1177) | ||
|---|---|---|
| 85 | 85 |
|
| 86 | 86 |
/** The word_tags. */ |
| 87 | 87 |
public static String word_tags = "w"; |
| 88 |
/** the element to create when a word is created */ |
|
| 89 |
public static String word_element_to_create = "w"; |
|
| 88 | 90 |
|
| 89 | 91 |
/** The intraword_tags. */ |
| 90 | 92 |
public static String intraword_tags = "c|ex|caesura"; |
| ... | ... | |
| 420 | 422 |
for (TTest test : tests) {
|
| 421 | 423 |
System.out.println(" "+test+"");
|
| 422 | 424 |
} |
| 423 |
|
|
| 424 | 425 |
} |
| 425 | 426 |
|
| 426 | 427 |
public static boolean isValid() {
|
| ... | ... | |
| 538 | 539 |
|
| 539 | 540 |
if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
|
| 540 | 541 |
|
| 541 |
if (key == "tag_all")
|
|
| 542 |
if (key.equals("tag_all"))
|
|
| 542 | 543 |
tag_all = value; |
| 543 |
else if (key == "enclitics")
|
|
| 544 |
else if (key.equals("enclitics"))
|
|
| 544 | 545 |
enclitics = value; |
| 545 |
else if (key == "encliticsFR")
|
|
| 546 |
else if (key.equals("encliticsFR"))
|
|
| 546 | 547 |
encliticsFR = value; |
| 547 |
else if (key == "div_tags")
|
|
| 548 |
else if (key.equals("div_tags"))
|
|
| 548 | 549 |
div_tags = value; |
| 549 |
else if (key == "q_tags")
|
|
| 550 |
else if (key.equals("q_tags"))
|
|
| 550 | 551 |
q_tags = value; |
| 551 |
else if (key == "extraword1_tags")
|
|
| 552 |
else if (key.equals("extraword1_tags"))
|
|
| 552 | 553 |
extraword1_tags = value; |
| 553 |
else if (key == "corr_tags_no_seg")
|
|
| 554 |
else if (key.equals("corr_tags_no_seg"))
|
|
| 554 | 555 |
corr_tags_no_seg = value; |
| 555 |
else if (key == "word_tags")
|
|
| 556 |
else if (key.equals("word_tags"))
|
|
| 556 | 557 |
word_tags = value; |
| 557 |
else if (key == "intraword_tags")
|
|
| 558 |
else if (key.equals("intraword_tags"))
|
|
| 558 | 559 |
intraword_tags = value; |
| 559 |
else if (key == "punct_quotes")
|
|
| 560 |
else if (key.equals("punct_quotes"))
|
|
| 560 | 561 |
punct_quotes = value; |
| 561 |
else if (key == "punct_strong1")
|
|
| 562 |
else if (key.equals("punct_strong1"))
|
|
| 562 | 563 |
punct_strong1 = value; |
| 563 |
else if (key == "punct_strong2")
|
|
| 564 |
else if (key.equals("punct_strong2"))
|
|
| 564 | 565 |
punct_strong2 = value; |
| 565 |
else if (key == "punct_paren_open1")
|
|
| 566 |
else if (key.equals("punct_paren_open1"))
|
|
| 566 | 567 |
punct_paren_open1 = value; |
| 567 |
else if (key == "punct_paren_open2")
|
|
| 568 |
else if (key.equals("punct_paren_open2"))
|
|
| 568 | 569 |
punct_paren_open2 = value; |
| 569 |
else if (key == "punct_paren_close1")
|
|
| 570 |
else if (key.equals("punct_paren_close1"))
|
|
| 570 | 571 |
punct_paren_close1 = value; |
| 571 |
else if (key == "punct_paren_close2")
|
|
| 572 |
else if (key.equals("punct_paren_close2"))
|
|
| 572 | 573 |
punct_paren_close2 = value; |
| 573 |
else if (key == "punct_weak")
|
|
| 574 |
else if (key .equals("punct_weak"))
|
|
| 574 | 575 |
punct_weak = value; |
| 575 |
else if (key == "entity")
|
|
| 576 |
else if (key.equals("entity"))
|
|
| 576 | 577 |
entity = value; |
| 577 |
else if (key == "seg_tags")
|
|
| 578 |
else if (key.equals("seg_tags"))
|
|
| 578 | 579 |
seg_tags = value; |
| 579 |
else if (key == "regPunct")
|
|
| 580 |
else if (key.equals("regPunct"))
|
|
| 580 | 581 |
regPunct = value; |
| 581 |
else if (key == "regElision")
|
|
| 582 |
else if (key.equals("regElision"))
|
|
| 582 | 583 |
regElision = value; |
| 583 |
else if (key == "whitespaces")
|
|
| 584 |
else if (key.equals("whitespaces"))
|
|
| 584 | 585 |
whitespaces = value; |
| 585 |
else if (key == "punct_strong") // this is temporary
|
|
| 586 |
else if (key.equals("punct_strong")) // this is temporary
|
|
| 586 | 587 |
tmp_strong_punct = value; // this is temporary |
| 587 | 588 |
else |
| 588 | 589 |
System.out.println("MISSING TOKENIZER KEY: "+key);
|
| ... | ... | |
| 642 | 643 |
String tmp_strong_punct = null; |
| 643 | 644 |
IEclipsePreferences params = projectScope.getNode("Tokenizer");
|
| 644 | 645 |
try {
|
| 645 |
for (String key : params.childrenNames()) {
|
|
| 646 |
String[] keys = params.keys(); |
|
| 647 |
for (String key : keys) {
|
|
| 646 | 648 |
// Element param = (Element) params.item(i); |
| 647 | 649 |
// String key = param.getAttribute("name");
|
| 648 |
String value = params.get("value", "");
|
|
| 650 |
String value = params.get(key, "");
|
|
| 649 | 651 |
// if (value == null || value.length() == 0) value = param.getTextContent(); |
| 650 | 652 |
if (value.length() == 0) value = null; |
| 651 | 653 |
|
| 652 | 654 |
if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
|
| 653 | 655 |
|
| 654 |
if (key == "tag_all")
|
|
| 656 |
if (key.equals("tag_all"))
|
|
| 655 | 657 |
tag_all = value; |
| 656 |
else if (key == "enclitics")
|
|
| 658 |
else if (key.equals("enclitics"))
|
|
| 657 | 659 |
enclitics = value; |
| 658 |
else if (key == "encliticsFR")
|
|
| 660 |
else if (key.equals("encliticsFR"))
|
|
| 659 | 661 |
encliticsFR = value; |
| 660 |
else if (key == "div_tags")
|
|
| 662 |
else if (key.equals("div_tags"))
|
|
| 661 | 663 |
div_tags = value; |
| 662 |
else if (key == "q_tags")
|
|
| 664 |
else if (key.equals("q_tags"))
|
|
| 663 | 665 |
q_tags = value; |
| 664 |
else if (key == "extraword1_tags")
|
|
| 666 |
else if (key.equals("extraword1_tags"))
|
|
| 665 | 667 |
extraword1_tags = value; |
| 666 |
else if (key == "corr_tags_no_seg")
|
|
| 668 |
else if (key.equals("corr_tags_no_seg"))
|
|
| 667 | 669 |
corr_tags_no_seg = value; |
| 668 |
else if (key == "word_tags")
|
|
| 670 |
else if (key.equals("word_tags")) {
|
|
| 669 | 671 |
word_tags = value; |
| 670 |
else if (key == "intraword_tags") |
|
| 672 |
word_element_to_create = value; // FIXME for now |
|
| 673 |
} else if (key.equals("intraword_tags"))
|
|
| 671 | 674 |
intraword_tags = value; |
| 672 |
else if (key == "punct_quotes")
|
|
| 675 |
else if (key.equals("punct_quotes"))
|
|
| 673 | 676 |
punct_quotes = value; |
| 674 |
else if (key == "punct_strong1")
|
|
| 677 |
else if (key.equals("punct_strong1"))
|
|
| 675 | 678 |
punct_strong1 = value; |
| 676 |
else if (key == "punct_strong2")
|
|
| 679 |
else if (key.equals("punct_strong2"))
|
|
| 677 | 680 |
punct_strong2 = value; |
| 678 |
else if (key == "punct_paren_open1")
|
|
| 681 |
else if (key.equals("punct_paren_open1"))
|
|
| 679 | 682 |
punct_paren_open1 = value; |
| 680 |
else if (key == "punct_paren_open2")
|
|
| 683 |
else if (key.equals("punct_paren_open2"))
|
|
| 681 | 684 |
punct_paren_open2 = value; |
| 682 |
else if (key == "punct_paren_close1")
|
|
| 685 |
else if (key.equals("punct_paren_close1"))
|
|
| 683 | 686 |
punct_paren_close1 = value; |
| 684 |
else if (key == "punct_paren_close2")
|
|
| 687 |
else if (key.equals("punct_paren_close2"))
|
|
| 685 | 688 |
punct_paren_close2 = value; |
| 686 |
else if (key == "punct_weak")
|
|
| 689 |
else if (key.equals("punct_weak"))
|
|
| 687 | 690 |
punct_weak = value; |
| 688 |
else if (key == "entity")
|
|
| 691 |
else if (key.equals("entity"))
|
|
| 689 | 692 |
entity = value; |
| 690 |
else if (key == "seg_tags")
|
|
| 693 |
else if (key.equals("seg_tags"))
|
|
| 691 | 694 |
seg_tags = value; |
| 692 |
else if (key == "regPunct")
|
|
| 695 |
else if (key.equals("regPunct"))
|
|
| 693 | 696 |
regPunct = value; |
| 694 |
else if (key == "regElision")
|
|
| 697 |
else if (key.equals("regElision"))
|
|
| 695 | 698 |
regElision = value; |
| 696 |
else if (key == "whitespaces")
|
|
| 699 |
else if (key.equals("whitespaces"))
|
|
| 697 | 700 |
whitespaces = value; |
| 698 |
else if (key == "punct_strong") // this is temporary
|
|
| 701 |
else if (key.equals("punct_strong")) // this is temporary
|
|
| 699 | 702 |
tmp_strong_punct = value; // this is temporary |
| 700 |
else |
|
| 701 |
System.out.println("MISSING TOKENIZER KEY: "+key);
|
|
| 703 |
// else
|
|
| 704 |
// System.out.println("MISSING TOKENIZER KEY: "+key);
|
|
| 702 | 705 |
} |
| 703 | 706 |
|
| 704 | 707 |
//recombine |
| tmp/org.txm.groovy.core/src/java/org/txm/groovy/core/GroovyScriptedImportEngine.java (revision 1177) | ||
|---|---|---|
| 123 | 123 |
Log.info("\nEnd of Groovy import script: " + ExecTimer.stop()); //$NON-NLS-1$
|
| 124 | 124 |
monitor.worked(90); |
| 125 | 125 |
|
| 126 |
TokenizerClasses.reset(); |
|
| 126 |
TokenizerClasses.reset(); // restore default tokenizer classes
|
|
| 127 | 127 |
|
| 128 | 128 |
Object ready = binding.getVariable("readyToLoad"); //$NON-NLS-1$
|
| 129 | 129 |
if (ready != null && ready instanceof Boolean) {
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 1177) | ||
|---|---|---|
| 224 | 224 |
return false |
| 225 | 225 |
} else {
|
| 226 | 226 |
if (inputFile.delete()) {
|
| 227 |
FileCopy.copy(outputFile, new File(outputFile.getParent(), "copy.xml")) |
|
| 227 | 228 |
outputFile.renameTo(inputFile) |
| 228 | 229 |
} else {
|
| 229 | 230 |
println "Fail to encode $milestone in ${inputFile}: could not replace the file."
|
| ... | ... | |
| 358 | 359 |
File infile = f; |
| 359 | 360 |
File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName()); |
| 360 | 361 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang) |
| 361 |
if (wordTag.length() > 0) |
|
| 362 |
tokenizer.setWord_tags(wordTag); |
|
| 363 |
tokenizer.setWord_element_to_create(wordTag) |
|
| 364 | 362 |
|
| 365 | 363 |
if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
|
| 366 | 364 |
tokenizer.setOutSideTextTags(outSideTextTagsRegex) |
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy (revision 1177) | ||
|---|---|---|
| 33 | 33 |
corpusname = module.getProject().getName(); |
| 34 | 34 |
regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/"+corpusname.toLowerCase() |
| 35 | 35 |
|
| 36 |
wtag = module.getProject().getTokenizerParameter("wordelement", "w");
|
|
| 36 |
wtag = module.getProject().getTokenizerWordElement();
|
|
| 37 | 37 |
|
| 38 | 38 |
doNormalizeAttributeValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false"))
|
| 39 | 39 |
doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false"))
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 1177) | ||
|---|---|---|
| 59 | 59 |
|
| 60 | 60 |
/** The word_tags. */ |
| 61 | 61 |
String word_tags = TokenizerClasses.word_tags; |
| 62 |
String word_element_to_create = "w"
|
|
| 62 |
String word_element_to_create = TokenizerClasses.word_element_to_create;
|
|
| 63 | 63 |
Pattern reg_word_tags; |
| 64 | 64 |
/** The intraword_tags. */ |
| 65 | 65 |
String intraword_tags = TokenizerClasses.intraword_tags; |
| ... | ... | |
| 153 | 153 |
|
| 154 | 154 |
word_tags = TokenizerClasses.word_tags; |
| 155 | 155 |
reg_word_tags = Pattern.compile(word_tags); |
| 156 |
|
|
| 156 | 157 |
intraword_tags = TokenizerClasses.intraword_tags; |
| 157 | 158 |
punct_strong = TokenizerClasses.punct_strong; |
| 158 | 159 |
punct_all = TokenizerClasses.punct_all; |
Formats disponibles : Unified diff