Révision 1177
tmp/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/TokenizerSection.java (revision 1177) | ||
---|---|---|
303 | 303 |
textPunctsStrong.setText(project.getTokenizerParameter("punct_strong", TokenizerClasses.punct_strong)); |
304 | 304 |
textElisions.setText(project.getTokenizerParameter("regElision", TokenizerClasses.regElision)); |
305 | 305 |
|
306 |
String wtagElement = project.getTokenizerParameter("wordelement", "w");
|
|
306 |
String wtagElement = project.getTokenizerWordElement();
|
|
307 | 307 |
wordElementText.setText(wtagElement); |
308 | 308 |
|
309 | 309 |
boolean doTokenizeStep = project.getDoTokenizerStep(); |
... | ... | |
327 | 327 |
project.addTokenizerParameter("regPunct", textPuncts.getText()); //$NON-NLS-1$ |
328 | 328 |
project.addTokenizerParameter("punct_strong", textPunctsStrong.getText()); //$NON-NLS-1$ |
329 | 329 |
project.addTokenizerParameter("regElision", textElisions.getText()); //$NON-NLS-1$ |
330 |
project.addTokenizerParameter("wordelement", wordElementText.getText()); //$NON-NLS-1$
|
|
330 |
project.addTokenizerParameter("word_tags", wordElementText.getText()); //$NON-NLS-1$
|
|
331 | 331 |
project.addTokenizerParameter("doTokenizeStep", ""+doTokenizeStepButton.getSelection()); //$NON-NLS-1$ |
332 | 332 |
project.addTokenizerParameter("onlyThoseTests", "false"); //$NON-NLS-1$ |
333 | 333 |
//project.setTokenizerAdditionalTests(additionalTokenizerTests); //$NON-NLS-1$ |
tmp/org.txm.core/src/java/org/txm/objects/Project.java (revision 1177) | ||
---|---|---|
802 | 802 |
// } |
803 | 803 |
|
804 | 804 |
if (!rcpProject.getName().equals(pName)) { // project directory and project name must stay in sync |
805 |
IPath path = rcpProject.getProjectRelativePath(); |
|
806 |
String name = rcpProject.getName(); |
|
805 | 807 |
IPath destination = new Path(this.getName()); |
806 |
if (destination.toFile().exists()) { |
|
807 |
System.out.println("Error: could not rename project since one already existing with the same new name="+this.getName()); |
|
808 |
return false; |
|
808 |
|
|
809 |
IWorkspace rcpWorkspace = ResourcesPlugin.getWorkspace(); |
|
810 |
IProject p2 = rcpWorkspace.getRoot().getProject(this.getName()); |
|
811 |
if (p2.exists()) { |
|
812 |
//System.out.println("Error: could not rename project since one already existing with the same new name="+this.getName()); |
|
813 |
//return false; |
|
814 |
rcpProject = p2; |
|
815 |
} else { |
|
816 |
rcpProject.move(destination, true, null); |
|
809 | 817 |
} |
810 |
rcpProject.move(destination, true, null); |
|
811 | 818 |
} |
812 | 819 |
|
813 | 820 |
ImportEngines engines = Toolbox.getImportEngines(); |
... | ... | |
1007 | 1014 |
|
1008 | 1015 |
public String getTokenizerWordElement() { |
1009 | 1016 |
IEclipsePreferences params = this.getPreferencesScope().getNode("Tokenizer"); |
1010 |
return params.get("wordelement", "w");
|
|
1017 |
return params.get("word_tags", "w");
|
|
1011 | 1018 |
} |
1012 | 1019 |
|
1013 | 1020 |
public boolean getDoTokenizerStep() { |
tmp/org.txm.core/src/java/org/txm/scripts/importer/MileStoneProjection.groovy (revision 1177) | ||
---|---|---|
60 | 60 |
switch (event) { |
61 | 61 |
case XMLStreamConstants.START_ELEMENT: |
62 | 62 |
localname = parser.getLocalName() |
63 |
if (start && localname == mileStoneTag) {
|
|
63 |
if (start && localname.equals(mileStoneTag)) {
|
|
64 | 64 |
milestonesLength << mileStoneDistance; |
65 | 65 |
mileStoneDistance = 0 |
66 | 66 |
milestonesCounter++; |
67 |
} else if (start && localname == wordTag) {
|
|
67 |
} else if (start && localname.equals(wordTag)) {
|
|
68 | 68 |
mileStoneDistance++ |
69 |
} else if (localname == startTag) {
|
|
69 |
} else if (localname.equals(startTag)) {
|
|
70 | 70 |
start = true |
71 | 71 |
} |
72 | 72 |
break; |
... | ... | |
75 | 75 |
milestonesLength[milestonesCounter] = mileStoneDistance; |
76 | 76 |
parser.close() |
77 | 77 |
milestonesCounter = 0; |
78 |
//println milestonesLength |
|
78 | 79 |
} |
79 | 80 |
|
80 | 81 |
|
... | ... | |
82 | 83 |
public void processStartElement() { |
83 | 84 |
super.processStartElement(); |
84 | 85 |
|
85 |
if (start && localname == mileStoneTag) {
|
|
86 |
if (start && localname.equals(mileStoneTag)) {
|
|
86 | 87 |
mileStoneDistance = 0 |
87 | 88 |
mileStoneID = parser.getAttributeValue(null, "id") |
88 | 89 |
if (mileStoneID == null) mileStoneID = parser.getAttributeValue("xml", "id"); |
89 | 90 |
if (mileStoneID == null) mileStoneID = "0"; |
90 | 91 |
|
91 | 92 |
milestonesCounter++; |
92 |
} else if (start && localname == wordTag) {
|
|
93 |
} else if (start && localname.equals(wordTag)) {
|
|
93 | 94 |
// println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance |
94 | 95 |
writer.writeAttribute(msEndAttributeName, Integer.toString((milestonesLength[milestonesCounter] - mileStoneDistance - 1))) |
95 | 96 |
writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance)) |
96 | 97 |
writer.writeAttribute(msIdAttributeName, mileStoneID) |
97 | 98 |
|
98 | 99 |
mileStoneDistance++ |
99 |
} else if (localname == startTag) {
|
|
100 |
} else if (localname.equals(startTag)) {
|
|
100 | 101 |
start = true |
101 | 102 |
} |
102 | 103 |
} |
103 | 104 |
|
104 | 105 |
public static void main(String[] args) { |
105 |
File inputFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/tokenized/1538_MarotAdole.xml")
|
|
106 |
File outputFile = new File("/home/mdecorde/TEMP/tmp.xml")
|
|
106 |
File inputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/test.xml")
|
|
107 |
File outputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/result.xml")
|
|
107 | 108 |
|
108 |
MileStoneProjection msp = new MileStoneProjection(inputFile, "body", "w", "lb");
|
|
109 |
MileStoneProjection msp = new MileStoneProjection(inputFile, "text", "w", "lb");
|
|
109 | 110 |
println "Sucess: "+msp.process(outputFile) |
110 | 111 |
} |
111 | 112 |
} |
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 1177) | ||
---|---|---|
85 | 85 |
|
86 | 86 |
/** The word_tags. */ |
87 | 87 |
public static String word_tags = "w"; |
88 |
/** the element to create when a word is created */ |
|
89 |
public static String word_element_to_create = "w"; |
|
88 | 90 |
|
89 | 91 |
/** The intraword_tags. */ |
90 | 92 |
public static String intraword_tags = "c|ex|caesura"; |
... | ... | |
420 | 422 |
for (TTest test : tests) { |
421 | 423 |
System.out.println(" "+test+""); |
422 | 424 |
} |
423 |
|
|
424 | 425 |
} |
425 | 426 |
|
426 | 427 |
public static boolean isValid() { |
... | ... | |
538 | 539 |
|
539 | 540 |
if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+""); |
540 | 541 |
|
541 |
if (key == "tag_all")
|
|
542 |
if (key.equals("tag_all"))
|
|
542 | 543 |
tag_all = value; |
543 |
else if (key == "enclitics")
|
|
544 |
else if (key.equals("enclitics"))
|
|
544 | 545 |
enclitics = value; |
545 |
else if (key == "encliticsFR")
|
|
546 |
else if (key.equals("encliticsFR"))
|
|
546 | 547 |
encliticsFR = value; |
547 |
else if (key == "div_tags")
|
|
548 |
else if (key.equals("div_tags"))
|
|
548 | 549 |
div_tags = value; |
549 |
else if (key == "q_tags")
|
|
550 |
else if (key.equals("q_tags"))
|
|
550 | 551 |
q_tags = value; |
551 |
else if (key == "extraword1_tags")
|
|
552 |
else if (key.equals("extraword1_tags"))
|
|
552 | 553 |
extraword1_tags = value; |
553 |
else if (key == "corr_tags_no_seg")
|
|
554 |
else if (key.equals("corr_tags_no_seg"))
|
|
554 | 555 |
corr_tags_no_seg = value; |
555 |
else if (key == "word_tags")
|
|
556 |
else if (key.equals("word_tags"))
|
|
556 | 557 |
word_tags = value; |
557 |
else if (key == "intraword_tags")
|
|
558 |
else if (key.equals("intraword_tags"))
|
|
558 | 559 |
intraword_tags = value; |
559 |
else if (key == "punct_quotes")
|
|
560 |
else if (key.equals("punct_quotes"))
|
|
560 | 561 |
punct_quotes = value; |
561 |
else if (key == "punct_strong1")
|
|
562 |
else if (key.equals("punct_strong1"))
|
|
562 | 563 |
punct_strong1 = value; |
563 |
else if (key == "punct_strong2")
|
|
564 |
else if (key.equals("punct_strong2"))
|
|
564 | 565 |
punct_strong2 = value; |
565 |
else if (key == "punct_paren_open1")
|
|
566 |
else if (key.equals("punct_paren_open1"))
|
|
566 | 567 |
punct_paren_open1 = value; |
567 |
else if (key == "punct_paren_open2")
|
|
568 |
else if (key.equals("punct_paren_open2"))
|
|
568 | 569 |
punct_paren_open2 = value; |
569 |
else if (key == "punct_paren_close1")
|
|
570 |
else if (key.equals("punct_paren_close1"))
|
|
570 | 571 |
punct_paren_close1 = value; |
571 |
else if (key == "punct_paren_close2")
|
|
572 |
else if (key.equals("punct_paren_close2"))
|
|
572 | 573 |
punct_paren_close2 = value; |
573 |
else if (key == "punct_weak")
|
|
574 |
else if (key .equals("punct_weak"))
|
|
574 | 575 |
punct_weak = value; |
575 |
else if (key == "entity")
|
|
576 |
else if (key.equals("entity"))
|
|
576 | 577 |
entity = value; |
577 |
else if (key == "seg_tags")
|
|
578 |
else if (key.equals("seg_tags"))
|
|
578 | 579 |
seg_tags = value; |
579 |
else if (key == "regPunct")
|
|
580 |
else if (key.equals("regPunct"))
|
|
580 | 581 |
regPunct = value; |
581 |
else if (key == "regElision")
|
|
582 |
else if (key.equals("regElision"))
|
|
582 | 583 |
regElision = value; |
583 |
else if (key == "whitespaces")
|
|
584 |
else if (key.equals("whitespaces"))
|
|
584 | 585 |
whitespaces = value; |
585 |
else if (key == "punct_strong") // this is temporary
|
|
586 |
else if (key.equals("punct_strong")) // this is temporary
|
|
586 | 587 |
tmp_strong_punct = value; // this is temporary |
587 | 588 |
else |
588 | 589 |
System.out.println("MISSING TOKENIZER KEY: "+key); |
... | ... | |
642 | 643 |
String tmp_strong_punct = null; |
643 | 644 |
IEclipsePreferences params = projectScope.getNode("Tokenizer"); |
644 | 645 |
try { |
645 |
for (String key : params.childrenNames()) { |
|
646 |
String[] keys = params.keys(); |
|
647 |
for (String key : keys) { |
|
646 | 648 |
// Element param = (Element) params.item(i); |
647 | 649 |
// String key = param.getAttribute("name"); |
648 |
String value = params.get("value", "");
|
|
650 |
String value = params.get(key, "");
|
|
649 | 651 |
// if (value == null || value.length() == 0) value = param.getTextContent(); |
650 | 652 |
if (value.length() == 0) value = null; |
651 | 653 |
|
652 | 654 |
if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+""); |
653 | 655 |
|
654 |
if (key == "tag_all")
|
|
656 |
if (key.equals("tag_all"))
|
|
655 | 657 |
tag_all = value; |
656 |
else if (key == "enclitics")
|
|
658 |
else if (key.equals("enclitics"))
|
|
657 | 659 |
enclitics = value; |
658 |
else if (key == "encliticsFR")
|
|
660 |
else if (key.equals("encliticsFR"))
|
|
659 | 661 |
encliticsFR = value; |
660 |
else if (key == "div_tags")
|
|
662 |
else if (key.equals("div_tags"))
|
|
661 | 663 |
div_tags = value; |
662 |
else if (key == "q_tags")
|
|
664 |
else if (key.equals("q_tags"))
|
|
663 | 665 |
q_tags = value; |
664 |
else if (key == "extraword1_tags")
|
|
666 |
else if (key.equals("extraword1_tags"))
|
|
665 | 667 |
extraword1_tags = value; |
666 |
else if (key == "corr_tags_no_seg")
|
|
668 |
else if (key.equals("corr_tags_no_seg"))
|
|
667 | 669 |
corr_tags_no_seg = value; |
668 |
else if (key == "word_tags")
|
|
670 |
else if (key.equals("word_tags")) {
|
|
669 | 671 |
word_tags = value; |
670 |
else if (key == "intraword_tags") |
|
672 |
word_element_to_create = value; // FIXME for now |
|
673 |
} else if (key.equals("intraword_tags")) |
|
671 | 674 |
intraword_tags = value; |
672 |
else if (key == "punct_quotes")
|
|
675 |
else if (key.equals("punct_quotes"))
|
|
673 | 676 |
punct_quotes = value; |
674 |
else if (key == "punct_strong1")
|
|
677 |
else if (key.equals("punct_strong1"))
|
|
675 | 678 |
punct_strong1 = value; |
676 |
else if (key == "punct_strong2")
|
|
679 |
else if (key.equals("punct_strong2"))
|
|
677 | 680 |
punct_strong2 = value; |
678 |
else if (key == "punct_paren_open1")
|
|
681 |
else if (key.equals("punct_paren_open1"))
|
|
679 | 682 |
punct_paren_open1 = value; |
680 |
else if (key == "punct_paren_open2")
|
|
683 |
else if (key.equals("punct_paren_open2"))
|
|
681 | 684 |
punct_paren_open2 = value; |
682 |
else if (key == "punct_paren_close1")
|
|
685 |
else if (key.equals("punct_paren_close1"))
|
|
683 | 686 |
punct_paren_close1 = value; |
684 |
else if (key == "punct_paren_close2")
|
|
687 |
else if (key.equals("punct_paren_close2"))
|
|
685 | 688 |
punct_paren_close2 = value; |
686 |
else if (key == "punct_weak")
|
|
689 |
else if (key.equals("punct_weak"))
|
|
687 | 690 |
punct_weak = value; |
688 |
else if (key == "entity")
|
|
691 |
else if (key.equals("entity"))
|
|
689 | 692 |
entity = value; |
690 |
else if (key == "seg_tags")
|
|
693 |
else if (key.equals("seg_tags"))
|
|
691 | 694 |
seg_tags = value; |
692 |
else if (key == "regPunct")
|
|
695 |
else if (key.equals("regPunct"))
|
|
693 | 696 |
regPunct = value; |
694 |
else if (key == "regElision")
|
|
697 |
else if (key.equals("regElision"))
|
|
695 | 698 |
regElision = value; |
696 |
else if (key == "whitespaces")
|
|
699 |
else if (key.equals("whitespaces"))
|
|
697 | 700 |
whitespaces = value; |
698 |
else if (key == "punct_strong") // this is temporary
|
|
701 |
else if (key.equals("punct_strong")) // this is temporary
|
|
699 | 702 |
tmp_strong_punct = value; // this is temporary |
700 |
else |
|
701 |
System.out.println("MISSING TOKENIZER KEY: "+key); |
|
703 |
// else
|
|
704 |
// System.out.println("MISSING TOKENIZER KEY: "+key);
|
|
702 | 705 |
} |
703 | 706 |
|
704 | 707 |
//recombine |
tmp/org.txm.groovy.core/src/java/org/txm/groovy/core/GroovyScriptedImportEngine.java (revision 1177) | ||
---|---|---|
123 | 123 |
Log.info("\nEnd of Groovy import script: " + ExecTimer.stop()); //$NON-NLS-1$ |
124 | 124 |
monitor.worked(90); |
125 | 125 |
|
126 |
TokenizerClasses.reset(); |
|
126 |
TokenizerClasses.reset(); // restore default tokenizer classes
|
|
127 | 127 |
|
128 | 128 |
Object ready = binding.getVariable("readyToLoad"); //$NON-NLS-1$ |
129 | 129 |
if (ready != null && ready instanceof Boolean) { |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 1177) | ||
---|---|---|
224 | 224 |
return false |
225 | 225 |
} else { |
226 | 226 |
if (inputFile.delete()) { |
227 |
FileCopy.copy(outputFile, new File(outputFile.getParent(), "copy.xml")) |
|
227 | 228 |
outputFile.renameTo(inputFile) |
228 | 229 |
} else { |
229 | 230 |
println "Fail to encode $milestone in ${inputFile}: could not replace the file." |
... | ... | |
358 | 359 |
File infile = f; |
359 | 360 |
File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName()); |
360 | 361 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang) |
361 |
if (wordTag.length() > 0) |
|
362 |
tokenizer.setWord_tags(wordTag); |
|
363 |
tokenizer.setWord_element_to_create(wordTag) |
|
364 | 362 |
|
365 | 363 |
if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) { |
366 | 364 |
tokenizer.setOutSideTextTags(outSideTextTagsRegex) |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy (revision 1177) | ||
---|---|---|
33 | 33 |
corpusname = module.getProject().getName(); |
34 | 34 |
regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/"+corpusname.toLowerCase() |
35 | 35 |
|
36 |
wtag = module.getProject().getTokenizerParameter("wordelement", "w");
|
|
36 |
wtag = module.getProject().getTokenizerWordElement();
|
|
37 | 37 |
|
38 | 38 |
doNormalizeAttributeValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false")) |
39 | 39 |
doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false")) |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 1177) | ||
---|---|---|
59 | 59 |
|
60 | 60 |
/** The word_tags. */ |
61 | 61 |
String word_tags = TokenizerClasses.word_tags; |
62 |
String word_element_to_create = "w"
|
|
62 |
String word_element_to_create = TokenizerClasses.word_element_to_create;
|
|
63 | 63 |
Pattern reg_word_tags; |
64 | 64 |
/** The intraword_tags. */ |
65 | 65 |
String intraword_tags = TokenizerClasses.intraword_tags; |
... | ... | |
153 | 153 |
|
154 | 154 |
word_tags = TokenizerClasses.word_tags; |
155 | 155 |
reg_word_tags = Pattern.compile(word_tags); |
156 |
|
|
156 | 157 |
intraword_tags = TokenizerClasses.intraword_tags; |
157 | 158 |
punct_strong = TokenizerClasses.punct_strong; |
158 | 159 |
punct_all = TokenizerClasses.punct_all; |
Formats disponibles : Unified diff