Révision 1177

tmp/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/TokenizerSection.java (revision 1177)
303 303
		textPunctsStrong.setText(project.getTokenizerParameter("punct_strong", TokenizerClasses.punct_strong));
304 304
		textElisions.setText(project.getTokenizerParameter("regElision", TokenizerClasses.regElision));
305 305

  
306
		String wtagElement = project.getTokenizerParameter("wordelement", "w");
306
		String wtagElement = project.getTokenizerWordElement();
307 307
		wordElementText.setText(wtagElement);
308 308

  
309 309
		boolean doTokenizeStep = project.getDoTokenizerStep();
......
327 327
		project.addTokenizerParameter("regPunct", textPuncts.getText()); //$NON-NLS-1$
328 328
		project.addTokenizerParameter("punct_strong", textPunctsStrong.getText()); //$NON-NLS-1$
329 329
		project.addTokenizerParameter("regElision", textElisions.getText()); //$NON-NLS-1$
330
		project.addTokenizerParameter("wordelement", wordElementText.getText()); //$NON-NLS-1$
330
		project.addTokenizerParameter("word_tags", wordElementText.getText()); //$NON-NLS-1$
331 331
		project.addTokenizerParameter("doTokenizeStep", ""+doTokenizeStepButton.getSelection()); //$NON-NLS-1$
332 332
		project.addTokenizerParameter("onlyThoseTests", "false"); //$NON-NLS-1$
333 333
		//project.setTokenizerAdditionalTests(additionalTokenizerTests); //$NON-NLS-1$
tmp/org.txm.core/src/java/org/txm/objects/Project.java (revision 1177)
802 802
		//		}
803 803

  
804 804
		if (!rcpProject.getName().equals(pName)) { // project directory and project name must stay in sync
805
			IPath path = rcpProject.getProjectRelativePath();
806
			String name = rcpProject.getName();
805 807
			IPath destination = new Path(this.getName());
806
			if (destination.toFile().exists()) {
807
				System.out.println("Error: could not rename project since one already existing with the same new name="+this.getName());
808
				return false;
808
			
809
			IWorkspace rcpWorkspace = ResourcesPlugin.getWorkspace();
810
			IProject p2 = rcpWorkspace.getRoot().getProject(this.getName());
811
			if (p2.exists()) {
812
				//System.out.println("Error: could not rename project since one already existing with the same new name="+this.getName());
813
				//return false;
814
				rcpProject = p2;
815
			} else {
816
				rcpProject.move(destination, true, null);
809 817
			}
810
			rcpProject.move(destination, true, null);
811 818
		}
812 819

  
813 820
		ImportEngines engines = Toolbox.getImportEngines();
......
1007 1014

  
1008 1015
	public String getTokenizerWordElement() {
1009 1016
		IEclipsePreferences params = this.getPreferencesScope().getNode("Tokenizer");
1010
		return params.get("wordelement", "w");
1017
		return params.get("word_tags", "w");
1011 1018
	}
1012 1019

  
1013 1020
	public boolean getDoTokenizerStep() {
tmp/org.txm.core/src/java/org/txm/scripts/importer/MileStoneProjection.groovy (revision 1177)
60 60
			switch (event) {
61 61
				case XMLStreamConstants.START_ELEMENT:
62 62
					localname = parser.getLocalName()
63
					if (start && localname == mileStoneTag) {
63
					if (start && localname.equals(mileStoneTag)) {
64 64
						milestonesLength << mileStoneDistance;
65 65
						mileStoneDistance = 0
66 66
						milestonesCounter++;
67
					} else if (start && localname == wordTag) {
67
					} else if (start && localname.equals(wordTag)) {
68 68
						mileStoneDistance++
69
					} else if (localname == startTag) {
69
					} else if (localname.equals(startTag)) {
70 70
						start = true
71 71
					}
72 72
					break;
......
75 75
		milestonesLength[milestonesCounter] = mileStoneDistance;
76 76
		parser.close()
77 77
		milestonesCounter = 0;
78
		//println milestonesLength
78 79
	}
79 80

  
80 81

  
......
82 83
	public void processStartElement() {
83 84
		super.processStartElement();
84 85

  
85
		if (start && localname == mileStoneTag) {
86
		if (start && localname.equals(mileStoneTag)) {
86 87
			mileStoneDistance = 0
87 88
			mileStoneID = parser.getAttributeValue(null, "id")
88 89
			if (mileStoneID == null) mileStoneID = parser.getAttributeValue("xml", "id");
89 90
			if (mileStoneID == null) mileStoneID = "0";
90 91
			
91 92
			milestonesCounter++;
92
		} else if (start && localname == wordTag) {
93
		} else if (start && localname.equals(wordTag)) {
93 94
			// println "end of $milestonesCounter len="+milestonesLength[milestonesCounter]+" dist="+mileStoneDistance
94 95
			writer.writeAttribute(msEndAttributeName, Integer.toString((milestonesLength[milestonesCounter] - mileStoneDistance - 1)))
95 96
			writer.writeAttribute(msStartAttributeName, Integer.toString(mileStoneDistance))
96 97
			writer.writeAttribute(msIdAttributeName, mileStoneID)
97 98

  
98 99
			mileStoneDistance++
99
		} else if (localname == startTag) {
100
		} else if (localname.equals(startTag)) {
100 101
			start = true
101 102
		}
102 103
	}
103 104

  
104 105
	public static void main(String[] args) {
105
		File inputFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/tokenized/1538_MarotAdole.xml")
106
		File outputFile = new File("/home/mdecorde/TEMP/tmp.xml")
106
		File inputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/test.xml")
107
		File outputFile = new File("/home/mdecorde/TXM-0.8.0-dev/corpora/XTZMILESTONES/tokenized/result.xml")
107 108

  
108
		MileStoneProjection msp = new MileStoneProjection(inputFile, "body", "w", "lb");
109
		MileStoneProjection msp = new MileStoneProjection(inputFile, "text", "w", "lb");
109 110
		println "Sucess: "+msp.process(outputFile)
110 111
	}
111 112
}
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 1177)
85 85

  
86 86
	/** The word_tags. */
87 87
	public static String word_tags = "w";
88
	/** the element to create when a word is created */
89
	public static String word_element_to_create = "w";
88 90

  
89 91
	/** The intraword_tags. */
90 92
	public static String intraword_tags = "c|ex|caesura";
......
420 422
		for (TTest test : tests) {
421 423
			System.out.println(" "+test+"");
422 424
		}
423

  
424 425
	}
425 426

  
426 427
	public static boolean isValid() {
......
538 539

  
539 540
			if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
540 541

  
541
			if (key == "tag_all")
542
			if (key.equals("tag_all"))
542 543
				tag_all = value;
543
			else if (key == "enclitics")
544
			else if (key.equals("enclitics"))
544 545
				enclitics = value;
545
			else if (key == "encliticsFR")
546
			else if (key.equals("encliticsFR"))
546 547
				encliticsFR = value;
547
			else if (key == "div_tags")
548
			else if (key.equals("div_tags"))
548 549
				div_tags = value;
549
			else if (key == "q_tags")
550
			else if (key.equals("q_tags"))
550 551
				q_tags = value;
551
			else if (key == "extraword1_tags")
552
			else if (key.equals("extraword1_tags"))
552 553
				extraword1_tags = value;
553
			else if (key == "corr_tags_no_seg")
554
			else if (key.equals("corr_tags_no_seg"))
554 555
				corr_tags_no_seg = value;
555
			else if (key == "word_tags")
556
			else if (key.equals("word_tags"))
556 557
				word_tags = value;
557
			else if (key == "intraword_tags")
558
			else if (key.equals("intraword_tags"))
558 559
				intraword_tags = value;
559
			else if (key == "punct_quotes")
560
			else if (key.equals("punct_quotes"))
560 561
				punct_quotes = value;
561
			else if (key == "punct_strong1")
562
			else if (key.equals("punct_strong1"))
562 563
				punct_strong1 = value;
563
			else if (key == "punct_strong2")
564
			else if (key.equals("punct_strong2"))
564 565
				punct_strong2 = value;
565
			else if (key == "punct_paren_open1")
566
			else if (key.equals("punct_paren_open1"))
566 567
				punct_paren_open1 = value;
567
			else if (key == "punct_paren_open2")
568
			else if (key.equals("punct_paren_open2"))
568 569
				punct_paren_open2 = value;
569
			else if (key == "punct_paren_close1")
570
			else if (key.equals("punct_paren_close1"))
570 571
				punct_paren_close1 = value;
571
			else if (key == "punct_paren_close2")
572
			else if (key.equals("punct_paren_close2"))
572 573
				punct_paren_close2 = value;
573
			else if (key == "punct_weak")
574
			else if (key .equals("punct_weak"))
574 575
				punct_weak = value;
575
			else if (key == "entity")
576
			else if (key.equals("entity"))
576 577
				entity = value;
577
			else if (key == "seg_tags")
578
			else if (key.equals("seg_tags"))
578 579
				seg_tags = value;
579
			else if (key == "regPunct")
580
			else if (key.equals("regPunct"))
580 581
				regPunct = value;
581
			else if (key == "regElision")
582
			else if (key.equals("regElision"))
582 583
				regElision = value;
583
			else if (key == "whitespaces")
584
			else if (key.equals("whitespaces"))
584 585
				whitespaces = value;
585
			else if (key == "punct_strong") // this is temporary
586
			else if (key.equals("punct_strong")) // this is temporary
586 587
				tmp_strong_punct = value; // this is temporary
587 588
			else
588 589
				System.out.println("MISSING TOKENIZER KEY: "+key);
......
642 643
		String tmp_strong_punct = null;
643 644
		IEclipsePreferences params = projectScope.getNode("Tokenizer");
644 645
		try {
645
			for (String key : params.childrenNames()) {
646
			String[] keys = params.keys();
647
			for (String key : keys) {
646 648
				//					Element param = (Element) params.item(i);
647 649
				//					String key = param.getAttribute("name");
648
				String value = params.get("value", "");
650
				String value = params.get(key, "");
649 651
				//					if (value == null || value.length() == 0) value = param.getTextContent();
650 652
				if (value.length() == 0) value = null;
651 653

  
652 654
				if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
653 655

  
654
				if (key == "tag_all")
656
				if (key.equals("tag_all"))
655 657
					tag_all = value;
656
				else if (key == "enclitics")
658
				else if (key.equals("enclitics"))
657 659
					enclitics = value;
658
				else if (key == "encliticsFR")
660
				else if (key.equals("encliticsFR"))
659 661
					encliticsFR = value;
660
				else if (key == "div_tags")
662
				else if (key.equals("div_tags"))
661 663
					div_tags = value;
662
				else if (key == "q_tags")
664
				else if (key.equals("q_tags"))
663 665
					q_tags = value;
664
				else if (key == "extraword1_tags")
666
				else if (key.equals("extraword1_tags"))
665 667
					extraword1_tags = value;
666
				else if (key == "corr_tags_no_seg")
668
				else if (key.equals("corr_tags_no_seg"))
667 669
					corr_tags_no_seg = value;
668
				else if (key == "word_tags")
670
				else if (key.equals("word_tags")) {
669 671
					word_tags = value;
670
				else if (key == "intraword_tags")
672
					word_element_to_create = value; // FIXME for now
673
				} else if (key.equals("intraword_tags"))
671 674
					intraword_tags = value;
672
				else if (key == "punct_quotes")
675
				else if (key.equals("punct_quotes"))
673 676
					punct_quotes = value;
674
				else if (key == "punct_strong1")
677
				else if (key.equals("punct_strong1"))
675 678
					punct_strong1 = value;
676
				else if (key == "punct_strong2")
679
				else if (key.equals("punct_strong2"))
677 680
					punct_strong2 = value;
678
				else if (key == "punct_paren_open1")
681
				else if (key.equals("punct_paren_open1"))
679 682
					punct_paren_open1 = value;
680
				else if (key == "punct_paren_open2")
683
				else if (key.equals("punct_paren_open2"))
681 684
					punct_paren_open2 = value;
682
				else if (key == "punct_paren_close1")
685
				else if (key.equals("punct_paren_close1"))
683 686
					punct_paren_close1 = value;
684
				else if (key == "punct_paren_close2")
687
				else if (key.equals("punct_paren_close2"))
685 688
					punct_paren_close2 = value;
686
				else if (key == "punct_weak")
689
				else if (key.equals("punct_weak"))
687 690
					punct_weak = value;
688
				else if (key == "entity")
691
				else if (key.equals("entity"))
689 692
					entity = value;
690
				else if (key == "seg_tags")
693
				else if (key.equals("seg_tags"))
691 694
					seg_tags = value;
692
				else if (key == "regPunct")
695
				else if (key.equals("regPunct"))
693 696
					regPunct = value;
694
				else if (key == "regElision")
697
				else if (key.equals("regElision"))
695 698
					regElision = value;
696
				else if (key == "whitespaces")
699
				else if (key.equals("whitespaces"))
697 700
					whitespaces = value;
698
				else if (key == "punct_strong") // this is temporary
701
				else if (key.equals("punct_strong")) // this is temporary
699 702
					tmp_strong_punct = value; // this is temporary
700
				else
701
					System.out.println("MISSING TOKENIZER KEY: "+key);
703
//				else
704
//					System.out.println("MISSING TOKENIZER KEY: "+key);
702 705
			}
703 706

  
704 707
			//recombine
tmp/org.txm.groovy.core/src/java/org/txm/groovy/core/GroovyScriptedImportEngine.java (revision 1177)
123 123
			Log.info("\nEnd of Groovy import script: " + ExecTimer.stop()); //$NON-NLS-1$
124 124
			monitor.worked(90);
125 125

  
126
			TokenizerClasses.reset();
126
			TokenizerClasses.reset(); // restore default tokenizer classes
127 127

  
128 128
			Object ready = binding.getVariable("readyToLoad"); //$NON-NLS-1$
129 129
			if (ready != null && ready instanceof Boolean) {
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 1177)
224 224
					return false
225 225
				} else {
226 226
					if (inputFile.delete()) {
227
						FileCopy.copy(outputFile, new File(outputFile.getParent(), "copy.xml"))
227 228
						outputFile.renameTo(inputFile)
228 229
					} else {
229 230
						println "Fail to encode $milestone in ${inputFile}: could not replace the file."
......
358 359
				File infile = f;
359 360
				File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
360 361
				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
361
				if (wordTag.length() > 0)
362
					tokenizer.setWord_tags(wordTag);
363
				tokenizer.setWord_element_to_create(wordTag)
364 362

  
365 363
				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
366 364
					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZCompiler.groovy (revision 1177)
33 33
		corpusname = module.getProject().getName();
34 34
		regPath = module.getBinaryDirectory().getAbsolutePath() + "/registry/"+corpusname.toLowerCase()
35 35
		
36
		wtag = module.getProject().getTokenizerParameter("wordelement", "w");
36
		wtag = module.getProject().getTokenizerWordElement();
37 37
		
38 38
		doNormalizeAttributeValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false"))
39 39
		doNormalizeAnaValues = "true".equals(module.getProject().getPreferencesScope().getNode("import").get(TBXPreferences.NORMALISEANAVALUES, "false"))
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 1177)
59 59

  
60 60
	/** The word_tags. */
61 61
	String word_tags = TokenizerClasses.word_tags;
62
	String word_element_to_create = "w"
62
	String word_element_to_create = TokenizerClasses.word_element_to_create;
63 63
	Pattern reg_word_tags;
64 64
	/** The intraword_tags. */
65 65
	String intraword_tags = TokenizerClasses.intraword_tags;
......
153 153

  
154 154
		word_tags = TokenizerClasses.word_tags;
155 155
		reg_word_tags = Pattern.compile(word_tags);
156
		
156 157
		intraword_tags = TokenizerClasses.intraword_tags;
157 158
		punct_strong = TokenizerClasses.punct_strong;
158 159
		punct_all = TokenizerClasses.punct_all;

Formats disponibles : Unified diff