Révision 3796

TXM/trunk/bundles/org.txm.annotation.core/src/org/txm/annotation/core/AnnotationEngine.java (revision 3796)
39 39
	 * @throws Exception
40 40
	 */
41 41
	public StringTokenizer getStringTokenizer(String lang) throws Exception {
42
		return new SimpleStringTokenizer(lang);
42
		return null;
43 43
	}
44 44
	
45 45
	/**
TXM/trunk/bundles/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3796)
142 142
		ArrayList<String> sresult = new ArrayList<String>()
143 143
		if (regSplitWhiteSpaces != null) {
144 144
			for (String s : regSplitWhiteSpaces.split(text)) {		// separate with unicode white spaces
145
				// if (DEBUG){println "process $s"}
145
				if (DEBUG){println "process $s"}
146 146
				sresult.addAll(iterate(s));
147 147
			}
148 148
		}
......
162 162
	protected ArrayList<String> iterate(String s) {
163 163
		ArrayList<String> result = new ArrayList<String>();
164 164
		while (s != null && s.length() > 0) {
165
			//	if (DEBUG){println "  > $s"}
165
			if (DEBUG){println "  > $s"}
166 166
			s = standardChecks(result, s);
167 167
		}
168 168
		return result;
......
194 194
		}
195 195
		
196 196
		if (fclitics != null && (m = s =~ regFClitics) ) {
197
			//	if (DEBUG) println "CLITIC found: $s ->"+ m
197
			if (DEBUG) println "CLITIC found: $s ->"+ m
198 198
			result.addAll(iterate(m.group(1)))
199 199
			
200 200
			result.add(m.group(2));
......
209 209
			
210 210
			return "";
211 211
		} else if (regElision != null && (m = s =~ regElision) ) {
212
			//	if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
212
			if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
213 213
			//iterate(m.group(1))
214 214
			
215 215
			//			int sep = s.indexOf("'");
......
224 224
			
225 225
			return "";
226 226
		} else if (reg3pts != null && (m = s =~ reg3pts) )	{
227
			//	if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
227
			if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
228 228
			result.addAll(iterate(m.group(1)))
229 229
			
230 230
			result.add("...");
231 231
			
232 232
			return m.group(3);
233 233
		} else if (regPunct != null && (m = s =~ regPunct) ) {
234
			if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
234
			if(DEBUG){println "PUNCT '$regPunct' found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
235 235
			result.addAll(iterate(m.group(1)))
236 236
			
237 237
			result.add(m.group(2));
TXM/trunk/bundles/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 3796)
45 45
import org.w3c.dom.NodeList;
46 46
import org.xml.sax.SAXException;
47 47

  
48
import cern.colt.Arrays;
49

  
48 50
/**
49 51
 * Holder for tokenizer rules regexp
50 52
 */
51 53
public class TokenizerClasses {
52
	
54

  
53 55
	public String lang;
54
	
56

  
55 57
	public TokenizerClasses(String lang) {
56 58
		if (lang != null) {
57 59
			this.lang = lang;
......
61 63
		}
62 64
		reset();
63 65
	}
64
	
66

  
65 67
	public TokenizerClasses() {
66 68
		this(Locale.getDefault().getCountry());
67 69
	}
68
	
70

  
69 71
	/**
70 72
	 * Helper
71 73
	 * 
......
78 80
		tc.lang = lang;
79 81
		return tc;
80 82
	}
81
	
83

  
82 84
	//// ROOTS ? ////
83 85
	public boolean debug = false;
84
	
86

  
85 87
	/** The tag_all. */
86 88
	public String tag_all = null;
87
	
89

  
88 90
	/** The Weblex enclitics. */
89 91
	public String enclitics = null;
90
	
92

  
91 93
	public String encliticsFR = null;
92
	
94

  
93 95
	/** The TT enclitics. */
94 96
	public String FClitic_en = null;
95
	
97

  
96 98
	public String PClitic_fr = null;
97
	
99

  
98 100
	public String FClitic_fr = null;
99
	
101

  
100 102
	public String PClitic_it = null;
101
	
103

  
102 104
	public String FClitic_gl = null;
103
	
105

  
104 106
	/** The div_tags. */
105 107
	public String div_tags = null;
106
	
108

  
107 109
	/** The q_tags. */
108 110
	public String q_tags = null;
109
	
111

  
110 112
	/** The extraword_tags. */
111 113
	public String extraword1_tags = null;
112
	
114

  
113 115
	/** The corr_tags_no_seg. */
114 116
	public String corr_tags_no_seg = null;
115
	
117

  
116 118
	/** The word_tags. */
117 119
	public String word_tags = null;
118
	
120

  
119 121
	/** the element to create when a word is created */
120 122
	public String word_element_to_create = null;
121
	
123

  
122 124
	/** The intraword_tags. */
123 125
	public String intraword_tags = null;
124
	
126

  
125 127
	/** The punct_quotes. */
126 128
	public String punct_quotes = null;
127
	
129

  
128 130
	/** The punct_strong1. */
129 131
	public String punct_strong1 = null;
130
	
132

  
131 133
	/** The punct_strong2. */
132 134
	public String punct_strong2 = null;
133
	
135

  
134 136
	/** The punct_paren_open1. */
135 137
	public String punct_paren_open1 = null;
136
	
138

  
137 139
	/** The punct_paren_open2. */
138 140
	public String punct_paren_open2 = null;
139
	
141

  
140 142
	/** The punct_paren_close1. */
141 143
	public String punct_paren_close1 = null;
142
	
144

  
143 145
	/** The punct_paren_close2. */
144 146
	public String punct_paren_close2 = null;
145
	
147

  
146 148
	/** The punct_weak. */
147 149
	public String punct_weak = null;
148
	
150

  
149 151
	public String entity = null;
150
	
152

  
151 153
	public String seg_tags = null;
152
	
154

  
153 155
	//// COMBINAISONS ////
154
	
156

  
155 157
	/** The corr_tags. */
156 158
	public String corr_tags = null;
157
	
159

  
158 160
	/** The extraword_tags. */
159 161
	public String extraword_tags = null;
160
	
162

  
161 163
	/** The punct_strong. */
162 164
	public String punct_strong = null;
163
	
165

  
164 166
	/** The punct_paren_open. */
165 167
	public String punct_paren_open = null;
166
	
168

  
167 169
	/** The punct_paren_close. */
168 170
	public String punct_paren_close = null;
169
	
171

  
170 172
	/** The punct_paren. */
171 173
	public String punct_paren = null;
172
	
174

  
173 175
	/** The punct_all. */
174 176
	public String punct_all = null;
175
	
177

  
176 178
	/** The word_chars. */
177 179
	public String word_chars = null;
178
	
180

  
179 181
	public TTest[] tests = {
180 182
			// new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
181 183
			// new TTest(/\A(.*)(-)()\Z/, "pon"),
......
185 187
			// de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma
186 188
			// barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
187 189
			// new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),
188
			
190

  
189 191
			// BFM ONLY
190 192
			// new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"),
191
			
193

  
192 194
			// TXM REFMAN ONLY
193 195
			// new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
194 196
			// new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
......
198 200
			// new TTest("^(.*)(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)(.*)$", "url", 1, 2, 5),
199 201
			// new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
200 202
			// new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),
201
			
203

  
202 204
			// new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
203 205
			// new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
204 206
			// new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
205 207
	};
206
			
208

  
207 209
	public String whitespaces = null;
208
	
210

  
209 211
	public String regElision = null;
210
	
212

  
211 213
	public String regPunct = null;
212
	
214

  
213 215
	// punct_strong1 = ".!?";
214 216
	// punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
215 217
	// punct_strong = "[$punct_strong1]|$punct_strong2";
......
223 225
	// punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
224 226
	// punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
225 227
	// word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
226
	
228

  
227 229
	public void reset() {
228
		tag_all = "<[A-Za-z][^>]+>";
229 230
		
231
		if (debug) System.out.println("Reset TC");
232
		tag_all = "<[A-Za-z][^>]+>";
233

  
230 234
		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
231 235
		q_tags = "q|quote|said|item|stage|cit|label|heraldry";
232 236
		extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
......
244 248
		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
245 249
		entity = "&[^;]+;";
246 250
		seg_tags = "seg";
247
		
251

  
248 252
		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
249 253
		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
250
		
254

  
251 255
		/** The TT enclitics. */
252 256
		FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
253 257
		PClitic_fr = "[dcjlmnstyDCJLNMSTY][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
254 258
		FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
255 259
		PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
256 260
		FClitic_gl = "-la|-las|-lo|-los|-nos";
257
		
261

  
258 262
		whitespaces = "[\\p{Z}\\p{C}]+";
259 263
		regElision = "['‘’]";
260 264
		regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
261
		
265

  
262 266
		recombine();
263 267
	}
264
	
268

  
265 269
	public void recombine() {
266
		
270

  
267 271
		if (corr_tags == null) {
268 272
			corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + "";
269 273
		}
......
289 293
			word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + "";
290 294
		}
291 295
	}
292
	
296

  
293 297
	/**
294 298
	 * To dom.
295 299
	 *
......
299 303
	 */
300 304
	@Deprecated
301 305
	public void toDom(Document doc, Node parent) {
306
		
302 307
		Element tokenizer = doc.createElement("tokenizer");
303 308
		tokenizer.setAttribute("onlyThoseTests", "false");
304 309
		parent.appendChild(tokenizer);
305
		
310

  
306 311
		// String tag_all = "<[^>]+>";
307 312
		Element p = doc.createElement("param");
308 313
		p.setAttribute("key", "tag_all");
......
353 358
		p.setAttribute("key", "intraword_tags");
354 359
		p.setTextContent(intraword_tags);
355 360
		tokenizer.appendChild(p);
356
		
361

  
357 362
		// String punct_strong1 = ".!?";
358 363
		p = doc.createElement("param");
359 364
		p.setAttribute("key", "punct_strong1");
......
435 440
		p.setTextContent(regPunct);
436 441
		tokenizer.appendChild(p);
437 442
	}
438
	
443

  
439 444
	/**
440 445
	 * Dump.
441 446
	 */
......
473 478
		System.out.println("regElision = " + regElision + "");
474 479
		System.out.println("regPunct = " + regPunct + "");
475 480
		System.out.println("TESTS:");
476
		
481

  
477 482
		for (TTest test : tests) {
478 483
			System.out.println(" " + test + "");
479 484
		}
480 485
	}
481
	
486

  
482 487
	public boolean isValid() {
483 488
		try {
484 489
			Pattern.compile(tag_all);
......
487 492
			System.out.println("tag_all=" + tag_all + ": " + e);
488 493
			return false;
489 494
		}
490
		
495

  
491 496
		try {
492 497
			Pattern.compile(enclitics);
493 498
		}
......
495 500
			System.out.println("enclitics=" + enclitics + ": " + e);
496 501
			return false;
497 502
		}
498
		
503

  
499 504
		try {
500 505
			Pattern.compile(encliticsFR);
501 506
		}
......
503 508
			System.out.println("encliticsFR=" + encliticsFR + ": " + e);
504 509
			return false;
505 510
		}
506
		
511

  
507 512
		try {
508 513
			Pattern.compile(div_tags);
509 514
		}
......
511 516
			System.out.println("div_tags=" + div_tags + ": " + e);
512 517
			return false;
513 518
		}
514
		
519

  
515 520
		try {
516 521
			Pattern.compile(q_tags);
517 522
		}
......
519 524
			System.out.println("q_tags=" + q_tags + ": " + e);
520 525
			return false;
521 526
		}
522
		
527

  
523 528
		try {
524 529
			Pattern.compile(extraword1_tags);
525 530
		}
......
527 532
			System.out.println("extraword1_tags=" + extraword1_tags + ": " + e);
528 533
			return false;
529 534
		}
530
		
535

  
531 536
		try {
532 537
			Pattern.compile(corr_tags_no_seg);
533 538
		}
......
535 540
			System.out.println("corr_tags_no_seg=" + corr_tags_no_seg + ": " + e);
536 541
			return false;
537 542
		}
538
		
543

  
539 544
		try {
540 545
			Pattern.compile(word_tags);
541 546
		}
......
543 548
			System.out.println("word_tags=" + word_tags + ": " + e);
544 549
			return false;
545 550
		}
546
		
551

  
547 552
		try {
548 553
			Pattern.compile(intraword_tags);
549 554
		}
......
551 556
			System.out.println("intraword_tags=" + intraword_tags + ": " + e);
552 557
			return false;
553 558
		}
554
		
559

  
555 560
		try {
556 561
			Pattern.compile(punct_quotes);
557 562
		}
......
559 564
			System.out.println("punct_quotes=" + punct_quotes + ": " + e);
560 565
			return false;
561 566
		}
562
		
567

  
563 568
		try {
564 569
			Pattern.compile(punct_strong1);
565 570
		}
......
567 572
			System.out.println("punct_strong1=" + punct_strong1 + ": " + e);
568 573
			return false;
569 574
		}
570
		
575

  
571 576
		try {
572 577
			Pattern.compile(punct_strong2);
573 578
		}
......
575 580
			System.out.println("punct_strong2=" + punct_strong2 + ": " + e);
576 581
			return false;
577 582
		}
578
		
583

  
579 584
		try {
580 585
			Pattern.compile(punct_paren_open1);
581 586
		}
......
583 588
			System.out.println("punct_paren_open1=" + punct_paren_open1 + ": " + e);
584 589
			return false;
585 590
		}
586
		
591

  
587 592
		try {
588 593
			Pattern.compile(punct_paren_open2);
589 594
		}
......
591 596
			System.out.println("punct_paren_open2=" + punct_paren_open2 + ": " + e);
592 597
			return false;
593 598
		}
594
		
599

  
595 600
		try {
596 601
			Pattern.compile(punct_paren_close1);
597 602
		}
......
599 604
			System.out.println("punct_paren_close1=" + punct_paren_close1 + ": " + e);
600 605
			return false;
601 606
		}
602
		
607

  
603 608
		try {
604 609
			Pattern.compile(punct_paren_close2);
605 610
		}
......
607 612
			System.out.println("punct_paren_close2=" + punct_paren_close2 + ": " + e);
608 613
			return false;
609 614
		}
610
		
615

  
611 616
		try {
612 617
			Pattern.compile(punct_weak);
613 618
		}
......
615 620
			System.out.println("punct_weak=" + punct_weak + ": " + e);
616 621
			return false;
617 622
		}
618
		
623

  
619 624
		try {
620 625
			Pattern.compile(entity);
621 626
		}
......
623 628
			System.out.println("entity=" + entity + ": " + e);
624 629
			return false;
625 630
		}
626
		
631

  
627 632
		try {
628 633
			Pattern.compile(seg_tags);
629 634
		}
......
631 636
			System.out.println("seg_tags=" + seg_tags + ": " + e);
632 637
			return false;
633 638
		}
634
		
639

  
635 640
		try {
636 641
			Pattern.compile(corr_tags);
637 642
		}
......
639 644
			System.out.println("corr_tags: " + e);
640 645
			return false;
641 646
		}
642
		
647

  
643 648
		try {
644 649
			Pattern.compile(extraword_tags);
645 650
		}
......
647 652
			System.out.println("extraword_tags: " + e);
648 653
			return false;
649 654
		}
650
		
655

  
651 656
		if (punct_strong != null)
652 657
			try {
653
			Pattern.compile(punct_strong);
658
				Pattern.compile(punct_strong);
654 659
			}
655
			catch (Exception e) {
660
		catch (Exception e) {
656 661
			System.out.println("punct_strong: " + e);
657 662
			return false;
658
			}
659
			
663
		}
664

  
660 665
		try {
661 666
			Pattern.compile(punct_paren_open);
662 667
		}
......
664 669
			System.out.println("punct_paren_open: " + e);
665 670
			return false;
666 671
		}
667
		
672

  
668 673
		try {
669 674
			Pattern.compile(punct_paren_close);
670 675
		}
......
672 677
			System.out.println("punct_paren_close: " + e);
673 678
			return false;
674 679
		}
675
		
680

  
676 681
		try {
677 682
			Pattern.compile(punct_paren);
678 683
		}
......
680 685
			System.out.println("punct_paren: " + e);
681 686
			return false;
682 687
		}
683
		
688

  
684 689
		try {
685 690
			Pattern.compile(punct_all);
686 691
		}
......
688 693
			System.out.println("punct_all: " + e);
689 694
			return false;
690 695
		}
691
		
696

  
692 697
		try {
693 698
			Pattern.compile(word_chars);
694 699
		}
......
696 701
			System.out.println("word_chars: " + e);
697 702
			return false;
698 703
		}
699
		
704

  
700 705
		if (regPunct != null)
701 706
			try {
702
			Pattern.compile(regPunct);
707
				Pattern.compile(regPunct);
703 708
			}
704
			catch (Exception e) {
709
		catch (Exception e) {
705 710
			System.out.println("regPunct=" + regPunct + ": " + e);
706 711
			return false;
707
			}
708
			
712
		}
713

  
709 714
		if (regElision != null)
710 715
			try {
711
			Pattern.compile(regElision);
716
				Pattern.compile(regElision);
712 717
			}
713
			catch (Exception e) {
718
		catch (Exception e) {
714 719
			System.out.println("regElision=" + regElision + ": " + e);
715 720
			return false;
716
			}
717
			
721
		}
722

  
718 723
		if (whitespaces != null)
719 724
			try {
720
			Pattern.compile(whitespaces);
725
				Pattern.compile(whitespaces);
721 726
			}
722
			catch (Exception e) {
727
		catch (Exception e) {
723 728
			System.out.println("whitespaces=" + whitespaces + ": " + e);
724 729
			return false;
725
			}
726
			
730
		}
731

  
727 732
		for (TTest test : tests) {
728 733
			try {
729 734
				Pattern.compile(test.getRegex());
......
735 740
		}
736 741
		return true;
737 742
	}
738
	
743

  
739 744
	/**
740 745
	 * old way used in TXM 0.7.9 AND in corpus 0.7.9 -> 0.8.0 restoration
741 746
	 * 
......
753 758
			String value = param.getAttribute("value");
754 759
			if (value == null || value.length() == 0) value = param.getTextContent();
755 760
			if (value.length() == 0) value = null;
756
			
761

  
757 762
			if (debug) System.out.println(" Tokenizer parametrized with " + key + "=" + value + "");
758
			
763

  
759 764
			if (key.equals("tag_all"))
760 765
				tag_all = value;
761 766
			else if (key.equals("enclitics"))
......
808 813
		// recombine
809 814
		recombine();
810 815
		if (tmp_strong_punct != null) punct_strong = tmp_strong_punct;  // this is temporary
811
		
816

  
812 817
		String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests");
813 818
		if ("true".equals(shouldResetTests)) {
814 819
			System.out.println("Warning: tokenizer only using import parameters tests");
815 820
			// tests = new ArrayList<>();
816 821
		}
817
		
822

  
818 823
		NodeList testsList = tokenizerElement.getElementsByTagName("test");
819 824
		if (testsList.getLength() > 0) {
820 825
			// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
......
828 833
			}
829 834
			// System.out.println("Tests: "+tests);
830 835
		}
831
		
836

  
832 837
		if (isValid())
833 838
			return true;
834 839
		else {
......
837 842
			return false;
838 843
		}
839 844
	}
840
	
845

  
841 846
	/**
842 847
	 * The main method.
843 848
	 *
......
847 852
	 * @throws ParserConfigurationException
848 853
	 */
849 854
	public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
850
		
855

  
851 856
		// Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
852 857
		// Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
853 858
		// Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
854 859
		// Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
855 860
		// System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
856 861
	}
857
	
862

  
858 863
	public boolean loadFromProject(ProjectScope projectScope) {
859 864
		// load params
860 865
		IEclipsePreferences params = projectScope.getNode("Tokenizer");
861
		
862 866
		try {
863
			
867
			if (debug) System.out.println(Arrays.toString(params.keys()));
868
			if (debug) System.out.println(Arrays.toString(params.childrenNames()));
869
		} catch (BackingStoreException e1) {
870
			// TODO Auto-generated catch block
871
			e1.printStackTrace();
872
		}
873
		try {
874

  
864 875
			Field[] fields = TokenizerClasses.class.getFields();
865 876
			for (Field field : fields) {
866 877
				int m = field.getModifiers();
867 878
				if (!Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) {
868 879
					String name = field.getName();
869
					String value = params.get(name, null);
880
					String defaultValue = null;
881
					try {
882
						if (field.get(this) != null) {
883
							defaultValue = field.get(this).toString();
884
						}
885
					} catch (Exception e) {
886
						e.printStackTrace();
887
					}
888
					String value = params.get(name, defaultValue);
870 889
					if (value != null) {
871 890
						try {
872 891
							if (debug) System.out.println(" Tokenizer parametrized with " + name + "=" + value);
......
878 897
					}
879 898
				}
880 899
			}
881
			// // old way, now using Reflection
882
			
883
			// String[] keys = params.keys();
884
			// for (String key : keys) {
885
			// // Element param = (Element) params.item(i);
886
			// // String key = param.getAttribute("name");
887
			// String value = params.get(key, "");
888
			// // if (value == null || value.length() == 0) value = param.getTextContent();
889
			// if (value.length() == 0) value = null;
890
			//
891
			// if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
892
			//
893
			// if (key.equals("tag_all"))
894
			// tag_all = value;
895
			// else if (key.equals("enclitics"))
896
			// enclitics = value;
897
			// else if (key.equals("encliticsFR"))
898
			// encliticsFR = value;
899
			// else if (key.equals("div_tags"))
900
			// div_tags = value;
901
			// else if (key.equals("q_tags"))
902
			// q_tags = value;
903
			// else if (key.equals("extraword1_tags"))
904
			// extraword1_tags = value;
905
			// else if (key.equals("corr_tags_no_seg"))
906
			// corr_tags_no_seg = value;
907
			// else if (key.equals("word_tags")) {
908
			// word_tags = value;
909
			// word_element_to_create = value; // FIXME for now
910
			// } else if (key.equals("intraword_tags"))
911
			// intraword_tags = value;
912
			// else if (key.equals("punct_quotes"))
913
			// punct_quotes = value;
914
			// else if (key.equals("punct_strong1"))
915
			// punct_strong1 = value;
916
			// else if (key.equals("punct_strong2"))
917
			// punct_strong2 = value;
918
			// else if (key.equals("punct_paren_open1"))
919
			// punct_paren_open1 = value;
920
			// else if (key.equals("punct_paren_open2"))
921
			// punct_paren_open2 = value;
922
			// else if (key.equals("punct_paren_close1"))
923
			// punct_paren_close1 = value;
924
			// else if (key.equals("punct_paren_close2"))
925
			// punct_paren_close2 = value;
926
			// else if (key.equals("punct_weak"))
927
			// punct_weak = value;
928
			// else if (key.equals("entity"))
929
			// entity = value;
930
			// else if (key.equals("seg_tags"))
931
			// seg_tags = value;
932
			// else if (key.equals("regPunct"))
933
			// regPunct = value;
934
			// else if (key.equals("regElision"))
935
			// regElision = value;
936
			// else if (key.equals("whitespaces"))
937
			// whitespaces = value;
938
			// else if (key.equals("punct_strong")) // this is temporary
939
			// tmp_strong_punct = value; // this is temporary
940
			//// else
941
			//// System.out.println("MISSING TOKENIZER KEY: "+key);
942
			// }
943
			
900

  
944 901
			// recombine
945 902
			recombine();
946
			
903

  
947 904
			if (params.getBoolean("onlyThoseTests", false)) {
948 905
				System.out.println("Warning: tokenizer only using import parameters tests");
949 906
				// tests = new ArrayList<>();
950 907
			}
951
			
908

  
952 909
			org.osgi.service.prefs.Preferences testsList = params.node("tests");
953 910
			String[] tests;
954 911
			tests = testsList.childrenNames();
955 912
			// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
956 913
			for (String testname : tests) {
957 914
				org.osgi.service.prefs.Preferences testdef = testsList.node(testname);
958
				
915

  
959 916
				// Element test = (Element) testsList.item(i);
960 917
				TTest t = new TTest(testdef.get("content", null), testdef.get("type", null),
961 918
						testdef.getInt("before", 0),
......
963 920
						testdef.getInt("after", 0));
964 921
			}
965 922
			// System.out.println("Tests: "+tests);
966
			
923

  
967 924
		}
968 925
		catch (BackingStoreException e) {
969 926
			// TODO Auto-generated catch block
970 927
			e.printStackTrace();
971 928
		}
972
		
929

  
973 930
		if (isValid()) {
974 931
			return true;
975 932
		}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3796)
430 430
				cpb.tick()
431 431
				File infile = f;
432 432
				File outfile = new File(module.getBinaryDirectory(), "tokenized/"+f.getName());
433
				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
433
				def tc = TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang)
434
				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, tc)
434 435
				if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ?
435 436
					String engineName = module.getProject().getImportParameters().node("annotate").get("engine", "TreeTagger")
436 437
					def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine(engineName)
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3796)
130 130
	 * @param outfile the outfile
131 131
	 */
132 132
	public SimpleTokenizerXml(File infile, File outfile, TokenizerClasses tc) {
133
		
133 134
		this.lang = tc.lang;
134
		this.stringTokenizer = new SimpleStringTokenizer(lang);
135
		this.stringTokenizer = new SimpleStringTokenizer(tc);
135 136
		
136 137
		word_tags = tc.word_tags;
137 138
		word_element_to_create = tc.word_element_to_create;

Formats disponibles : Unified diff