Révision 3880

TXM/trunk/bundles/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 3880)
228 228

  
229 229
	public void reset() {
230 230
		
231
		if (debug) System.out.println("Reset TC");
232
		tag_all = "<[A-Za-z][^>]+>";
231
		if (debug) System.out.println("Reset TC"); //$NON-NLS-1$
232
		tag_all = "<[A-Za-z][^>]+>"; //$NON-NLS-1$
233 233

  
234
		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
235
		q_tags = "q|quote|said|item|stage|cit|label|heraldry";
236
		extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
237
		corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
238
		word_tags = "w";
239
		word_element_to_create = "w";
240
		intraword_tags = "c|ex|caesura";
241
		punct_quotes = "'‘’’";
242
		punct_strong1 = ".!?";
243
		punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
244
		punct_paren_open1 = "«";
245
		punct_paren_open2 = "``|\\(|\\[|\\{";
246
		punct_paren_close1 = "»";
247
		punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
248
		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
249
		entity = "&[^;]+;";
250
		seg_tags = "seg";
234
		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline"; //$NON-NLS-1$
235
		q_tags = "q|quote|said|item|stage|cit|label|heraldry"; //$NON-NLS-1$
236
		extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail"; //$NON-NLS-1$
237
		corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus"; //$NON-NLS-1$
238
		word_tags = "w"; //$NON-NLS-1$
239
		word_element_to_create = "w"; //$NON-NLS-1$
240
		intraword_tags = "c|ex|caesura"; //$NON-NLS-1$
241
		punct_quotes = "'‘’’"; //$NON-NLS-1$
242
		punct_strong1 = ".!?"; //$NON-NLS-1$
243
		punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|"; //$NON-NLS-1$
244
		punct_paren_open1 = "«"; //$NON-NLS-1$
245
		punct_paren_open2 = "``|\\(|\\[|\\{"; //$NON-NLS-1$
246
		punct_paren_close1 = "»"; //$NON-NLS-1$
247
		punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}"; //$NON-NLS-1$
248
		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈"; //$NON-NLS-1$
249
		entity = "&[^;]+;"; //$NON-NLS-1$
250
		seg_tags = "seg"; //$NON-NLS-1$
251 251

  
252
		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
253
		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
252
		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO //$NON-NLS-1$
253
		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR //$NON-NLS-1$
254 254

  
255 255
		/** The TT enclitics. */
256
		FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
257
		PClitic_fr = "[dcjlmnstyDCJLNMSTY][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
258
		FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
259
		PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
260
		FClitic_gl = "-la|-las|-lo|-los|-nos";
256
		FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t"; //$NON-NLS-1$
257
		PClitic_fr = "[dcjlmnstyDCJLNMSTY][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]"; //$NON-NLS-1$
258
		FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là"; //$NON-NLS-1$
259
		PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]"; //$NON-NLS-1$
260
		FClitic_gl = "-la|-las|-lo|-los|-nos"; //$NON-NLS-1$
261 261

  
262
		whitespaces = "[\\p{Z}\\p{C}]+";
263
		regElision = "['‘’]";
264
		regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
262
		whitespaces = "[\\p{Z}\\p{C}]+"; //$NON-NLS-1$
263
		regElision = "['‘’]"; //$NON-NLS-1$
264
		regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"; //$NON-NLS-1$
265 265

  
266 266
		recombine();
267 267
	}
......
269 269
	public void recombine() {
270 270

  
271 271
		if (corr_tags == null) {
272
			corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + "";
272
			corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + ""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
273 273
		}
274 274
		if (extraword_tags == null) {
275
			extraword_tags = "" + div_tags + "|" + q_tags + "|" + extraword1_tags + "";
275
			extraword_tags = "" + div_tags + "|" + q_tags + "|" + extraword1_tags + ""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
276 276
		}
277 277
		if (punct_strong == null) {
278
			punct_strong = "[" + punct_strong1 + "]|" + punct_strong2 + "";
278
			punct_strong = "[" + punct_strong1 + "]|" + punct_strong2 + ""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
279 279
		}
280 280
		if (punct_paren_open == null) {
281
			punct_paren_open = "" + punct_paren_open1 + "|" + punct_paren_open2 + "";
281
			punct_paren_open = "" + punct_paren_open1 + "|" + punct_paren_open2 + ""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
282 282
		}
283 283
		if (punct_paren_close == null) {
284
			punct_paren_close = "" + punct_paren_close1 + "|" + punct_paren_close2 + "";
284
			punct_paren_close = "" + punct_paren_close1 + "|" + punct_paren_close2 + ""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
285 285
		}
286 286
		if (punct_paren == null) {
287
			punct_paren = "" + punct_paren_open + "|" + punct_paren_close + "";
287
			punct_paren = "" + punct_paren_open + "|" + punct_paren_close + ""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
288 288
		}
289 289
		if (punct_all == null) {
290
			punct_all = "" + punct_strong + "|" + punct_paren + "|[" + punct_weak + "]";
290
			punct_all = "" + punct_strong + "|" + punct_paren + "|[" + punct_weak + "]"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
291 291
		}
292 292
		if (word_chars == null) {
293
			word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + "";
293
			word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + ""; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
294 294
		}
295 295
	}
296 296

  
......
304 304
	@Deprecated
305 305
	public void toDom(Document doc, Node parent) {
306 306
		
307
		Element tokenizer = doc.createElement("tokenizer");
308
		tokenizer.setAttribute("onlyThoseTests", "false");
307
		Element tokenizer = doc.createElement("tokenizer"); //$NON-NLS-1$
308
		tokenizer.setAttribute("onlyThoseTests", "false"); //$NON-NLS-1$ //$NON-NLS-2$
309 309
		parent.appendChild(tokenizer);
310 310

  
311 311
		// String tag_all = "<[^>]+>";
312
		Element p = doc.createElement("param");
313
		p.setAttribute("key", "tag_all");
312
		Element p = doc.createElement("param"); //$NON-NLS-1$
313
		p.setAttribute("key", "tag_all"); //$NON-NLS-1$ //$NON-NLS-2$
314 314
		p.setTextContent(tag_all);
315 315
		tokenizer.appendChild(p);
316 316
		// String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
317
		p = doc.createElement("param");
318
		p.setAttribute("key", "enclitics");
317
		p = doc.createElement("param"); //$NON-NLS-1$
318
		p.setAttribute("key", "enclitics"); //$NON-NLS-1$ //$NON-NLS-2$
319 319
		p.setTextContent(enclitics);
320 320
		tokenizer.appendChild(p);
321 321
		// String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
322
		p = doc.createElement("param");
323
		p.setAttribute("key", "encliticsFR");
322
		p = doc.createElement("param"); //$NON-NLS-1$
323
		p.setAttribute("key", "encliticsFR"); //$NON-NLS-1$ //$NON-NLS-2$
324 324
		p.setTextContent(encliticsFR);
325 325
		tokenizer.appendChild(p);
326 326
		// String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
327
		p = doc.createElement("param");
328
		p.setAttribute("key", "div_tags");
327
		p = doc.createElement("param"); //$NON-NLS-1$
328
		p.setAttribute("key", "div_tags"); //$NON-NLS-1$ //$NON-NLS-2$
329 329
		p.setTextContent(div_tags);
330 330
		tokenizer.appendChild(p);
331 331
		// String q_tags = "q|quote|item|stage";
332
		p = doc.createElement("param");
333
		p.setAttribute("key", "q_tags");
332
		p = doc.createElement("param"); //$NON-NLS-1$
333
		p.setAttribute("key", "q_tags"); //$NON-NLS-1$ //$NON-NLS-2$
334 334
		p.setTextContent(q_tags);
335 335
		tokenizer.appendChild(p);
336 336
		// String extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
337
		p = doc.createElement("param");
338
		p.setAttribute("key", "extraword_tags");
337
		p = doc.createElement("param"); //$NON-NLS-1$
338
		p.setAttribute("key", "extraword_tags"); //$NON-NLS-1$ //$NON-NLS-2$
339 339
		p.setTextContent(extraword_tags);
340 340
		tokenizer.appendChild(p);
341 341
		// String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
342
		p = doc.createElement("param");
343
		p.setAttribute("key", "enclitics");
342
		p = doc.createElement("param"); //$NON-NLS-1$
343
		p.setAttribute("key", "enclitics"); //$NON-NLS-1$ //$NON-NLS-2$
344 344
		p.setTextContent(enclitics);
345 345
		tokenizer.appendChild(p);
346 346
		// String corr_tags = "$corr_tags_no_seg|seg";
347
		p = doc.createElement("param");
348
		p.setAttribute("key", "corr_tags");
347
		p = doc.createElement("param"); //$NON-NLS-1$
348
		p.setAttribute("key", "corr_tags"); //$NON-NLS-1$ //$NON-NLS-2$
349 349
		p.setTextContent(corr_tags);
350 350
		tokenizer.appendChild(p);
351 351
		// String word_tags = "w|abbr|num";
352
		p = doc.createElement("param");
353
		p.setAttribute("key", "word_tags");
352
		p = doc.createElement("param"); //$NON-NLS-1$
353
		p.setAttribute("key", "word_tags"); //$NON-NLS-1$ //$NON-NLS-2$
354 354
		p.setTextContent(word_tags);
355 355
		tokenizer.appendChild(p);
356 356
		// String intraword_tags = "c|ex";
357
		p = doc.createElement("param");
358
		p.setAttribute("key", "intraword_tags");
357
		p = doc.createElement("param"); //$NON-NLS-1$
358
		p.setAttribute("key", "intraword_tags"); //$NON-NLS-1$ //$NON-NLS-2$
359 359
		p.setTextContent(intraword_tags);
360 360
		tokenizer.appendChild(p);
361 361

  
362 362
		// String punct_strong1 = ".!?";
363
		p = doc.createElement("param");
364
		p.setAttribute("key", "punct_strong1");
363
		p = doc.createElement("param"); //$NON-NLS-1$
364
		p.setAttribute("key", "punct_strong1"); //$NON-NLS-1$ //$NON-NLS-2$
365 365
		p.setTextContent(punct_strong1);
366 366
		tokenizer.appendChild(p);
367 367
		// String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
368
		p = doc.createElement("param");
369
		p.setAttribute("key", "punct_strong2");
368
		p = doc.createElement("param"); //$NON-NLS-1$
369
		p.setAttribute("key", "punct_strong2"); //$NON-NLS-1$ //$NON-NLS-2$
370 370
		p.setTextContent(punct_strong2);
371 371
		tokenizer.appendChild(p);
372 372
		// String punct_strong = "[$punct_strong1]|$punct_strong2";
373
		p = doc.createElement("param");
374
		p.setAttribute("key", "punct_strong");
373
		p = doc.createElement("param"); //$NON-NLS-1$
374
		p.setAttribute("key", "punct_strong"); //$NON-NLS-1$ //$NON-NLS-2$
375 375
		p.setTextContent(punct_strong);
376 376
		tokenizer.appendChild(p);
377 377
		// String punct_paren_open1 = "«";
378
		p = doc.createElement("param");
379
		p.setAttribute("key", "punct_paren_open1");
378
		p = doc.createElement("param"); //$NON-NLS-1$
379
		p.setAttribute("key", "punct_paren_open1"); //$NON-NLS-1$ //$NON-NLS-2$
380 380
		p.setTextContent(punct_paren_open1);
381 381
		tokenizer.appendChild(p);
382 382
		// String punct_paren_open2 = "<<|``|\\(|\\[|\\{";
383
		p = doc.createElement("param");
384
		p.setAttribute("key", "punct_paren_open2");
383
		p = doc.createElement("param"); //$NON-NLS-1$
384
		p.setAttribute("key", "punct_paren_open2"); //$NON-NLS-1$ //$NON-NLS-2$
385 385
		p.setTextContent(punct_paren_open2);
386 386
		tokenizer.appendChild(p);
387 387
		// String punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
388
		p = doc.createElement("param");
389
		p.setAttribute("key", "punct_paren_open");
388
		p = doc.createElement("param"); //$NON-NLS-1$
389
		p.setAttribute("key", "punct_paren_open"); //$NON-NLS-1$ //$NON-NLS-2$
390 390
		p.setTextContent(punct_paren_open);
391 391
		tokenizer.appendChild(p);
392 392
		// String punct_paren_close1 = "»";
393
		p = doc.createElement("param");
394
		p.setAttribute("key", "punct_paren_close1");
393
		p = doc.createElement("param"); //$NON-NLS-1$
394
		p.setAttribute("key", "punct_paren_close1"); //$NON-NLS-1$ //$NON-NLS-2$
395 395
		p.setTextContent(punct_paren_close1);
396 396
		tokenizer.appendChild(p);
397 397
		// String punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
398
		p = doc.createElement("param");
399
		p.setAttribute("key", "punct_paren_close2");
398
		p = doc.createElement("param"); //$NON-NLS-1$
399
		p.setAttribute("key", "punct_paren_close2"); //$NON-NLS-1$ //$NON-NLS-2$
400 400
		p.setTextContent(punct_paren_close2);
401 401
		tokenizer.appendChild(p);
402 402
		// String punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
403
		p = doc.createElement("param");
404
		p.setAttribute("key", "punct_paren_close");
403
		p = doc.createElement("param"); //$NON-NLS-1$
404
		p.setAttribute("key", "punct_paren_close"); //$NON-NLS-1$ //$NON-NLS-2$
405 405
		p.setTextContent(punct_paren_close);
406 406
		tokenizer.appendChild(p);
407 407
		// String punct_paren = "$punct_paren_open|$punct_paren_close";
408
		p = doc.createElement("param");
409
		p.setAttribute("key", "punct_paren");
408
		p = doc.createElement("param"); //$NON-NLS-1$
409
		p.setAttribute("key", "punct_paren"); //$NON-NLS-1$ //$NON-NLS-2$
410 410
		p.setTextContent(punct_paren);
411 411
		tokenizer.appendChild(p);
412 412
		// String punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
413
		p = doc.createElement("param");
414
		p.setAttribute("key", "punct_weak");
413
		p = doc.createElement("param"); //$NON-NLS-1$
414
		p.setAttribute("key", "punct_weak"); //$NON-NLS-1$ //$NON-NLS-2$
415 415
		p.setTextContent(punct_weak);
416 416
		tokenizer.appendChild(p);
417 417
		// String punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
418
		p = doc.createElement("param");
419
		p.setAttribute("key", "punct_all");
418
		p = doc.createElement("param"); //$NON-NLS-1$
419
		p.setAttribute("key", "punct_all"); //$NON-NLS-1$ //$NON-NLS-2$
420 420
		p.setTextContent(punct_all);
421 421
		tokenizer.appendChild(p);
422 422
		// String word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
423
		p = doc.createElement("param");
424
		p.setAttribute("key", "word_chars");
423
		p = doc.createElement("param"); //$NON-NLS-1$
424
		p.setAttribute("key", "word_chars"); //$NON-NLS-1$ //$NON-NLS-2$
425 425
		p.setTextContent(word_chars);
426 426
		tokenizer.appendChild(p);
427 427
		// whitespaces = "[\\p{Z}\\p{C}]+";
428
		p = doc.createElement("param");
429
		p.setAttribute("key", "whitespaces");
428
		p = doc.createElement("param"); //$NON-NLS-1$
429
		p.setAttribute("key", "whitespaces"); //$NON-NLS-1$ //$NON-NLS-2$
430 430
		p.setTextContent(whitespaces);
431 431
		tokenizer.appendChild(p);
432 432
		// regElision = "['‘’]"
433
		p = doc.createElement("param");
434
		p.setAttribute("key", "regElision");
433
		p = doc.createElement("param"); //$NON-NLS-1$
434
		p.setAttribute("key", "regElision"); //$NON-NLS-1$ //$NON-NLS-2$
435 435
		p.setTextContent(regElision);
436 436
		tokenizer.appendChild(p);
437 437
		// regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"
438
		p = doc.createElement("param");
439
		p.setAttribute("key", "regPunct");
438
		p = doc.createElement("param"); //$NON-NLS-1$
439
		p.setAttribute("key", "regPunct"); //$NON-NLS-1$ //$NON-NLS-2$
440 440
		p.setTextContent(regPunct);
441 441
		tokenizer.appendChild(p);
442 442
	}
......
445 445
	 * Dump.
446 446
	 */
447 447
	public void dump() {
448
		System.out.println("BRUT");
449
		System.out.println("tag_all = " + tag_all + "");
450
		System.out.println("enclitics = " + enclitics + "");
451
		System.out.println("encliticsFR = " + encliticsFR + "");
452
		System.out.println("div_tags = " + div_tags + "");
453
		System.out.println("q_tags = " + q_tags + "");
454
		System.out.println("extraword1_tags = " + extraword1_tags + "");
455
		System.out.println("corr_tags_no_seg = " + corr_tags_no_seg + "");
456
		System.out.println("word_tags = " + word_tags + "");
457
		System.out.println("intraword_tags = " + intraword_tags + "");
458
		System.out.println("punct_quotes = " + punct_quotes + "");
459
		System.out.println("punct_strong1 = " + punct_strong1 + "");
460
		System.out.println("punct_strong2 = " + punct_strong2 + "");
461
		System.out.println("punct_paren_open1 = " + punct_paren_open1 + "");
462
		System.out.println("punct_paren_open2 = " + punct_paren_open2 + "");
463
		System.out.println("punct_paren_close1 = " + punct_paren_close1 + "");
464
		System.out.println("punct_paren_close2 = " + punct_paren_close2 + "");
465
		System.out.println("punct_weak = " + punct_weak + "");
466
		System.out.println("entity = " + entity + "");
467
		System.out.println("seg_tags = " + seg_tags + "");
468
		System.out.println("COMBINED");
469
		System.out.println("corr_tags = " + corr_tags + "");
470
		System.out.println("extraword_tags = " + extraword_tags + "");
471
		System.out.println("punct_strong = " + punct_strong + "");
472
		System.out.println("punct_paren_open = " + punct_paren_open + "");
473
		System.out.println("punct_paren_close = " + punct_paren_close + "");
474
		System.out.println("punct_paren = " + punct_paren + "");
475
		System.out.println("punct_all = " + punct_all + "");
476
		System.out.println("word_chars = " + word_chars + "");
477
		System.out.println("whitespaces = " + whitespaces + "");
478
		System.out.println("regElision = " + regElision + "");
479
		System.out.println("regPunct = " + regPunct + "");
480
		System.out.println("TESTS:");
448
		System.out.println("BRUT"); //$NON-NLS-1$
449
		System.out.println("tag_all = " + tag_all + ""); //$NON-NLS-1$ //$NON-NLS-2$
450
		System.out.println("enclitics = " + enclitics + ""); //$NON-NLS-1$ //$NON-NLS-2$
451
		System.out.println("encliticsFR = " + encliticsFR + ""); //$NON-NLS-1$ //$NON-NLS-2$
452
		System.out.println("div_tags = " + div_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
453
		System.out.println("q_tags = " + q_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
454
		System.out.println("extraword1_tags = " + extraword1_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
455
		System.out.println("corr_tags_no_seg = " + corr_tags_no_seg + ""); //$NON-NLS-1$ //$NON-NLS-2$
456
		System.out.println("word_tags = " + word_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
457
		System.out.println("intraword_tags = " + intraword_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
458
		System.out.println("punct_quotes = " + punct_quotes + ""); //$NON-NLS-1$ //$NON-NLS-2$
459
		System.out.println("punct_strong1 = " + punct_strong1 + ""); //$NON-NLS-1$ //$NON-NLS-2$
460
		System.out.println("punct_strong2 = " + punct_strong2 + ""); //$NON-NLS-1$ //$NON-NLS-2$
461
		System.out.println("punct_paren_open1 = " + punct_paren_open1 + ""); //$NON-NLS-1$ //$NON-NLS-2$
462
		System.out.println("punct_paren_open2 = " + punct_paren_open2 + ""); //$NON-NLS-1$ //$NON-NLS-2$
463
		System.out.println("punct_paren_close1 = " + punct_paren_close1 + ""); //$NON-NLS-1$ //$NON-NLS-2$
464
		System.out.println("punct_paren_close2 = " + punct_paren_close2 + ""); //$NON-NLS-1$ //$NON-NLS-2$
465
		System.out.println("punct_weak = " + punct_weak + ""); //$NON-NLS-1$ //$NON-NLS-2$
466
		System.out.println("entity = " + entity + ""); //$NON-NLS-1$ //$NON-NLS-2$
467
		System.out.println("seg_tags = " + seg_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
468
		System.out.println("COMBINED"); //$NON-NLS-1$
469
		System.out.println("corr_tags = " + corr_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
470
		System.out.println("extraword_tags = " + extraword_tags + ""); //$NON-NLS-1$ //$NON-NLS-2$
471
		System.out.println("punct_strong = " + punct_strong + ""); //$NON-NLS-1$ //$NON-NLS-2$
472
		System.out.println("punct_paren_open = " + punct_paren_open + ""); //$NON-NLS-1$ //$NON-NLS-2$
473
		System.out.println("punct_paren_close = " + punct_paren_close + ""); //$NON-NLS-1$ //$NON-NLS-2$
474
		System.out.println("punct_paren = " + punct_paren + ""); //$NON-NLS-1$ //$NON-NLS-2$
475
		System.out.println("punct_all = " + punct_all + ""); //$NON-NLS-1$ //$NON-NLS-2$
476
		System.out.println("word_chars = " + word_chars + ""); //$NON-NLS-1$ //$NON-NLS-2$
477
		System.out.println("whitespaces = " + whitespaces + ""); //$NON-NLS-1$ //$NON-NLS-2$
478
		System.out.println("regElision = " + regElision + ""); //$NON-NLS-1$ //$NON-NLS-2$
479
		System.out.println("regPunct = " + regPunct + ""); //$NON-NLS-1$ //$NON-NLS-2$
480
		System.out.println("TESTS:"); //$NON-NLS-1$
481 481

  
482 482
		for (TTest test : tests) {
483
			System.out.println(" " + test + "");
483
			System.out.println(" " + test + ""); //$NON-NLS-1$ //$NON-NLS-2$
484 484
		}
485 485
	}
486 486

  
......
489 489
			Pattern.compile(tag_all);
490 490
		}
491 491
		catch (Exception e) {
492
			System.out.println("tag_all=" + tag_all + ": " + e);
492
			System.out.println("tag_all=" + tag_all + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
493 493
			return false;
494 494
		}
495 495

  
......
497 497
			Pattern.compile(enclitics);
498 498
		}
499 499
		catch (Exception e) {
500
			System.out.println("enclitics=" + enclitics + ": " + e);
500
			System.out.println("enclitics=" + enclitics + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
501 501
			return false;
502 502
		}
503 503

  
......
505 505
			Pattern.compile(encliticsFR);
506 506
		}
507 507
		catch (Exception e) {
508
			System.out.println("encliticsFR=" + encliticsFR + ": " + e);
508
			System.out.println("encliticsFR=" + encliticsFR + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
509 509
			return false;
510 510
		}
511 511

  
......
513 513
			Pattern.compile(div_tags);
514 514
		}
515 515
		catch (Exception e) {
516
			System.out.println("div_tags=" + div_tags + ": " + e);
516
			System.out.println("div_tags=" + div_tags + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
517 517
			return false;
518 518
		}
519 519

  
......
521 521
			Pattern.compile(q_tags);
522 522
		}
523 523
		catch (Exception e) {
524
			System.out.println("q_tags=" + q_tags + ": " + e);
524
			System.out.println("q_tags=" + q_tags + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
525 525
			return false;
526 526
		}
527 527

  
......
529 529
			Pattern.compile(extraword1_tags);
530 530
		}
531 531
		catch (Exception e) {
532
			System.out.println("extraword1_tags=" + extraword1_tags + ": " + e);
532
			System.out.println("extraword1_tags=" + extraword1_tags + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
533 533
			return false;
534 534
		}
535 535

  
......
537 537
			Pattern.compile(corr_tags_no_seg);
538 538
		}
539 539
		catch (Exception e) {
540
			System.out.println("corr_tags_no_seg=" + corr_tags_no_seg + ": " + e);
540
			System.out.println("corr_tags_no_seg=" + corr_tags_no_seg + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
541 541
			return false;
542 542
		}
543 543

  
......
545 545
			Pattern.compile(word_tags);
546 546
		}
547 547
		catch (Exception e) {
548
			System.out.println("word_tags=" + word_tags + ": " + e);
548
			System.out.println("word_tags=" + word_tags + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
549 549
			return false;
550 550
		}
551 551

  
......
553 553
			Pattern.compile(intraword_tags);
554 554
		}
555 555
		catch (Exception e) {
556
			System.out.println("intraword_tags=" + intraword_tags + ": " + e);
556
			System.out.println("intraword_tags=" + intraword_tags + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
557 557
			return false;
558 558
		}
559 559

  
......
561 561
			Pattern.compile(punct_quotes);
562 562
		}
563 563
		catch (Exception e) {
564
			System.out.println("punct_quotes=" + punct_quotes + ": " + e);
564
			System.out.println("punct_quotes=" + punct_quotes + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
565 565
			return false;
566 566
		}
567 567

  
......
569 569
			Pattern.compile(punct_strong1);
570 570
		}
571 571
		catch (Exception e) {
572
			System.out.println("punct_strong1=" + punct_strong1 + ": " + e);
572
			System.out.println("punct_strong1=" + punct_strong1 + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
573 573
			return false;
574 574
		}
575 575

  
......
577 577
			Pattern.compile(punct_strong2);
578 578
		}
579 579
		catch (Exception e) {
580
			System.out.println("punct_strong2=" + punct_strong2 + ": " + e);
580
			System.out.println("punct_strong2=" + punct_strong2 + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
581 581
			return false;
582 582
		}
583 583

  
......
585 585
			Pattern.compile(punct_paren_open1);
586 586
		}
587 587
		catch (Exception e) {
588
			System.out.println("punct_paren_open1=" + punct_paren_open1 + ": " + e);
588
			System.out.println("punct_paren_open1=" + punct_paren_open1 + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
589 589
			return false;
590 590
		}
591 591

  
......
593 593
			Pattern.compile(punct_paren_open2);
594 594
		}
595 595
		catch (Exception e) {
596
			System.out.println("punct_paren_open2=" + punct_paren_open2 + ": " + e);
596
			System.out.println("punct_paren_open2=" + punct_paren_open2 + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
597 597
			return false;
598 598
		}
599 599

  
......
601 601
			Pattern.compile(punct_paren_close1);
602 602
		}
603 603
		catch (Exception e) {
604
			System.out.println("punct_paren_close1=" + punct_paren_close1 + ": " + e);
604
			System.out.println("punct_paren_close1=" + punct_paren_close1 + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
605 605
			return false;
606 606
		}
607 607

  
......
609 609
			Pattern.compile(punct_paren_close2);
610 610
		}
611 611
		catch (Exception e) {
612
			System.out.println("punct_paren_close2=" + punct_paren_close2 + ": " + e);
612
			System.out.println("punct_paren_close2=" + punct_paren_close2 + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
613 613
			return false;
614 614
		}
615 615

  
......
617 617
			Pattern.compile(punct_weak);
618 618
		}
619 619
		catch (Exception e) {
620
			System.out.println("punct_weak=" + punct_weak + ": " + e);
620
			System.out.println("punct_weak=" + punct_weak + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
621 621
			return false;
622 622
		}
623 623

  
......
625 625
			Pattern.compile(entity);
626 626
		}
627 627
		catch (Exception e) {
628
			System.out.println("entity=" + entity + ": " + e);
628
			System.out.println("entity=" + entity + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
629 629
			return false;
630 630
		}
631 631

  
......
633 633
			Pattern.compile(seg_tags);
634 634
		}
635 635
		catch (Exception e) {
636
			System.out.println("seg_tags=" + seg_tags + ": " + e);
636
			System.out.println("seg_tags=" + seg_tags + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
637 637
			return false;
638 638
		}
639 639

  
......
641 641
			Pattern.compile(corr_tags);
642 642
		}
643 643
		catch (Exception e) {
644
			System.out.println("corr_tags: " + e);
644
			System.out.println("corr_tags: " + e); //$NON-NLS-1$
645 645
			return false;
646 646
		}
647 647

  
......
649 649
			Pattern.compile(extraword_tags);
650 650
		}
651 651
		catch (Exception e) {
652
			System.out.println("extraword_tags: " + e);
652
			System.out.println("extraword_tags: " + e); //$NON-NLS-1$
653 653
			return false;
654 654
		}
655 655

  
......
658 658
				Pattern.compile(punct_strong);
659 659
			}
660 660
		catch (Exception e) {
661
			System.out.println("punct_strong: " + e);
661
			System.out.println("punct_strong: " + e); //$NON-NLS-1$
662 662
			return false;
663 663
		}
664 664

  
......
666 666
			Pattern.compile(punct_paren_open);
667 667
		}
668 668
		catch (Exception e) {
669
			System.out.println("punct_paren_open: " + e);
669
			System.out.println("punct_paren_open: " + e); //$NON-NLS-1$
670 670
			return false;
671 671
		}
672 672

  
......
674 674
			Pattern.compile(punct_paren_close);
675 675
		}
676 676
		catch (Exception e) {
677
			System.out.println("punct_paren_close: " + e);
677
			System.out.println("punct_paren_close: " + e); //$NON-NLS-1$
678 678
			return false;
679 679
		}
680 680

  
......
682 682
			Pattern.compile(punct_paren);
683 683
		}
684 684
		catch (Exception e) {
685
			System.out.println("punct_paren: " + e);
685
			System.out.println("punct_paren: " + e); //$NON-NLS-1$
686 686
			return false;
687 687
		}
688 688

  
......
690 690
			Pattern.compile(punct_all);
691 691
		}
692 692
		catch (Exception e) {
693
			System.out.println("punct_all: " + e);
693
			System.out.println("punct_all: " + e); //$NON-NLS-1$
694 694
			return false;
695 695
		}
696 696

  
......
698 698
			Pattern.compile(word_chars);
699 699
		}
700 700
		catch (Exception e) {
701
			System.out.println("word_chars: " + e);
701
			System.out.println("word_chars: " + e); //$NON-NLS-1$
702 702
			return false;
703 703
		}
704 704

  
......
707 707
				Pattern.compile(regPunct);
708 708
			}
709 709
		catch (Exception e) {
710
			System.out.println("regPunct=" + regPunct + ": " + e);
710
			System.out.println("regPunct=" + regPunct + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
711 711
			return false;
712 712
		}
713 713

  
......
716 716
				Pattern.compile(regElision);
717 717
			}
718 718
		catch (Exception e) {
719
			System.out.println("regElision=" + regElision + ": " + e);
719
			System.out.println("regElision=" + regElision + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
720 720
			return false;
721 721
		}
722 722

  
......
725 725
				Pattern.compile(whitespaces);
726 726
			}
727 727
		catch (Exception e) {
728
			System.out.println("whitespaces=" + whitespaces + ": " + e);
728
			System.out.println("whitespaces=" + whitespaces + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
729 729
			return false;
730 730
		}
731 731

  
......
734 734
				Pattern.compile(test.getRegex());
735 735
			}
736 736
			catch (Exception e) {
737
				System.out.println("" + test.getRegex() + ": " + e);
737
				System.out.println("" + test.getRegex() + ": " + e); //$NON-NLS-1$ //$NON-NLS-2$
738 738
				return false;
739 739
			}
740 740
		}
......
751 751
	public boolean loadFromNode(Element tokenizerElement) {
752 752
		// load params
753 753
		String tmp_strong_punct = null;
754
		NodeList params = tokenizerElement.getElementsByTagName("param");
754
		NodeList params = tokenizerElement.getElementsByTagName("param"); //$NON-NLS-1$
755 755
		for (int i = 0; i < params.getLength(); i++) {
756 756
			Element param = (Element) params.item(i);
757
			String key = param.getAttribute("name");
758
			String value = param.getAttribute("value");
757
			String key = param.getAttribute("name"); //$NON-NLS-1$
758
			String value = param.getAttribute("value"); //$NON-NLS-1$
759 759
			if (value == null || value.length() == 0) value = param.getTextContent();
760 760
			if (value.length() == 0) value = null;
761 761

  
762
			if (debug) System.out.println(" Tokenizer parametrized with " + key + "=" + value + "");
762
			if (debug) System.out.println(" Tokenizer parametrized with " + key + "=" + value + ""); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
763 763

  
764
			if (key.equals("tag_all"))
764
			if (key.equals("tag_all")) //$NON-NLS-1$
765 765
				tag_all = value;
766
			else if (key.equals("enclitics"))
766
			else if (key.equals("enclitics")) //$NON-NLS-1$
767 767
				enclitics = value;
768
			else if (key.equals("encliticsFR"))
768
			else if (key.equals("encliticsFR")) //$NON-NLS-1$
769 769
				encliticsFR = value;
770
			else if (key.equals("div_tags"))
770
			else if (key.equals("div_tags")) //$NON-NLS-1$
771 771
				div_tags = value;
772
			else if (key.equals("q_tags"))
772
			else if (key.equals("q_tags")) //$NON-NLS-1$
773 773
				q_tags = value;
774
			else if (key.equals("extraword1_tags"))
774
			else if (key.equals("extraword1_tags")) //$NON-NLS-1$
775 775
				extraword1_tags = value;
776
			else if (key.equals("corr_tags_no_seg"))
776
			else if (key.equals("corr_tags_no_seg")) //$NON-NLS-1$
777 777
				corr_tags_no_seg = value;
778
			else if (key.equals("word_tags"))
778
			else if (key.equals("word_tags")) //$NON-NLS-1$
779 779
				word_tags = value;
780
			else if (key.equals("intraword_tags"))
780
			else if (key.equals("intraword_tags")) //$NON-NLS-1$
781 781
				intraword_tags = value;
782
			else if (key.equals("punct_quotes"))
782
			else if (key.equals("punct_quotes")) //$NON-NLS-1$
783 783
				punct_quotes = value;
784
			else if (key.equals("punct_strong1"))
784
			else if (key.equals("punct_strong1")) //$NON-NLS-1$
785 785
				punct_strong1 = value;
786
			else if (key.equals("punct_strong2"))
786
			else if (key.equals("punct_strong2")) //$NON-NLS-1$
787 787
				punct_strong2 = value;
788
			else if (key.equals("punct_paren_open1"))
788
			else if (key.equals("punct_paren_open1")) //$NON-NLS-1$
789 789
				punct_paren_open1 = value;
790
			else if (key.equals("punct_paren_open2"))
790
			else if (key.equals("punct_paren_open2")) //$NON-NLS-1$
791 791
				punct_paren_open2 = value;
792
			else if (key.equals("punct_paren_close1"))
792
			else if (key.equals("punct_paren_close1")) //$NON-NLS-1$
793 793
				punct_paren_close1 = value;
794
			else if (key.equals("punct_paren_close2"))
794
			else if (key.equals("punct_paren_close2")) //$NON-NLS-1$
795 795
				punct_paren_close2 = value;
796
			else if (key.equals("punct_weak"))
796
			else if (key.equals("punct_weak")) //$NON-NLS-1$
797 797
				punct_weak = value;
798
			else if (key.equals("entity"))
798
			else if (key.equals("entity")) //$NON-NLS-1$
799 799
				entity = value;
800
			else if (key.equals("seg_tags"))
800
			else if (key.equals("seg_tags")) //$NON-NLS-1$
801 801
				seg_tags = value;
802
			else if (key.equals("regPunct"))
802
			else if (key.equals("regPunct")) //$NON-NLS-1$
803 803
				regPunct = value;
804
			else if (key.equals("regElision"))
804
			else if (key.equals("regElision")) //$NON-NLS-1$
805 805
				regElision = value;
806
			else if (key.equals("whitespaces"))
806
			else if (key.equals("whitespaces")) //$NON-NLS-1$
807 807
				whitespaces = value;
808
			else if (key.equals("punct_strong")) // this is temporary
808
			else if (key.equals("punct_strong")) // this is temporary //$NON-NLS-1$
809 809
				tmp_strong_punct = value; // this is temporary
810 810
			else
811
				System.out.println("MISSING TOKENIZER KEY: " + key);
811
				System.out.println("MISSING TOKENIZER KEY: " + key); //$NON-NLS-1$
812 812
		}
813 813
		// recombine
814 814
		recombine();
815 815
		if (tmp_strong_punct != null) punct_strong = tmp_strong_punct;  // this is temporary
816 816

  
817
		String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests");
818
		if ("true".equals(shouldResetTests)) {
819
			System.out.println("Warning: tokenizer only using import parameters tests");
817
		String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests"); //$NON-NLS-1$
818
		if ("true".equals(shouldResetTests)) { //$NON-NLS-1$
819
			System.out.println("Warning: tokenizer only using import parameters tests"); //$NON-NLS-1$
820 820
			// tests = new ArrayList<>();
821 821
		}
822 822

  
823
		NodeList testsList = tokenizerElement.getElementsByTagName("test");
823
		NodeList testsList = tokenizerElement.getElementsByTagName("test"); //$NON-NLS-1$
824 824
		if (testsList.getLength() > 0) {
825 825
			// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
826 826
			for (int i = 0; i < testsList.getLength(); i++) {
......
862 862

  
863 863
	public boolean loadFromProject(ProjectScope projectScope) {
864 864
		// load params
865
		IEclipsePreferences params = projectScope.getNode("Tokenizer");
865
		IEclipsePreferences params = projectScope.getNode("Tokenizer"); //$NON-NLS-1$
866 866
		try {
867 867
			if (debug) System.out.println(Arrays.toString(params.keys()));
868 868
			if (debug) System.out.println(Arrays.toString(params.childrenNames()));
......
888 888
					String value = params.get(name, defaultValue);
889 889
					if (value != null) {
890 890
						try {
891
							if (debug) System.out.println(" Tokenizer parametrized with " + name + "=" + value);
891
							if (debug) System.out.println(" Tokenizer parametrized with " + name + "=" + value); //$NON-NLS-1$ //$NON-NLS-2$
892 892
							field.set(this, value);
893 893
						}
894 894
						catch (Exception e) {
......
901 901
			// recombine
902 902
			recombine();
903 903

  
904
			if (params.getBoolean("onlyThoseTests", false)) {
905
				System.out.println("Warning: tokenizer only using import parameters tests");
904
			if (params.getBoolean("onlyThoseTests", false)) { //$NON-NLS-1$
905
				System.out.println("Warning: tokenizer only using import parameters tests"); //$NON-NLS-1$
906 906
				// tests = new ArrayList<>();
907 907
			}
908 908

  
909
			org.osgi.service.prefs.Preferences testsList = params.node("tests");
909
			org.osgi.service.prefs.Preferences testsList = params.node("tests"); //$NON-NLS-1$
910 910
			String[] tests;
911 911
			tests = testsList.childrenNames();
912 912
			// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
......
914 914
				org.osgi.service.prefs.Preferences testdef = testsList.node(testname);
915 915

  
916 916
				// Element test = (Element) testsList.item(i);
917
				TTest t = new TTest(testdef.get("content", null), testdef.get("type", null),
918
						testdef.getInt("before", 0),
919
						testdef.getInt("hit", 0),
920
						testdef.getInt("after", 0));
917
				TTest t = new TTest(testdef.get("content", null), testdef.get("type", null), //$NON-NLS-1$ //$NON-NLS-2$
918
						testdef.getInt("before", 0), //$NON-NLS-1$
919
						testdef.getInt("hit", 0), //$NON-NLS-1$
920
						testdef.getInt("after", 0)); //$NON-NLS-1$
921 921
			}
922 922
			// System.out.println("Tests: "+tests);
923 923

  
TXM/trunk/bundles/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3880)
43 43
	
44 44
	String pclitics = null; // default behavior don't manage clitics
45 45
	
46
	Pattern reg_punct_other = Pattern.compile("\\p{P}");
46
	Pattern reg_punct_other = Pattern.compile("\\p{P}"); //$NON-NLS-1$
47 47
	
48 48
	/** The DEBUG. */
49 49
	public boolean DEBUG = false;
......
90 90
		this.tc = tc
91 91
		this.lang = tc.lang;
92 92
		if (lang != null)
93
			if (lang.startsWith("en")) {
93
			if (lang.startsWith("en")) { //$NON-NLS-1$
94 94
				fclitics = tc.FClitic_en;
95
			} else if (lang.startsWith("fr")) {
95
			} else if (lang.startsWith("fr")) { //$NON-NLS-1$
96 96
				fclitics = tc.FClitic_fr;
97 97
				pclitics = tc.PClitic_fr;
98
			} else if (lang.startsWith("gl")) {
98
			} else if (lang.startsWith("gl")) { //$NON-NLS-1$
99 99
				fclitics = tc.FClitic_gl;
100
			} else if (lang.startsWith("it")) {
100
			} else if (lang.startsWith("it")) { //$NON-NLS-1$
101 101
				pclitics = tc.PClitic_it;
102 102
			}
103 103
		
......
130 130
		}
131 131
	}
132 132
	
133
	public final static String WHITESPACE = " ";
133
	public final static String WHITESPACE = " "; //$NON-NLS-1$
134 134
	
135
	public final static String EMPTY = "";
135
	public final static String EMPTY = ""; //$NON-NLS-1$
136 136
	
137 137
	/**
138 138
	 * Process word.
......
142 142
		ArrayList<String> sresult = new ArrayList<String>()
143 143
		if (regSplitWhiteSpaces != null) {
144 144
			for (String s : regSplitWhiteSpaces.split(text)) {		// separate with unicode white spaces
145
				if (DEBUG){println "process $s"}
145
				if (DEBUG){println "process $s"} //$NON-NLS-1$
146 146
				sresult.addAll(iterate(s));
147 147
			}
148 148
		}
......
162 162
	protected ArrayList<String> iterate(String s) {
163 163
		ArrayList<String> result = new ArrayList<String>();
164 164
		while (s != null && s.length() > 0) {
165
			if (DEBUG){println "  > $s"}
165
			if (DEBUG){println "  > $s"} //$NON-NLS-1$
166 166
			s = standardChecks(result, s);
167 167
		}
168 168
		return result;
......
180 180
		
181 181
		for (TTest test : tc.tests) {
182 182
			if ((m = s =~ test.regex)) {
183
				if (DEBUG) {println "test : "+test.regex}
183
				if (DEBUG) {println "test : "+test.regex} //$NON-NLS-1$
184 184
				if (test.before > 0) {
185 185
					result.addAll(iterate(m[0][test.before]))
186 186
				}
......
194 194
		}
195 195
		
196 196
		if (fclitics != null && (m = s =~ regFClitics) ) {
197
			if (DEBUG) println "CLITIC found: $s ->"+ m
197
			if (DEBUG) println "CLITIC found: $s ->"+ m //$NON-NLS-1$
198 198
			result.addAll(iterate(m.group(1)))
199 199
			
200 200
			result.add(m.group(2));
201 201
			
202
			return "";
202
			return ""; //$NON-NLS-1$
203 203
		} else if (pclitics != null && (m = s =~ regPClitics) ) {
204
			if (DEBUG) println "PCLITIC found: $s ->"+ m
204
			if (DEBUG) println "PCLITIC found: $s ->"+ m //$NON-NLS-1$
205 205
			
206 206
			result.add(m.group(1));
207 207
			
208 208
			result.addAll(iterate(m.group(2)))
209 209
			
210
			return "";
210
			return ""; //$NON-NLS-1$
211 211
		} else if (regElision != null && (m = s =~ regElision) ) {
212
			if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
212
			if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
213 213
			//iterate(m.group(1))
214 214
			
215 215
			//			int sep = s.indexOf("'");
......
222 222
			
223 223
			result.addAll(iterate(m.group(2)))
224 224
			
225
			return "";
225
			return ""; //$NON-NLS-1$
226 226
		} else if (reg3pts != null && (m = s =~ reg3pts) )	{
227
			if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
227
			if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)} //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
228 228
			result.addAll(iterate(m.group(1)))
229 229
			
230
			result.add("...");
230
			result.add("..."); //$NON-NLS-1$
231 231
			
232 232
			return m.group(3);
233 233
		} else if (regPunct != null && (m = s =~ regPunct) ) {
234
			if(DEBUG){println "PUNCT '$regPunct' found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
234
			if(DEBUG){println "PUNCT '$regPunct' found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)} //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
235 235
			result.addAll(iterate(m.group(1)))
236 236
			
237 237
			result.add(m.group(2));
......
241 241
			//		if(DEBUG){println "Other found: "+s}
242 242
			result.add(s);
243 243
			
244
			return "";
244
			return ""; //$NON-NLS-1$
245 245
		}
246 246
	}
247 247
	
......
252 252
	 */
253 253
	public static void main(String[] args) {
254 254
		def tests = [
255
			["fr", "c'est un test."],
256
			["fr", "C'est un autre test."],
255
			["fr", "c'est un test."], //$NON-NLS-1$ //$NON-NLS-2$
256
			["fr", "C'est un autre test."], //$NON-NLS-1$ //$NON-NLS-2$
257 257
			[
258
				"fr",
259
				"C'est une version 1.2.3 un 01:12:12 test vers http://un.site.web.fr, fin."
258
				"fr", //$NON-NLS-1$
259
				"C'est une version 1.2.3 un 01:12:12 test vers http://un.site.web.fr, fin." //$NON-NLS-1$
260 260
			],
261
			["en", "This is a test."],
262
			["en", "It's a test."]
261
			["en", "This is a test."], //$NON-NLS-1$ //$NON-NLS-2$
262
			["en", "It's a test."] //$NON-NLS-1$ //$NON-NLS-2$
263 263
		]
264 264
		
265 265
		for (def d : tests) {
266 266
			String lang = d[0]
267 267
			String text = d[1]
268 268
			SimpleStringTokenizer tokenizer = new SimpleStringTokenizer(lang)
269
			println "Process: $text"
270
			println "Result : "+tokenizer.processText(text).collect{"<"+it+">"}
269
			println "Process: $text" //$NON-NLS-1$
270
			println "Result : "+tokenizer.processText(text).collect{"<"+it+">"} //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
271 271
		}
272 272
	}
273 273
}
TXM/trunk/bundles/org.txm.tokenizer.core/src/org/txm/tokenizer/TTest.java (revision 3880)
4 4
	
5 5
	String regex;
6 6
	
7
	String type = "w";
7
	String type = "w"; //$NON-NLS-1$
8 8
	
9 9
	int before = 1, hit = 2, after = 3;
10 10
	
......
42 42
	 */
43 43
	@Override
44 44
	public String toString() {
45
		return "[regex=" + regex + ", type=" + type + ", before=" + before + ", hit=" + hit + ", after=" + after + "]";
45
		return "[regex=" + regex + ", type=" + type + ", before=" + before + ", hit=" + hit + ", after=" + after + "]"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$
46 46
	}
47 47
}
48 48

  

Formats disponibles : Unified diff