Révision 3706

TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3706)
408 408
				if (outSideTextTagsAndKeepContentRegex != null && outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
409 409
					tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
410 410
				}
411
				
412
				if (noteRegex != null && noteRegex.trim().length() > 0) {
413
					tokenizer.setNote(noteRegex)
414
				}
415
				
411
							
416 412
				// tokenize !
417 413
				if (!tokenizer.process()) {
418 414
					println("Failed to process "+f)
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/ChunkTokenizerXml.groovy (revision 3706)
580 580
	}
581 581
	
582 582
	/**
583
	 * Set element content to NOT tokenize
584
	 *
585
	 * @param regexp
586
	 */
587
	public void setNote(String regexp) {
588
		this.note_content = regexp;
589
		this.reg_note_content = Pattern.compile(note_content);
590
	}
591
	
592
	/**
593 583
	 * Set the element to ignore but not their content
594 584
	 *
595 585
	 * @param regexp
......
695 685
		
696 686
		ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(inFile, lang)
697 687
		tokenizer.setRetokenize(false)
698
		tokenizer.setNote("note")
699 688
		//tokenizer.setOutSideTextTags("outsideToEdit")
700 689
		tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
701 690
		//tokenizer.setDEBUG false
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3706)
464 464
		this.outside_text_tags = regexp;
465 465
		this.reg_outside_text_tags = Pattern.compile(outside_text_tags);
466 466
	}
467
	
467
		
468 468
	/**
469
	 * Set element content to NOT tokenize
470
	 *
471
	 * @param regexp
472
	 */
473
	public void setNote(String regexp) {
474
		this.note_content = regexp;
475
		this.reg_note_content = Pattern.compile(note_content);
476
	}
477
	
478
	/**
479 469
	 * Set the element to ignore but not their content
480 470
	 *
481 471
	 * @param regexp
......
595 585
	 * @param str the str
596 586
	 * @return the list
597 587
	 */
598
	public List<String> tokenize(String str)
599
	{
588
	public List<String> tokenize(String str) {
600 589
		return str.tokenize()	// cut by whitespace
601 590
	}
602 591
	
......
605 594
	 *
606 595
	 * @param args the arguments
607 596
	 */
608
	public static void main(String[] args)
609
	{
597
	public static void main(String[] args) {
598
		
610 599
		//		File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
611 600
		//		Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
612 601
		//		File inputFile = new File(SVNDIR, "baye1.xml")
613 602
		//		File outputFile = new File(SVNDIR, "baye1-t.xml")
614 603
		//		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
615 604
		//		tokenizer.DEBUG = true
616
		//		tokenizer.setNote("note")
617 605
		//		//tokenizer.setOutSideTextTags("teiHeader")
618 606
		//		tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
619 607
		//		println outputFile.toString() + " : "+tokenizer.process();
......
686 674
		
687 675
		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inFile, outFile, lang)
688 676
		tokenizer.setRetokenize(false)
689
		tokenizer.setNote("note")
690 677
		//tokenizer.setOutSideTextTags("outsideToEdit")
691 678
		tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
692 679
		//tokenizer.setDEBUG false

Formats disponibles : Unified diff