Revision 479 tmp/org.txm.groovy.core/src/groovy/filters/Tokeniser/SimpleTokenizerXml.groovy

SimpleTokenizerXml.groovy (revision 479)
51 51

  
52 52
import javax.xml.stream.*
53 53

  
54
import org.txm.importer.PersonalNamespaceContext
54
import org.txm.importer.graal.PersonalNamespaceContext
55 55
import org.txm.tokenizer.TokenizerClasses
56 56

  
57 57
@CompileStatic
......
59 59

  
60 60
	/** The word_tags. */
61 61
	String word_tags = TokenizerClasses.word_tags;
62
	String word_element_to_create = "w"
62 63
	Pattern reg_word_tags;
63 64
	/** The intraword_tags. */
64 65
	String intraword_tags = TokenizerClasses.intraword_tags;
......
160 161
		this.outfile = outfile;
161 162
		this.infile = infile;
162 163
		this.filename = infile.getName();
163
		int index = filename.indexOf(".");
164
		int index = filename.lastIndexOf(".");
164 165
		if (index > 0) filename = filename.substring(0, index);
165 166

  
166 167
		String strRegElision = TokenizerClasses.regElision;
......
212 213
	 */
213 214
	public donothing(int event, Integer wordid) {
214 215
		if (event == XMLStreamConstants.START_ELEMENT ) {
216
			
215 217
			localname = parser.getLocalName();
218
			if (wordid != null) localname = word_element_to_create;
219
			
216 220
			if (prefix != null && prefix.length() > 0)
217 221
				writer.writeStartElement(prefix+":"+localname);
218 222
			else
......
253 257
					writer.writeAttribute(attname, parser.getAttributeValue(i))
254 258
			}
255 259

  
256
			if (wordid != null && !hasId && localname == "w")
260
			if (wordid != null && !hasId && localname == word_element_to_create)
257 261
				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
258 262

  
259
			//if (!hasType && localname == "w") writer.writeAttribute("type", "w");
260

  
261
			if (!hasN && localname == "w")
263
			if (!hasN && localname == word_element_to_create)
262 264
				writer.writeAttribute("n", ""+wordcount);
263 265

  
264 266
			writer.writeCharacters("\n");
......
390 392
						//println "Found pretagged word";
391 393
						wordcount++;
392 394
						donothing(event, wordcount);
393
						ignorecontent = true;
395
						//ignorecontent = true;
394 396
						insideword = true;
395 397
					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
396 398
						// ignore the tag only
......
425 427
						donothing(event, null);
426 428
					}
427 429
				} else if (event == XMLStreamConstants.CHARACTERS) {
428
					if (ignorecontent) {
430
					if (ignorecontent || insideword) {
429 431
						//println " dont tokenize chars: "+parser.getText().trim();
430 432
						donothing(event, null);
431 433
					} else {
......
553 555
			iterate(m.group(1))
554 556
			
555 557
			wordcount++;
556
			writer.writeStartElement("w");
558
			writer.writeStartElement(word_element_to_create);
557 559
			writeWordAttributes();// id
558 560
			//writer.writeAttribute("type", "w");
559 561
			writer.writeCharacters(m.group(2));
......
565 567
			if (DEBUG) println "PCLITIC found: $s ->"+ m
566 568
			
567 569
			wordcount++;
568
			writer.writeStartElement("w");
570
			writer.writeStartElement(word_element_to_create);
569 571
			writeWordAttributes();// id
570 572
			//writer.writeAttribute("type", "w");
571 573
			writer.writeCharacters(m.group(1));
......
586 588
//				sep = s.indexOf("‘");
587 589

  
588 590
			wordcount++;
589
			writer.writeStartElement("w");
591
			writer.writeStartElement(word_element_to_create);
590 592
			writeWordAttributes();// id
591 593
			//writer.writeAttribute("type", "w");
592 594
			writer.writeCharacters(m.group(1));
......
601 603
			iterate(m.group(1))
602 604

  
603 605
			wordcount++;
604
			writer.writeStartElement("w");
606
			writer.writeStartElement(word_element_to_create);
605 607
			writeWordAttributes();// id
606 608
			//writer.writeAttribute("type","pon");
607 609
			writer.writeCharacters("...");
......
614 616
			iterate(m.group(1));
615 617

  
616 618
			wordcount++;
617
			writer.writeStartElement("w");
619
			writer.writeStartElement(word_element_to_create);
618 620
			writeWordAttributes();// id
619 621
			//writer.writeAttribute("type","pon");
620 622
			writer.writeCharacters(m.group(2));
......
625 627
		} else {
626 628
	//		if(DEBUG){println "Other found: "+s}
627 629
			wordcount++;
628
			writer.writeStartElement("w");
630
			writer.writeStartElement(word_element_to_create);
629 631
			writeWordAttributes();// id
630 632
//			if (reg_punct_other.matcher(s).matches())
631 633
//				writer.writeAttribute("type","pon");
......
673 675
	 */
674 676
	public static void main(String[] args)
675 677
	{
676
		File SVNDIR = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer");
678
		File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
677 679
		Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
678
		File inputFile = new File(SVNDIR, "test1.xml")
679
		File outputFile = new File(SVNDIR, "test1-t.xml")
680
		File inputFile = new File(SVNDIR, "baye1.xml")
681
		File outputFile = new File(SVNDIR, "baye1-t.xml")
680 682
		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
681 683
		tokenizer.DEBUG = true
682 684
		tokenizer.setNote("note")
683
		tokenizer.setOutSideTextTags("outside")
684
		tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
685
		//tokenizer.setOutSideTextTags("teiHeader")
686
		tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
685 687
		println outputFile.toString() + " : "+tokenizer.process();
686 688
		// FIXME: tests UTF-8 XML ranges
687 689
		//		File inputDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")

Also available in: Unified diff