Révision 3003

tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 3003)
187 187
			File tfile = new File(tokenizedDir, pfile.getName())
188 188
			try {
189 189
				TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
190
				tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
190 191
				if (!tokenizer.process()) {
191 192
					println("Failed to tokenize "+pfile)
192 193
				}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/TranscriberTokenizer.groovy (revision 3003)
108 108
//			}
109 109

  
110 110
			//test events
111
			if (s.startsWith("^^")) {
111
			if (s.startsWith("^^") && s.length() > 2) {
112 112
				event += "#orth";			
113 113
				s = s.substring(2);
114 114
			}
115
			if (s.startsWith("*")) {
115
			if (s.startsWith("*") && s.length() > 1) {
116 116
				event += "#corr";
117 117
				s = s.substring(1);
118 118
			}
......
121 121
				event = event.substring(1);//remove '|'
122 122
			
123 123
			// test audio
124
			if (s.contains("(") || s.contains(")")) // contains ( or )
124
			if ((s.contains("(") || s.contains(")")) && && s.length() > 2) // contains ( or )
125 125
			{
126 126
				if (s.contains("(") ^ s.contains(")")) // ert(ert XOR ert)ert
127 127
				{
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3003)
331 331
		new File(module.getBinaryDirectory(),"tokenized").deleteDir()
332 332
		new File(module.getBinaryDirectory(),"tokenized").mkdir()
333 333
		
334
		boolean retokenize = "true" == module.getProject().getTokenizerParameter("doRetokenizeStep", "false")
334 335
		String outSideTextTagsRegex = "";
335 336
		String outSideTextTagsAndKeepContentRegex = "";
336 337
		String noteRegex = "";
......
380 381
				File infile = f;
381 382
				File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
382 383
				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
383
				
384
				tokenizer.setRetokenize(retokenize)
384 385
				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
385 386
					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
386 387
				}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3003)
51 51

  
52 52
import javax.xml.stream.*
53 53

  
54
import org.txm.scripts.importer.graal.PersonalNamespaceContext
54
import org.txm.importer.PersonalNamespaceContext
55 55
import org.txm.tokenizer.TokenizerClasses
56 56

  
57 57
@CompileStatic
58 58
public class SimpleTokenizerXml {
59

  
59
	
60
	boolean retokenize = false
61
	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
62
	
60 63
	/** The word_tags. */
61 64
	String word_tags = TokenizerClasses.word_tags;
62 65
	String word_element_to_create = TokenizerClasses.word_element_to_create;
63 66
	Pattern reg_word_tags;
64 67
	/** The intraword_tags. */
65 68
	String intraword_tags = TokenizerClasses.intraword_tags;
66

  
69
	
67 70
	/** The punct_strong. */
68 71
	String punct_strong = TokenizerClasses.punct_strong;
69

  
72
	
70 73
	/** The punct_all. */
71 74
	String punct_all = TokenizerClasses.punct_all;
72

  
75
	
73 76
	/** The word_chars. */
74 77
	String word_chars = TokenizerClasses.word_chars;
75

  
78
	
76 79
	String fclitics = null; // default behavior don't manage clitics
77 80
	String pclitics = null; // default behavior don't manage clitics
78

  
81
	
79 82
	/** The outside_text_tags_ignore_content. */
80 83
	String note_content = null;
81 84
	String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
82 85
	String outside_text_tags = null // tag and content removed
83 86
	String startTag = null
84 87
	Pattern reg_note_content;
85
	Pattern reg_outside_text_tags_keep_content; 
88
	Pattern reg_outside_text_tags_keep_content;
86 89
	Pattern reg_outside_text_tags;
87 90
	Pattern reg_startTag;
88 91
	
89 92
	Pattern reg_punct_other = Pattern.compile("\\p{P}")
90

  
93
	
91 94
	/** The DEBUG. */
92 95
	public boolean DEBUG = false;
93

  
96
	
94 97
	/** The outfile. */
95 98
	File outfile;
96

  
99
	
97 100
	/** The infile. */
98 101
	File infile;
99

  
102
	
100 103
	String lang;
101

  
104
	
102 105
	/** The buffer. */
103 106
	StringBuffer buffer;
104

  
107
	
105 108
	/** The writer. */
106 109
	XMLStreamWriter writer;
107 110
	BufferedOutputStream output;
108

  
111
	
109 112
	/** The parser. */
110 113
	XMLStreamReader parser
111

  
114
	
112 115
	/** The localname. */
113 116
	String localname;
114

  
117
	
115 118
	/** The prefix. */
116 119
	String prefix;
117 120
	String filename;
118

  
121
	
119 122
	def regElision = null;
120 123
	def reg3pts = null;
121 124
	def regPunct;
......
126 129
	Pattern regLN;
127 130
	Pattern regCTRL;
128 131
	Pattern regSplitWhiteSpaces;
129

  
132
	
130 133
	public SimpleTokenizerXml(File infile, File outfile) {
131 134
		this(infile, outfile, "");
132 135
	}
133

  
136
	
134 137
	/**
135 138
	 * Instantiates a new simple tokenizer xml.
136 139
	 *
......
150 153
			} else if (lang.startsWith("it")) {
151 154
				pclitics = TokenizerClasses.PClitic_it
152 155
			}
153

  
156
		
154 157
		word_tags = TokenizerClasses.word_tags;
155 158
		reg_word_tags = Pattern.compile(word_tags);
156 159
		
......
158 161
		punct_strong = TokenizerClasses.punct_strong;
159 162
		punct_all = TokenizerClasses.punct_all;
160 163
		word_chars = TokenizerClasses.word_chars;
161

  
164
		
162 165
		this.outfile = outfile;
163 166
		this.infile = infile;
164 167
		this.filename = infile.getName();
165 168
		int index = filename.lastIndexOf(".");
166 169
		if (index > 0) filename = filename.substring(0, index);
167

  
170
		
168 171
		String strRegElision = TokenizerClasses.regElision;
169 172
		if (strRegElision != null && strRegElision.length() > 0)
170 173
			regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string  ^
......
176 179
		
177 180
		if (fclitics != null && fclitics.length() > 0)
178 181
			regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $
179
			
182
		
180 183
		if (pclitics != null && pclitics.length() > 0)
181 184
			regPClitics = /^($pclitics)(.*)/ // the test must start with the start of string  ^
182
			
185
		
183 186
		if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0)
184 187
			regWhitespaces = ~TokenizerClasses.whitespaces;
185

  
188
		
186 189
		if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0)
187 190
			regSplitWhiteSpaces = Pattern.compile(TokenizerClasses.whitespaces);
188
			
191
		
189 192
		regLN = Pattern.compile("/\n/");
190 193
		regCTRL = Pattern.compile("/\\p{C}/");
191 194
	}
192

  
195
	
193 196
	/**
194 197
	 * Fill infos.
195 198
	 *
......
202 205
			prefix = parser.getPrefix();
203 206
		}
204 207
	}
205

  
208
	
206 209
	/**
207 210
	 * Donothing: just write what is read
208 211
	 * 
......
225 228
				//					writer.writeStartElement(namespace, localname);
226 229
				//				else
227 230
				writer.writeStartElement(localname);
228

  
231
			
229 232
			//			if(parser.getNamespaceCount() > 0)
230 233
			//				writer.writeDefaultNamespace(parser.getNamespaceURI(0))
231 234
			//			for(int i = 1 ; i < parser.getNamespaceCount() ; i++)
232 235
			//				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
233

  
236
			
234 237
			String namespace_prefix;
235 238
			for (int i = 0 ; i<   parser.getNamespaceCount() ; i++) {
236 239
				namespace_prefix = parser.getNamespacePrefix(i);
......
240 243
					writer.writeDefaultNamespace(parser.getNamespaceURI(i));
241 244
				}
242 245
			}
243

  
246
			
244 247
			String attrprefix, attname;
245 248
			boolean hasId = false;
246 249
			//boolean hasType = false
......
251 254
				if ("id".equals(attname)) hasId = true;
252 255
				//if ("type".equals(attname)) hasType = true;
253 256
				if ("n".equals(attname)) hasN = true;
254

  
257
				
255 258
				if (attrprefix != null && attrprefix.length() > 0)
256 259
					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
257 260
				else
258 261
					writer.writeAttribute(attname, parser.getAttributeValue(i))
259 262
			}
260

  
263
			
261 264
			if (wordid != null && !hasId && localname == word_element_to_create)
262 265
				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
263

  
266
			
264 267
			if (!hasN && localname == word_element_to_create)
265 268
				writer.writeAttribute("n", ""+wordcount);
266

  
267
			writer.writeCharacters("\n");
269
			
270
			if (!reg_word_tags.matcher(localname).matches()) {
271
				writer.writeCharacters("\n");
272
			}
268 273
		}
269 274
		else if(event == XMLStreamConstants.END_ELEMENT)
270 275
		{
......
275 280
		{
276 281
			//println parser.getText();
277 282
			//writer.writeCharacters("𦟛");
278

  
279

  
283
			
284
			
280 285
			// checks if the token starts with an high surrogate
281 286
			//			if(isHighSurrogate(parser.getText().charAt(0)))	{
282 287
			//				println "warning: invalid UTF-8 XML range, token " + parser.getText() + " has been replaced.";
......
292 297
			//				writer.writeCharacters("&#" + scalar + ";");
293 298
			//			}
294 299
			//			else
295

  
296

  
300
			
301
			
297 302
			if (insideword) { // ensure there is not \t or \n in the word form value
298 303
				writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
299 304
			} else {
......
301 306
			}
302 307
		}
303 308
	}
304

  
305

  
309
	
310
	
306 311
	/**
307 312
	 * Converts the specified surrogates pair to scalar.
308 313
	 * @param highSurrogate
......
312 317
	public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)	{
313 318
		return ((highSurrogate - 0xD800) * 0x400) + (lowSurrogate - 0xDC00) + 0x10000;
314 319
	}
315

  
320
	
316 321
	/**
317 322
	 * Checks if the specified character is an high/leading surrogate.
318 323
	 * @param character
......
321 326
	public boolean isHighSurrogate(char character)	{
322 327
		return (character >= 0xD800 && character <= 0xDBFF);
323 328
	}
324

  
325

  
326

  
329
	
330
	
331
	
327 332
	/** The wordcount. */
328 333
	int wordcount = 0;
329

  
334
	
330 335
	/** The ignorecontent. */
331 336
	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
332 337
	boolean insideword = false;
......
345 350
		output = new BufferedOutputStream(new FileOutputStream(outfile))
346 351
		writer = factory.createXMLStreamWriter(output, "UTF-8")
347 352
		writer.setNamespaceContext(new PersonalNamespaceContext());
348

  
353
		
349 354
		def inputData = infile.toURI().toURL().openStream();
350 355
		def inputfactory = XMLInputFactory.newInstance();
351 356
		//inputfactory.setProperty("http://apache.org/xml/properties/input-buffer-size", new Integer(2048));
352 357
		//inputfactory.setExpandEntityReferences(false);
353
		parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
358
		XMLInputFactory.newInstance();
359
		parser = inputfactory.createXMLStreamReader(inputData);
354 360
		//println "PARSER: "+parser.getClass()
355 361
		writer.writeStartDocument("UTF-8","1.0");
356 362
		writer.writeCharacters("\n");
357

  
363
		
358 364
		int previousEvent = 0;
359 365
		boolean startProcess = false;
360 366
		if (startTag == null) // if no startTag specified we process from the start
......
363 369
		
364 370
		buffer = new StringBuffer();
365 371
		//println "process - start start tag: "+startTag+" startProcess: $startProcess"
366
//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
367
//		println "reg_outside_text_tags=$reg_outside_text_tags"
368
//		println "reg_note_content=$reg_note_content"
372
		//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
373
		//		println "reg_outside_text_tags=$reg_outside_text_tags"
374
		//		println "reg_note_content=$reg_note_content"
369 375
		try {
370 376
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
371 377
				if (!startProcess) {
......
380 386
						continue;
381 387
					}
382 388
				}
383

  
389
				
384 390
				if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
385 391
					processWord(); // tokenize now!
386 392
					buffer.setLength(0);
......
391 397
					localname = parser.getLocalName()
392 398
					if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
393 399
						//println "Found pretagged word";
394
						wordcount++;
395
						donothing(event, wordcount);
396
						//ignorecontent = true;
397
						insideword = true;
400
						
401
						if (retokenize) {
402
							retokenizedWordProperties.clear()
403
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
404
								retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
405
							}
406
						} else {
407
							wordcount++;
408
							donothing(event, wordcount);
409
							//ignorecontent = true;
410
							insideword = true;
411
						}
398 412
					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
399 413
						// ignore the tag only
400 414
						donothing(event, null); // write the tag
......
414 428
					//println "Close: "+localname;
415 429
					localname = parser.getLocalName()
416 430
					if (reg_word_tags.matcher(localname).matches()) {
417
						//ignorecontent = false;
418
						insideword = false;
419
						writer.writeEndElement();
420
						writer.writeCharacters("\n");
431
						if (retokenize) {
432
							retokenizedWordProperties.clear()
433
						} else {
434
							//ignorecontent = false;
435
							insideword = false;
436
							writer.writeEndElement();
437
							writer.writeCharacters("\n");
438
						}
421 439
					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
422 440
						ignorecontent = false;
423 441
						donothing(event, null);
......
448 466
				}
449 467
				previousEvent = event;
450 468
			}
451

  
469
			
452 470
			parser.close()
453 471
			writer.close();
454 472
			output.close();
......
464 482
		}
465 483
		return true;
466 484
	}
467

  
485
	
486
	public void setRetokenize(boolean retokenize) {
487
		this.retokenize = retokenize
488
	}
489
	
468 490
	/**
469 491
	 * Set the element and content to ignore
470 492
	 * 
......
526 548
		if (regSplitWhiteSpaces != null) {
527 549
			for (String s : regSplitWhiteSpaces.split(text)) {		// separate with unicode white spaces
528 550
				//	if (DEBUG){println "process $s"}
529
					iterate(s);
530
				}
551
				iterate(s);
552
			}
531 553
		} else {
532 554
			iterate(text);
533 555
		}
534 556
	}
535

  
557
	
536 558
	/**
537 559
	 * Iterate. a String, should be called when a word is found in a String
538 560
	 *
......
541 563
	 */
542 564
	protected iterate(String s) {
543 565
		while (s != null && s.length() > 0) {
544
		//	if (DEBUG){println "  > $s"}
566
			//	if (DEBUG){println "  > $s"}
545 567
			s = standardChecks(s);
546 568
		}
547 569
	}
548

  
570
	
549 571
	/**
550 572
	 * Standard checks.
551 573
	 *
......
556 578
	public String standardChecks(String s) {
557 579
		Matcher m;
558 580
		if (fclitics != null && (m = s =~ regFClitics) ) {
559
		//	if (DEBUG) println "CLITIC found: $s ->"+ m
581
			//	if (DEBUG) println "CLITIC found: $s ->"+ m
560 582
			iterate(m.group(1))
561 583
			
562 584
			wordcount++;
......
583 605
			
584 606
			return "";
585 607
		} else if (regElision != null && (m = s =~ regElision) ) {
586
		//	if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
608
			//	if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
587 609
			//iterate(m.group(1))
588

  
589
//			int sep = s.indexOf("'");
590
//			if (sep < 0)
591
//				sep = s.indexOf("’");
592
//			if (sep < 0)
593
//				sep = s.indexOf("‘");
594

  
610
			
611
			//			int sep = s.indexOf("'");
612
			//			if (sep < 0)
613
			//				sep = s.indexOf("’");
614
			//			if (sep < 0)
615
			//				sep = s.indexOf("‘");
616
			
595 617
			wordcount++;
596 618
			writer.writeStartElement(word_element_to_create);
597 619
			writeWordAttributes();// id
......
599 621
			writer.writeCharacters(m.group(1));
600 622
			writer.writeEndElement();
601 623
			writer.writeCharacters("\n");
602

  
624
			
603 625
			iterate(m.group(2))
604 626
			
605 627
			return "";
606 628
		} else if (reg3pts != null && (m = s =~ reg3pts) )	{
607
		//	if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
629
			//	if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
608 630
			iterate(m.group(1))
609

  
631
			
610 632
			wordcount++;
611 633
			writer.writeStartElement(word_element_to_create);
612 634
			writeWordAttributes();// id
......
614 636
			writer.writeCharacters("...");
615 637
			writer.writeEndElement();
616 638
			writer.writeCharacters("\n");
617

  
639
			
618 640
			return m.group(3);
619 641
		} else if (regPunct != null && (m = s =~ regPunct) ) {
620 642
			if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
621 643
			iterate(m.group(1));
622

  
644
			
623 645
			wordcount++;
624 646
			writer.writeStartElement(word_element_to_create);
625 647
			writeWordAttributes();// id
......
627 649
			writer.writeCharacters(m.group(2));
628 650
			writer.writeEndElement();
629 651
			writer.writeCharacters("\n");
630

  
652
			
631 653
			return m.group(3);
632 654
		} else {
633
	//		if(DEBUG){println "Other found: "+s}
655
			//		if(DEBUG){println "Other found: "+s}
634 656
			wordcount++;
635 657
			writer.writeStartElement(word_element_to_create);
636 658
			writeWordAttributes();// id
637
//			if (reg_punct_other.matcher(s).matches())
638
//				writer.writeAttribute("type","pon");
639
//			else
640
//				writer.writeAttribute("type","w");
659
			//			if (reg_punct_other.matcher(s).matches())
660
			//				writer.writeAttribute("type","pon");
661
			//			else
662
			//				writer.writeAttribute("type","w");
641 663
			writer.writeCharacters(s);
642 664
			writer.writeEndElement();
643 665
			writer.writeCharacters("\n");
644

  
666
			
645 667
			return "";
646 668
		}
647 669
	}
648

  
670
	
649 671
	/**
650 672
	 * Write word attributes.
651 673
	 *
......
654 676
	protected writeWordAttributes() {
655 677
		writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
656 678
		writer.writeAttribute("n",""+wordcount);
679
		for (String attr : retokenizedWordProperties.keySet()) {
680
			if ("id" == attr) {
681
				writer.writeAttribute("previous-id", retokenizedWordProperties[attr]);
682
			} else if ("n" == attr) {
683
				writer.writeAttribute("previous-n", retokenizedWordProperties[attr]);
684
			} else {
685
				writer.writeAttribute(attr, retokenizedWordProperties[attr]);
686
			}
687
		}
657 688
	}
658

  
689
	
659 690
	public void setStartTag(String tag)
660 691
	{
661 692
		this.startTag = tag;
662 693
		this.reg_startTag = Pattern.compile(startTag);
663 694
	}
664

  
695
	
665 696
	/**
666 697
	 * Tokenize.
667 698
	 *
......
672 703
	{
673 704
		return str.tokenize()	// cut by whitespace
674 705
	}
675

  
706
	
676 707
	/**
677 708
	 * The main method.
678 709
	 *
......
680 711
	 */
681 712
	public static void main(String[] args)
682 713
	{
683
		File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
684
		Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
685
		File inputFile = new File(SVNDIR, "baye1.xml")
686
		File outputFile = new File(SVNDIR, "baye1-t.xml")
687
		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
688
		tokenizer.DEBUG = true
689
		tokenizer.setNote("note")
690
		//tokenizer.setOutSideTextTags("teiHeader")
691
		tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
692
		println outputFile.toString() + " : "+tokenizer.process();
714
		//		File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
715
		//		Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
716
		//		File inputFile = new File(SVNDIR, "baye1.xml")
717
		//		File outputFile = new File(SVNDIR, "baye1-t.xml")
718
		//		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
719
		//		tokenizer.DEBUG = true
720
		//		tokenizer.setNote("note")
721
		//		//tokenizer.setOutSideTextTags("teiHeader")
722
		//		tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
723
		//		println outputFile.toString() + " : "+tokenizer.process();
724
		
725
		
693 726
		// FIXME: tests UTF-8 XML ranges
694 727
		//		File inputDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")
695 728
		//	File inputFile = new File(inputDir, "problem2.xml");
696

  
729
		
697 730
		/*
698
		File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
699
		File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
700
		File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
701
		File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
702
		File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
703
		outputDir.deleteDir()
704
		outputDir2.deleteDir()
705
		outputDir3.deleteDir()
706
		outputDir.mkdir()
707
		outputDir2.mkdir()
708
		outputDir3.mkdir()
709
		long time;
710
		for (String lang : ["none"]) {
711
			println "LANG: $lang"
712

  
713
			time = System.currentTimeMillis();
714
			File outputFile = new File(outputDir, "${lang}.xml")
715
			SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
716
			tokenizer.process();
717
			println "FAST $lang: "+(System.currentTimeMillis()-time)
718

  
719
//			time = System.currentTimeMillis();
720
//			File outputFile3 = new File(outputDir3, "${lang}.xml")
721
//			FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
722
//			tokenizer3.process();
723
//			println "FAST2 $lang: "+(System.currentTimeMillis()-time)
724
//			
725
//			time = System.currentTimeMillis();
726
//			File outputFile2 = new File(outputDir2, "${lang}.xml")
727
//			OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
728
//			tokenizer2.process();
729
//			println "SLOW $lang: "+(System.currentTimeMillis()-time)
730
		}
731
*/
731
		 File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
732
		 File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
733
		 File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
734
		 File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
735
		 File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
736
		 outputDir.deleteDir()
737
		 outputDir2.deleteDir()
738
		 outputDir3.deleteDir()
739
		 outputDir.mkdir()
740
		 outputDir2.mkdir()
741
		 outputDir3.mkdir()
742
		 long time;
743
		 for (String lang : ["none"]) {
744
		 println "LANG: $lang"
745
		 time = System.currentTimeMillis();
746
		 File outputFile = new File(outputDir, "${lang}.xml")
747
		 SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
748
		 tokenizer.process();
749
		 println "FAST $lang: "+(System.currentTimeMillis()-time)
750
		 //			time = System.currentTimeMillis();
751
		 //			File outputFile3 = new File(outputDir3, "${lang}.xml")
752
		 //			FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
753
		 //			tokenizer3.process();
754
		 //			println "FAST2 $lang: "+(System.currentTimeMillis()-time)
755
		 //			
756
		 //			time = System.currentTimeMillis();
757
		 //			File outputFile2 = new File(outputDir2, "${lang}.xml")
758
		 //			OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
759
		 //			tokenizer2.process();
760
		 //			println "SLOW $lang: "+(System.currentTimeMillis()-time)
761
		 }
762
		 */
732 763
		println ""
733 764
		println "Done"
734

  
765
		
735 766
		//		String lang = "en"
736 767
		//		File inDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant")
737 768
		//		File outDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant-t")
......
750 781
		//		}
751 782
		//		println ""
752 783
		//		println "Done"
753

  
784
		
785
		String lang = "fr"
786
		File inFile = new File(System.getProperty("user.home"), "xml/vocapia/test.trs")
787
		File outFile = new File(System.getProperty("user.home"), "xml/vocapia/test-retokenized.trs")
788
		
789
		println "processing "+inFile
790
		
791
		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inFile, outFile, lang)
792
		tokenizer.setRetokenize(true)
793
		//tokenizer.setDEBUG false
794
		tokenizer.process();
795
		
796
		println "Done"
797
		
754 798
	}
755 799
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/13nov/FixTranscription.groovy (revision 3003)
1
package org.txm.macro.transcription
2

  
3
import javax.xml.stream.*
4

  
5
import org.txm.importer.PersonalNamespaceContext
6
import org.txm.xml.IdentityHook
7
import org.txm.xml.*
8

  
9
import java.io.BufferedOutputStream
10
import java.io.FileOutputStream
11
import java.io.IOException
12
import java.net.URL
13
import java.util.*
14
import java.util.Map.Entry
15
import java.util.regex.Pattern
16

  
17
class FixTranscription extends XMLProcessor {
18
	
19
	LocalNamesHookActivator activator;
20
	IdentityHook hook;
21
	boolean retokenizeWords
22
	
23
	public FixTranscription(File xmlfile, boolean retokenizeWords) {
24
		super(xmlfile)
25
		this.retokenizeWords = retokenizeWords
26
		
27
		activator = new LocalNamesHookActivator<>(hook, ["w", "Turn", "Sync"]);
28
		
29
		hook = new IdentityHook("word_hook", activator, this) {
30
					
31
					boolean inTurn = false;
32
					
33
					boolean inW = false;
34
					StringBuilder wordBuffer = new StringBuilder();
35
					
36
					String currentTime;
37
					LinkedHashMap turnInfos = new LinkedHashMap()
38
					LinkedHashMap wInfos = new LinkedHashMap()
39
					boolean other
40
					
41
					@Override
42
					public boolean deactivate() {
43
						return true;
44
					}
45
					
46
					@Override
47
					public boolean _activate() {
48
						return true;
49
					}
50
					
51
					@Override
52
					protected void processStartElement() throws XMLStreamException, IOException {
53
						if (localname.equals("Turn")) {
54
							// store values
55
							inTurn = true;
56
							turnInfos.clear()
57
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
58
								turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
59
							}
60
							currentTime = turnInfos["startTime"]
61
							super.processStartElement();
62
						} else if (localname.equals("Sync")) {
63
							currentTime = parser.getAttributeValue(null, "time")
64
							super.processStartElement();
65
						} else if (localname.equals("w")) {
66
							// store values
67
							inW = true;
68
							wInfos.clear()
69
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
70
								wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
71
							}
72
							String time = parser.getAttributeValue(null, "time")
73
							if (time != null && time.length() > 0) {
74
								currentTime = time
75
							}
76
							wordBuffer.setLength(0);
77
							return; // write w later
78
						}
79
						else {
80
							super.processStartElement();
81
						}
82
					}
83
					
84
					@Override
85
					protected void processCharacters() throws XMLStreamException {
86
						if (inW) {
87
							wordBuffer.append(parser.getText())
88
						}
89
						else {
90
							super.processCharacters();
91
						}
92
					}
93
					
94
					@Override
95
					protected void processEndElement() throws XMLStreamException {
96
						if (localname.equals("w")) {
97
							
98
							inW = false
99
							String word = wordBuffer.toString().trim()
100
							if (!other && word.startsWith("*")) {
101
								//close current Turn and start a 'other' Turn
102
								writer.writeEndElement() // current Turn
103
								writer.writeCharacters("\n")
104
								
105
								def tmpInfos = new LinkedHashMap()
106
								for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr]
107
								tmpInfos["orig-speaker"] = turnInfos["speaker"]
108
								tmpInfos["speaker"] = "other"
109
								tmpInfos["startTime"] = currentTime
110
								writer.writeStartElement("Turn")
111
								for (String attr : tmpInfos.keySet()) {
112
									writer.writeAttribute(attr, tmpInfos[attr])
113
								}
114
								
115
								other = true
116
								word = word.substring(1)
117
							}
118
							
119
							boolean shouldCloseOtherTurn = false;
120
							if (other && word.endsWith("*")) {
121
								shouldCloseOtherTurn = true;
122
								
123
								word = word.substring(0, word.length()-1)
124
								other = false
125
							}
126
							
127
							// split before re-writing the word
128
							def puncts = []
129
							if (retokenizeWords) {
130
//								while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
131
//									puncts << word.substring(0, 1)
132
//									word = word.substring(1, word.length())
133
//								}
134
								//						//vocapia fix
135
								//						Pattern reg = Pattern.compile("([^']+')(.+)")
136
								//						def m = reg.matcher(word)
137
								//						while (word.length() > 0 && m.matches()) {
138
								//							puncts << m.group(1)
139
								//							word = m.group(2)
140
								//							m = reg.matcher(word)
141
								//						}
142
							}
143
							
144
							for (def punct : puncts) { // pre-retokenize if any
145
								writer.writeStartElement("w")
146
								for (String attr : wInfos.keySet()) {
147
									writer.writeAttribute(attr, wInfos[attr])
148
								}
149
								writer.writeCharacters(punct)
150
								writer.writeEndElement() // w
151
								writer.writeCharacters("\n")
152
							}
153
							
154
							puncts = []
155
							if (retokenizeWords) {
156
//								while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
157
//									puncts << word.substring(word.length()-1, word.length())
158
//									word = word.substring(0, word.length()-1)
159
//								}
160
							}
161

  
162
							if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
163
								writer.writeStartElement("event") // start the initial word
164
								writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
165
								writer.writeAttribute("type", "unknown")
166
								writer.writeAttribute("extent", "instantaneous")
167
								writer.writeEndElement() // event
168
								word = "" // don't write the word
169
							}
170
														
171
							if (word.length() > 0) {
172
								
173
								writer.writeStartElement("w") // start the initial word
174
								for (String attr : wInfos.keySet() ) {
175
									writer.writeAttribute(attr, wInfos[attr])
176
								}
177
								writer.writeCharacters(word)
178
								writer.writeEndElement() // w
179
							}
180
							
181
							for (String punct : puncts) {  // post-retokenize if any
182
								writer.writeStartElement("w")
183
								for (String attr : wInfos.keySet() ) {
184
									writer.writeAttribute(attr, wInfos[attr])
185
								}
186
								writer.writeCharacters(punct)
187
								writer.writeEndElement() // w
188
							}
189
							
190
							if (shouldCloseOtherTurn) {
191
								shouldCloseOtherTurn = false;
192
								//close the current 'other' Turn and restart the actual Turn
193
								writer.writeEndElement() // current 'other' Turn
194
								
195
								writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
196
								turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
197
								for (String attr : turnInfos.keySet()) {
198
									writer.writeAttribute(attr, turnInfos[attr])
199
								}
200
								
201
								other = false
202
							}
203
						} else {
204
							super.processEndElement();
205
						}
206
					}
207
				}
208
	}
209
	
210
	public static void main(String[] args) {
		// Manual test entry point: repair a sample transcription file
		// and print whether processing succeeded.
		File inputFile = new File("/home/mdecorde/xml/vocapia", "test.trs")
		File fixedFile = new File("/home/mdecorde/xml/vocapia", "test-fixed.trs")
		FixTranscription fixer = new FixTranscription(inputFile, true)
		println(fixer.process(fixedFile))
	}
216
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/13nov/FixTranscriptionsMacro.groovy (revision 3003)
1
package org.txm.macro.transcription
// NOTE(review): the file lives under org/txm/macro/projects/13nov/ but declares
// package org.txm.macro.transcription — confirm which location is intended.

import java.time.LocalTime
import java.time.format.DateTimeFormatter

import org.txm.utils.*
import org.txm.utils.logger.*

// Macro: repairs Vocapia/Transcriber XML files with FixTranscription and writes
// the fixed copies into resultDirectory. Input is either a single file
// (trsFile) or a whole directory of .trs files (trsDirectory); the directory,
// when it exists, takes precedence.

@Field @Option(name="trsFile", usage="A single vocapia XML file", widget="FileOpen", required=false, def="")
		File trsFile;

@Field @Option(name="trsDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="")
		File trsDirectory;

@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
		File resultDirectory;

if (!ParametersDialog.open(this)) return;

// FIX: all three parameters are optional (required=false), so any of them may
// be null. The original called trsDirectory.equals(...), trsFile.getParentFile()
// and resultDirectory.mkdirs() unconditionally, throwing NullPointerException.
// Groovy's '==' is null-safe equals and '?.' short-circuits on null.
// Also report WHY we abort instead of silently returning false.
if (resultDirectory == null) {
	println "No result directory given"
	return false
}
if (trsDirectory == resultDirectory || trsFile?.getParentFile() == resultDirectory) {
	println "The result directory must be different from the input directory: $resultDirectory"
	return false;
}

resultDirectory.mkdirs();

// Collect the .trs files to process (case-insensitive extension match).
def trsFiles = []
if (trsDirectory != null && trsDirectory.exists()) {
	
	println "Processing TRS directory: $trsDirectory"
	for (File file : trsDirectory.listFiles()) {
		if (file.getName().toLowerCase().endsWith(".trs")) {
			trsFiles << file
		}
	}
} else if (trsFile != null && trsFile.exists()) {
	println "Processing TRS file: $trsFile"
	trsFiles << trsFile
}

if (trsFiles.size() == 0) {
	println "No XML file found for parameters trsFile=$trsFile and trsDirectory=$trsDirectory"
	return false
}

// Process each file, stopping at the first failure.
// FIX: the loop variable used to be named 'trsFile', shadowing the @Field
// parameter of the same name; renamed to 'file'.
ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
for (File file : trsFiles) {
	cpb.tick()
	FixTranscription fixer = new FixTranscription(file)
	String name = FileUtils.stripExtension(file)
	File outFile = new File(resultDirectory, name+".trs")
	
	if (!fixer.process(outFile)) {
		println "WARNING: ERROR WHILE PROCESSING: "+file
		return false
	}
}
cpb.done()

println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3003)
36 36
		XMLStreamReader parser = factory.createXMLStreamReader(inputData);
37 37
		
38 38
		boolean flagWord = false
39
		def winfos = [:]
40
		def turninfos = [:]
39
		def winfos = new LinkedHashMap()
40
		def turninfos = new LinkedHashMap()
41 41
		boolean other = false;
42 42
		String word = ""
43 43
		try {
......
89 89
								writer.writeStartElement("Episode")
90 90
							//<Section type="report" startTime="0" endTime="3617.593">
91 91
								writer.writeStartElement("Section")
92
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
93
									writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
94
								}
92 95
								break;
93 96
							
94 97
							case "SpeechSegment": // <SpeechSegment ch="1" sconf="1.00" stime="9.94" etime="43.81" spkid="Enquêtrice" lang="fre" lconf="1.00" trs="1">
95 98
								writer.writeStartElement("Turn")
96
								
97
								turninfos = ["speaker":parser.getAttributeValue(null, "spkid"),
98
									"startTime":parser.getAttributeValue(null, "stime"),
99
									"endTime":parser.getAttributeValue(null, "etime"),
100
								]
101
								for (String attr : turninfos.keySet()) {
102
									writer.writeAttribute(attr, turninfos[attr])
99
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
100
									String name = parser.getAttributeLocalName(i)
101
									if (name == "stime") name = "startTime"
102
									else if (name == "etime") name = "endTime"
103
									else if (name == "spkid") name = "speaker"
104
									
105
									writer.writeAttribute(name, parser.getAttributeValue(i))
103 106
								}
104
								
107
							
105 108
								writer.writeCharacters("\n")
106 109
								writer.writeStartElement("Sync")
107 110
								writer.writeAttribute("time", parser.getAttributeValue(null, "stime"))
......
114 117
							
115 118
							// store w infos in case the word must be splited
116 119
								def endValue = String.format(Locale.US, "%.2f", (Double.parseDouble(parser.getAttributeValue(null, "stime")) + Double.parseDouble(parser.getAttributeValue(null, "dur"))))
117
								winfos = ["time":parser.getAttributeValue(null, "stime"), "start": parser.getAttributeValue(null, "stime"), "end":endValue]
120
								def startValue = parser.getAttributeValue(null, "stime");
121
								winfos.clear()
122
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
123
									String name = parser.getAttributeLocalName(i)
124
									if (name == "stime") continue
125
									
126
										winfos[name] = parser.getAttributeValue(i)
127
								}
128
								winfos["time"] = startValue
129
								winfos["start"] = startValue
130
								winfos["end"] = endValue
131
							
132
							// write <w> at the end of <Word> tag
118 133
								break
119 134
						}
120 135
						break;
......
154 169
							case "Word":
155 170
								flagWord = false
156 171
								word = word.trim()
157
								if (word.startsWith("*")) {
158
									//close current Turn and start a 'other' Turn
159
									writer.writeEndElement() // current Turn
160
									writer.writeStartElement("Turn")
161
									writer.writeAttribute("speaker", "other")
162
									writer.writeAttribute("startTime", winfos["time"])
163
									writer.writeAttribute("orig-speaker", turninfos["speaker"])
164
									writer.writeCharacters("\n")
165
									other = true
166
									word = word.substring(1)
167
								}
168 172
							
169
								String otherAttributeValue = Boolean.toString(other) // set now
170
							
171
								if (other && word.endsWith("*")) {
172
									
173
									//close the current 'other' Turn and restart the actual Turn
174
									writer.writeEndElement() // current 'other' Turn
175
									
176
									writer.writeStartElement("Turn") // rebuild the orig Turn with its infos
177
									turninfos["startTime"] = winfos["end"] // fix the startTime using the current word end time
178
									for (String attr : turninfos.keySet()) {
179
										writer.writeAttribute(attr, turninfos[attr])
180
									}
181
									writer.writeCharacters("\n")
182
									
183
									word = word.substring(0, word.length()-1)
184
									other = false
185
								}
186
							
187 173
							// split before the word
188 174
								def puncts = []
189 175
								if (retokenizeWords) {
176
									
190 177
									while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
191
										puncts << word.substring(0, 1)
192
										word = word.substring(1, word.length())
178
										puncts << word.substring(0 ,1)
179
										word = word.substring(1)
193 180
									}
194 181
									
182
									// fix "d'abord" like words
195 183
									Pattern reg = Pattern.compile("([^']+')(.+)")
196 184
									def m = reg.matcher(word)
197 185
									while (word.length() > 0 && m.matches()) {
......
200 188
										m = reg.matcher(word)
201 189
									}
202 190
								}
203
								
191
							
204 192
								for (def punct : puncts) { // pre-retokenize if any
205 193
									writer.writeStartElement("w")
206 194
									for (String attr : winfos.keySet()) {
......
214 202
								puncts = []
215 203
								if (retokenizeWords) {
216 204
									while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
217
										puncts << word.substring(word.length()-1, word.length())
205
										puncts.add(0, word.substring(word.length()-1, word.length()))
218 206
										word = word.substring(0, word.length()-1)
219 207
									}
220 208
								}
221 209
							
222
								writer.writeStartElement("w") // start the initial word
223
								writer.writeAttribute("time", winfos["time"])
224
								writer.writeAttribute("start", winfos["start"])
225
								writer.writeAttribute("end", winfos["end"])
226
								writer.writeAttribute("other", otherAttributeValue)
210
								if (word.length() > 0) {
211
									writer.writeStartElement("w") // start the initial word
212
									for (String attr : winfos.keySet()) {
213
										writer.writeAttribute(attr, winfos[attr])
214
									}
215
									writer.writeCharacters(word)
216
									writer.writeEndElement() // w
217
									writer.writeCharacters("\n")
218
								}
227 219
							
228
								writer.writeCharacters(word)
229
								writer.writeEndElement() // w
230
								writer.writeCharacters("\n")
231
							
232
								for (String punct : puncts) {  // post-retokenize if any
220
								for (String punct : puncts) {  // write post-retokenize if any
233 221
									writer.writeStartElement("w")
234
									writer.writeAttribute("time", winfos["time"])
235
									writer.writeAttribute("start", winfos["end"])
236
									writer.writeAttribute("end", winfos["end"])
222
									for (String attr : winfos.keySet()) {
223
										writer.writeAttribute(attr, winfos[attr])
224
									}
237 225
									writer.writeCharacters(punct)
238 226
									writer.writeEndElement() // w
239 227
									writer.writeCharacters("\n")
......
269 257
		File infile = new File("/home/mdecorde/xml/vocapia","test.xml")
270 258
		File outfile = new File("/home/mdecorde/xml/vocapia","test.trs")
271 259
		def processor = new Vocapia2Transcriber(infile)
272
		println processor.process(outfile)
260
		println processor.process(outfile, true)
273 261
	}
274 262
}

Formats disponibles : Unified diff