Revision 3344

TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 3344)
@@ -188,6 +188,7 @@
 			try {
 				TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
 				tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
+				tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
 				if (!tokenizer.process()) {
 					println("Failed to tokenize "+pfile)
 				}
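
Note: the new "doBuildWordIds" flag is read with the same string comparison as "doRetokenizeStep", but its default is "true", so existing import projects keep generating word identifiers unless the parameter is explicitly set to "false". A minimal Groovy sketch of the pattern shared by this importer and the XTZ and XML importers below (`project` and `tokenizer` stand in for the importer's own objects):

	// getTokenizerParameter(key, default) returns a String, so the value is decoded by comparing it to "true"
	boolean doRetokenize   = "true" == project.getTokenizerParameter("doRetokenizeStep", "false") // default: off
	boolean doBuildWordIds = "true" == project.getTokenizerParameter("doBuildWordIds", "true")    // default: on
	tokenizer.setRetokenize(doRetokenize)
	tokenizer.setDoBuildWordIDs(doBuildWordIds)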
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3344)
@@ -340,7 +340,6 @@
 		new File(module.getBinaryDirectory(),"tokenized").deleteDir()
 		new File(module.getBinaryDirectory(),"tokenized").mkdir()
 		
-		boolean retokenize = "true" == module.getProject().getTokenizerParameter("doRetokenizeStep", "false")
 		String outSideTextTagsRegex = "";
 		String outSideTextTagsAndKeepContentRegex = "";
 		String noteRegex = "";
@@ -398,7 +397,8 @@
 						tokenizer.setStringTokenizer(stringTokenizer)
 					}
 				}
-				tokenizer.setRetokenize(retokenize)
+				tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
+				tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
 				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
 					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
 				}
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xml/importer.groovy (revision 3344)
@@ -164,8 +164,8 @@
 				File infile = f
 				File outfile = new File(binDir, "tokenized/"+f.getName())
 				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
-				boolean retokenize = "true" == project.getTokenizerParameter("doRetokenizeStep", "false")
-				tokenizer.setRetokenize(retokenize)
+				tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
+				tokenizer.setDoBuildWordIDs("true" == project.getTokenizerParameter("doBuildWordIds", "true"))
 				if (ignoredElements != null && ignoredElements.trim().length() > 0) {
 					tokenizer.setOutSideTextTagsAndKeepContent(ignoredElements)
 				}
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3344)
@@ -61,6 +61,8 @@
 	
 	StringTokenizer stringTokenizer;
 	boolean retokenize = false
+	boolean doBuildWordIDs = true
+	ArrayList noIdWrittenErrors = new ArrayList()
 	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
 	
 	/** The word_tags. */
@@ -183,7 +185,7 @@
 		if (event == XMLStreamConstants.START_ELEMENT ) {
 			
 			localname = parser.getLocalName();
-			if (wordid != null) {
+			if (wordid != null) { // normalizing word elements
 				localname = word_element_to_create;
 			}
 			
@@ -210,30 +212,22 @@
 				}
 			}
 			
-			String attrprefix, attname;
-			boolean hasId = false;
-			//boolean hasType = false
-			boolean hasN = false
-			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
-				attname = parser.getAttributeLocalName(i);
-				attrprefix = parser.getAttributePrefix(i);
-				if ("id".equals(attname)) hasId = true;
-				//if ("type".equals(attname)) hasType = true;
-				if ("n".equals(attname)) hasN = true;
-				
-				if (attrprefix != null && attrprefix.length() > 0) {
-					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
-				} else {
-					writer.writeAttribute(attname, parser.getAttributeValue(i))
+			if (word_element_to_create.equals(localname)) { // the localname has been normalized to word_element_to_create
+				writeWordAttributes(true)
+			} else {
+				String attrprefix, attname;
+				for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+					attname = parser.getAttributeLocalName(i);
+					attrprefix = parser.getAttributePrefix(i);
+					
+					if (attrprefix != null && attrprefix.length() > 0) {
+						writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
+					} else {
+						writer.writeAttribute(attname, parser.getAttributeValue(i))
+					}
 				}
 			}
 			
-			if (wordid != null && !hasId && localname == word_element_to_create) {
-				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
-			}
-			if (!hasN && localname == word_element_to_create) {
-				writer.writeAttribute("n", ""+wordcount);
-			}
 			if (!reg_word_tags.matcher(localname).matches()) {
 				writer.writeCharacters("\n");
 			}
@@ -433,6 +427,10 @@
 				previousEvent = event;
 			}
 			
+			if (noIdWrittenErrors.size() > 0) {
+				println "Missing word IDs in source XML: "+noIdWrittenErrors.join(", ")
+			}
+			
 			parser.close()
 			writer.close();
 			output.close();
@@ -453,6 +451,10 @@
 		this.retokenize = retokenize
 	}
 	
+	public void setDoBuildWordIDs(boolean doBuildWordIDs) {
+		this.doBuildWordIDs = doBuildWordIDs
+	}
+	
 	/**
 	 * Set the element and content to ignore
 	 * 
@@ -517,7 +519,7 @@
 			for (def word : words) {
 				wordcount++;
 				writer.writeStartElement(word_element_to_create);
-				writeWordAttributes();// id
+				writeWordAttributes(false);// id
 				writer.writeCharacters(word);
 				writer.writeEndElement();
 				writer.writeCharacters("\n");
@@ -533,18 +535,52 @@
 	 *
 	 * @return the java.lang. object
 	 */
-	protected writeWordAttributes() {
-		writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
-		writer.writeAttribute("n",""+wordcount);
+	protected writeWordAttributes(boolean fromParser) {
+		boolean idWritten = doBuildWordIDs;
+		if (doBuildWordIDs) {
+			writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
+			writer.writeAttribute("n",""+wordcount);
+		}
+		
+		// rewrite the properties of the retokenized word
 		for (String attr : retokenizedWordProperties.keySet()) {
-			if ("id" == attr) {
-				writer.writeAttribute("old-id", retokenizedWordProperties[attr]);
-			} else if ("n" == attr) {
-				writer.writeAttribute("old-n", retokenizedWordProperties[attr]);
+			if (doBuildWordIDs && "id" == attr) {
+				writer.writeAttribute("foreign-id", retokenizedWordProperties[attr]);
+			} else if (doBuildWordIDs && "n" == attr) {
+				writer.writeAttribute("foreign-n", retokenizedWordProperties[attr]);
 			} else {
 				writer.writeAttribute(attr, retokenizedWordProperties[attr]);
+				if ("id" == attr) idWritten = true;
 			}
 		}
+		if (fromParser) {
+			String attrprefix, attname;
+			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+				attname = parser.getAttributeLocalName(i);
+				attrprefix = parser.getAttributePrefix(i);
+				
+				if (doBuildWordIDs && "id" == attname) {
+					writer.writeAttribute("foreign-id", parser.getAttributeValue(i));
+				} else if (doBuildWordIDs && "n" == attname) {
+					writer.writeAttribute("foreign-n", parser.getAttributeValue(i));
+				} else {
+					if (attrprefix != null && attrprefix.length() > 0) {
+						writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
+					} else {
+						writer.writeAttribute(attname, parser.getAttributeValue(i))
+					}
+					if ("id" == attname) idWritten = true;
+				}
+			}
+		}
+		if (!idWritten && noIdWrittenErrors.size() < 20) {
+			noIdWrittenErrors << parser.getLocation().toString();
+			if (noIdWrittenErrors.size() == 20) {
+				noIdWrittenErrors << "...";
+			}
+		}
+		
+		
 	}
 	
 	public void setStartTag(String tag)
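
Note: writeWordAttributes() now takes a fromParser argument (true when copying the attributes of an existing word element, false when writing words produced by the string tokenizer). When doBuildWordIDs is enabled, the tokenizer writes its own id ("w_" + filename + "_" + wordcount) and n attributes and renames any id/n found on the source word, or stored in retokenizedWordProperties, to foreign-id/foreign-n; when it is disabled, the source attributes are copied through unchanged and words for which no id was written are collected in noIdWrittenErrors (capped at 20 entries) and reported once parsing ends. A minimal usage sketch, assuming the usual process() entry point of the Tokeniser filters and illustrative file names:

	// Sketch only: run the XML tokenizer while keeping the word ids of the source XML
	def classes = TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang)
	def tokenizer = new SimpleTokenizerXml(new File("in.xml"), new File("out.xml"), classes)
	tokenizer.setRetokenize(false)
	tokenizer.setDoBuildWordIDs(false) // keep source "id"/"n"; words without an id are reported after processing
	if (!tokenizer.process()) println("Failed to tokenize in.xml") // process() assumed, as in the other importers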
TXM/trunk/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/TokenizerSection.java (revision 3344)
@@ -34,6 +34,8 @@
 	
 	Button doRetokenizeStepButton;
 	
+	Button doBuildWordIDsButton;
+	
 	Text textElisions;
 	
 	Text textPuncts;
@@ -111,6 +113,13 @@
 		gdata.colspan = 3;
 		doRetokenizeStepButton.setLayoutData(gdata);
 		
+		doBuildWordIDsButton = toolkit.createButton(sectionClient, "Build word identifiers", SWT.CHECK);
+		doBuildWordIDsButton.setToolTipText(TXMUIMessages.performWordSegmentationWithinWord);
+		doBuildWordIDsButton.setSelection(true);
+		doBuildWordIDsButton.setVisible(moduleParams.get(ImportModuleCustomization.ADVANCEDTOKENIZER));
+		gdata = new TableWrapData(TableWrapData.LEFT, TableWrapData.FILL);
+		gdata.colspan = 3;
+		doBuildWordIDsButton.setLayoutData(gdata);
 		
 		Label l0 = toolkit.createLabel(sectionClient, TXMUIMessages.separatorCharacters, SWT.WRAP);
 		gdata = new TableWrapData(TableWrapData.LEFT, TableWrapData.FILL);
@@ -303,6 +312,7 @@
 	protected void updateFieldsEnableState() {
 		boolean enabled = doTokenizeStepButton.getSelection();
 		doRetokenizeStepButton.setEnabled(enabled);
+		doBuildWordIDsButton.setEnabled(enabled);
 		textWhiteSpaces.setEnabled(enabled);
 		textPuncts.setEnabled(enabled);
 		textElisions.setEnabled(enabled);
@@ -329,6 +339,8 @@
 		doTokenizeStepButton.setSelection(doTokenizeStep);
 		boolean doRetokenizeStep = "true".equals(project.getTokenizerParameter("doRetokenizeStep", "false")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
 		doRetokenizeStepButton.setSelection(doRetokenizeStep);
+		boolean doBuildWordIds = "true".equals(project.getTokenizerParameter("doBuildWordIds", "true")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+		doBuildWordIDsButton.setSelection(doBuildWordIds);
 		updateFieldsEnableState();
 		
 		// HashMap<String, TTest> additionalTests = project.getTokenizerAdditionalTests();
@@ -351,6 +363,7 @@
 		project.addTokenizerParameter("word_tags", wordElementText.getText()); //$NON-NLS-1$
 		project.addTokenizerParameter("doTokenizeStep", "" + doTokenizeStepButton.getSelection()); //$NON-NLS-1$ //$NON-NLS-2$
 		project.addTokenizerParameter("doRetokenizeStep", "" + doRetokenizeStepButton.getSelection()); //$NON-NLS-1$ //$NON-NLS-2$
+		project.addTokenizerParameter("doBuildWordIds", "" + doBuildWordIDsButton.getSelection()); //$NON-NLS-1$ //$NON-NLS-2$
 		project.addTokenizerParameter("onlyThoseTests", "false"); //$NON-NLS-1$ //$NON-NLS-2$
 		// project.setTokenizerAdditionalTests(additionalTokenizerTests); //$NON-NLS-1$
 		

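Note: in the import form, the new "Build word identifiers" checkbox is checked by default, shown only when the module enables the advanced tokenizer options (ImportModuleCustomization.ADVANCEDTOKENIZER), and enabled only while the tokenize step itself is selected. Its state is persisted as the string "true"/"false" under the "doBuildWordIds" key, which is exactly what the importers above read back. A short sketch of that round trip (names as in the diff):

	// save: widget state -> project parameter (stored as a String)
	project.addTokenizerParameter("doBuildWordIds", "" + doBuildWordIDsButton.getSelection());
	// load: project parameter -> widget state, defaulting to "true"
	doBuildWordIDsButton.setSelection("true".equals(project.getTokenizerParameter("doBuildWordIds", "true")));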