Révision 2938

tmp/org.txm.core/src/java/org/txm/xml/xmltxm/XMLTXMWordPropertiesInjection.java (revision 2938)
32 32
import java.util.ArrayList;
33 33
import java.util.Arrays;
34 34
import java.util.HashMap;
35
import java.util.HashSet;
35 36
import java.util.List;
36 37
import java.util.Map.Entry;
37 38

  
......
53 54
public class XMLTXMWordPropertiesInjection extends XMLProcessor {
54 55
	
55 56
	HashMap<String, HashMap<String, String>> rules;
57
	HashSet<String> nonActivatedRules = new HashSet<String>();
56 58
	
57 59
	XPathHookActivator activator;
58 60
	
......
72 74
	public void addProperty(String id, HashMap<String, String> properties) throws IOException, XMLStreamException {
73 75
		if (rules != null) {
74 76
			rules.put(id, properties);
77
			nonActivatedRules.add(id);
75 78
		}
76 79
	}
77 80
	
81
	public HashSet<String> getNonActivatedRules() {
82
		return nonActivatedRules;
83
	}
84
	
78 85
	/**
79 86
	 * @param rules the keys are the identifiers (id) of the words to process; the values are hashmaps of the ana@type+ana@value pairs to inject. Warning: each ana@type must be prefixed with "#"
80 87
	 * 
......
82 89
	public void setProperties(HashMap<String, HashMap<String, String>> rules) throws IOException, XMLStreamException {
83 90
		
84 91
		this.rules = rules;
92
		nonActivatedRules.addAll(rules.keySet());
85 93
		
86 94
		activator = new XPathHookActivator<>(hook, "//w");
87 95
		
......
115 123
				id = parser.getAttributeValue(null, "id");
116 124
				
117 125
				if (id != null && rules.containsKey(id)) {
126
					nonActivatedRules.remove(id); // the rule has been activated once
118 127
					anaValues.clear(); // empty ana values
119 128
					formValues.clear(); // empty form values
120 129
					nInsertions++;
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ExportCorpusAsCONNLU.java (revision 2938)
70 70
	@Option(name = "connluResultDirectory", usage = "connluResultDirectory", widget = "Folder", required = true, def = "connlu-result-directory")
71 71
	File connluResultDirectory;
72 72
	
73
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud")
73
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
74 74
	String propertiesPrefix;
75 75
	
76 76
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "options")
......
89 89
	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
90 90
	Boolean insertParagraphs = false;
91 91
	
92
	// @Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
92
	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
93 93
	Boolean detectGap = false;
94 94
	
95 95
	@Option(name = "formCorrPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
......
104 104
	@Option(name = "xposCorrPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
105 105
	String xposCorrPropertyName;
106 106
	
107
	@Option(name = "punctStrong", usage = "optional strong punct tag to fix sentence limits", widget = "String", required = true, def = "PONfrt")
108
	String punctStrong;
109
	
107 110
	@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
108 111
	Boolean separator2 = false;
109 112
	
110
	// @Option(name = "punctStrong", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[?!\\.]")
111
	// String punctStrong;
112
	
113 113
	@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]")
114 114
	String openingPunct;
115 115
	
......
145 145
		
146 146
		try {
147 147
			return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct,
148
					formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName,
148
					formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName, punctStrong,
149 149
					detectGap, insertParagraphs, insertNoSpaceAfter);
150 150
		}
151 151
		catch (Exception e) {
......
172 172
	 * @throws InvalidCqpIdException
173 173
	 */
174 174
	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, String formCorrPropertyName, String lemmaCorrPropertyName,
175
			String uposCorrPropertyName, String xposCorrPropertyName, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) throws UnexpectedAnswerException,
175
			String uposCorrPropertyName, String xposCorrPropertyName, String punctStrongRegex, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) throws UnexpectedAnswerException,
176 176
			IOException,
177 177
			CqiServerError,
178 178
			CqiClientException, InvalidCqpIdException {
......
308 308
			// fixing sentences
309 309
			for (int s = 0; s < sentences.size(); s++) {
310 310
				
311
				// only fix the limits of UD sentences
311 312
				ArrayList<Integer> sentence = sentences.get(s);
313
				
314
				if (sentidStartPositions.get(sentence.get(0)) == null) {
315
					continue; // this is not a UD sentence
316
				}
317
				
312 318
				int max = -1;
313 319
				int imax = 0;
314 320
				for (int ip = 0; ip < sentence.size(); ip++) {
......
335 341
				}
336 342
			}
337 343
			
338
			// fixing sentences
339
			for (int s = 0; s < sentences.size(); s++) {
340
				
341
				ArrayList<Integer> sentence = sentences.get(s);
342
				if (s > 0 && formValues[sentence.get(0)].matches(openingPunct)) {
343
					System.out.println("FIXING: first position " + formValues[sentence.get(0)] + "in " + s);
344
					int p = sentence.remove(0);
345
					sentences.get(s - 1).add(p);
346
				}
347
				
348
				if (s + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
349
					System.out.println("FIXING: last position " + formValues[sentence.get(sentence.size() - 1)] + "in " + s);
350
					int p = sentence.remove(sentence.size() - 1);
351
					sentences.get(s + 1).add(0, p);
352
				}
353
				
354
				// int c = 0;
355
				// ArrayList<Integer> sentence = sentences.get(s);
356
				// for (int ip = 0 ; ip < sentence.size() ; ip++) {
357
				//
358
				// int p = sentence.get(ip);
359
				//
360
				// if (idValues[p].equals("__UNDEF__")) {
361
				// c++;
362
				// }
363
				// }
364
				// if (c == 0) { // al is fine
365
				//
366
				// } else if (c )
367
			}
368
			
369 344
			if (tmpSentence.size() > 0) { // add last sentence
370 345
				sentences.add(new ArrayList<>(tmpSentence));
371 346
			}
......
441 416
							}
442 417
						}
443 418
					}
419
					
420
				
444 421
				}
445 422
				
423
				
424
				// fix the sentence limits around punctuation
425
					
426
					
427
//					while (sentence.size() > 0 && iSentence > 0 && xpos[0].matches(punctStrongRegex)) {
428
//						System.out.println("FIXING: first punctStrong position " + xposValues[sentence.get(0)] + " in " + iSentence);
429
//						int p2 = sentence.remove(0);
430
//						sentences.get(iSentence - 1).add(p2);
431
//					}
432
					
433
					while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
434
						System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
435
						int p2 = sentence.remove(0);
436
						sentences.get(iSentence - 1).add(p2);
437
					}
438
//
439
					while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
440
						System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
441
						int p2 = sentence.remove(sentence.size() - 1);
442
						sentences.get(iSentence + 1).add(0, p2);
443
					}
444
		
445
				if (sentence.size() == 0) { // sentence was depleted after fixing it
446
					sentences.remove(iSentence);
447
					iSentence--;
448
					continue;
449
				}
450
				
446 451
				// fixing head and set missing head to 0 and root
447 452
				for (int ip = 0; ip < sentence.size(); ip++) {
448 453
					int p = sentence.get(ip);
......
514 519
					iParagraph++;
515 520
				}
516 521
				
517
				for (int p : sentence) {
518
					
519
					if (gap != null && gap[p] != null) writer.println("# gap");
522
				for (int ip = 0 ; ip < sentence.size() ; ip++) {
523
					int p = sentence.get(ip);
524
				
520 525
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
521 526
					writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
522 527
							+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
523 528
							+ "\t" + depsValues[p] + "\t" + miscValues[p]);
529
					
530
					if (gap != null && gap[ip].equals("next")) writer.println("# gap");
524 531
				}
525 532
				writer.println("");
526 533
				numberOfSentencesWritten++;
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotations.java (revision 2938)
35 35

  
36 36
import javax.xml.stream.XMLStreamException;
37 37

  
38
import org.apache.commons.lang.StringUtils;
38 39
import org.eclipse.core.commands.AbstractHandler;
39 40
import org.eclipse.core.commands.ExecutionEvent;
40 41
import org.eclipse.core.commands.ExecutionException;
......
70 71
	@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory")
71 72
	File connluDirectory;
72 73
	
73
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud")
74
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
74 75
	String propertiesPrefix;
75 76
	
76 77
	/*
......
213 214
				}
214 215
				if (sent_id != null) {
215 216
					properties.put("#ud-sentid", sent_id);
216
					sent_id = null;
217
					sent_id = ""; // reset value for next sentence
218
				} else {
219
					properties.put("#ud-sentid", "");
217 220
				}
218 221
				if (newdoc_id != null) {
219 222
					properties.put("#ud-newdocid", newdoc_id);
220
					newdoc_id = null;
223
					newdoc_id = null; // reset value for next sentence
224
				} else {
225
					properties.put("#ud-newdocid", "");
221 226
				}
222 227
				if (newpar_id != null) {
223 228
					properties.put("#ud-newparid", newpar_id);
224
					newpar_id = null;
229
					newpar_id = null; // reset value for next sentence
230
				} else {
231
					properties.put("#ud-newparid", "");
225 232
				}
226 233
				processor.addProperty(id, properties);
227 234
				nWords2++;
......
239 246
			
240 247
			if (processor.process(xmltxmUpdatedFile)) {
241 248
				if (xmltxmFile.delete() && FileCopy.copy(xmltxmUpdatedFile, xmltxmFile)) {
242
					
249
					if (processor.getNonActivatedRules().size() > 0) {
250
						Log.warning("Warning: some words were not imported: "+StringUtils.join(processor.getNonActivatedRules(), ", "));
251
					}
243 252
				}
244 253
				else {
245 254
					Log.warning("** Warning: annotation import failed for replace the corpus XML-TXM file: " + xmltxmFile + ". TEMP file: " + xmltxmUpdatedFile);

Formats disponibles : Unified diff