Revision 2952

tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotationsFromFile.java (revision 2952)
32 32
import java.io.FileFilter;
33 33
import java.io.IOException;
34 34
import java.util.HashMap;
35
import java.util.HashSet;
35 36

  
36 37
import javax.xml.stream.XMLStreamException;
37 38

  
......
40 41
import org.eclipse.core.commands.ExecutionEvent;
41 42
import org.eclipse.core.commands.ExecutionException;
42 43
import org.eclipse.jface.viewers.IStructuredSelection;
44
import org.eclipse.osgi.util.NLS;
43 45
import org.eclipse.ui.handlers.HandlerUtil;
44 46
import org.kohsuke.args4j.Option;
45 47
import org.txm.core.messages.TXMCoreMessages;
......
77 79
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
78 80
	String propertiesPrefix;
79 81
	
82
	@Option(name = "overwrite_cqp_properties", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
83
	Boolean overwrite_cqp_properties;
84
	
80 85
	/*
81 86
	 * (non-Javadoc)
82 87
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
......
104 109
		MainCorpus mainCorpus = corpus.getMainCorpus();
105 110
		
106 111
		try {
112
			HashSet<String> test = new HashSet<>();
113
			for (String p : ImportCONNLUAnnotationsFromDirectory.UD_PROPERTY_NAMES) {
114
				if (mainCorpus.getProperty(propertiesPrefix + p) != null) {
115
					test.add(propertiesPrefix + p);
116
				}
117
			}
118
			if (test.size() > 0 && !overwrite_cqp_properties) {
119
				Log.warning(NLS.bind("Error: can't use the {0} prefix because some properties are already used: {1}", propertiesPrefix, test));
120
				return null;
121
			}
122
			
107 123
			return ImportCONNLUAnnotationsFromDirectory.importAnnotationsFromCoNNLUFile(mainCorpus, connluFile, propertiesPrefix, textId);
108 124
		}
109 125
		catch (Exception e) {
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ExportCorpusAsCONNLU.java (revision 2952)
73 73
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
74 74
	String propertiesPrefix;
75 75
	
76
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comments options")
76
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties")
77 77
	Boolean separator = false;
78 78
	
79 79
	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
......
91 91
	@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
92 92
	Boolean insertTokenWithoutUdAnnotations;
93 93
	
94
	// UD properties that can be fixed with a default CQP property (options below): "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"
95
	@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties")
96
	Boolean separator_properties = false;
97
	
94 98
	@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
95 99
	String defaultFormPropertyName;
96 100
	
......
103 107
	@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
104 108
	String defaultXposPropertyName;
105 109
	
110
	@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "")
111
	String defaultFeatsPropertyName;
112
	
113
	@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "")
114
	String defaultHeadPropertyName;
115
	
116
	@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "")
117
	String defaultDeprelPropertyName;
118
	
119
	@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "")
120
	String defaultDepsPropertyName;
121
	
122
	@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "")
123
	String defaultMiscPropertyName;
124
	
106 125
	@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
107 126
	Boolean separator2 = false;
108 127
	
......
145 164
		try {
146 165
			return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
147 166
					defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
167
					defaultFeatsPropertyName, defaultHeadPropertyName, defaultDeprelPropertyName, defaultDepsPropertyName,
168
					defaultMiscPropertyName,
148 169
					detectGap, insertParagraphs, insertNoSpaceAfter);
149 170
		}
150 171
		catch (Exception e) {
......
178 199
	 * @throws InvalidCqpIdException
179 200
	 */
180 201
	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations,
181
			String defaultFormPropertyName, String defaultLemmaPropertyName,
182
			String defaultUposPropertyName, String defaultXposPropertyName, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
202
			String defaultFormPropertyName, String defaultLemmaPropertyName, String defaultUposPropertyName, String defaultXposPropertyName,
203
			String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName,
204
			String defaultMiscPropertyName,
205
			boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
183 206
			throws UnexpectedAnswerException,
184 207
			IOException,
185 208
			CqiServerError,
......
384 407
				if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) {
385 408
					lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions);
386 409
				}
387
				String[] upos = null;
410
				String[] uposs = null;
388 411
				if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) {
389
					upos = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
412
					uposs = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
390 413
				}
391
				String[] xpos = null;
414
				String[] xposs = null;
392 415
				if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
393
					xpos = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
416
					xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
394 417
				}
395 418
				
419
				String[] feats = null;
420
				if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) {
421
					feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions);
422
				}
423
				String[] heads = null;
424
				if (defaultHeadPropertyName != null && defaultHeadPropertyName.length() > 0) {
425
					heads = getDefaultValues(mainCorpus, defaultHeadPropertyName, sentencePositions);
426
				}
427
				String[] deprels = null;
428
				if (defaultDeprelPropertyName != null && defaultDeprelPropertyName.length() > 0) {
429
					deprels = getDefaultValues(mainCorpus, defaultDeprelPropertyName, sentencePositions);
430
				}
431
				String[] depss = null;
432
				if (defaultDepsPropertyName != null && defaultDepsPropertyName.length() > 0) {
433
					depss = getDefaultValues(mainCorpus, defaultDepsPropertyName, sentencePositions);
434
				}
435
				String[] miscs = null;
436
				if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) {
437
					miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions);
438
				}
439
				
396 440
				// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
397 441
				// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions);
398 442
				// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
......
415 459
					if (lemmas != null && lemmaValues[p].equals("_")) {
416 460
						lemmaValues[p] = lemmas[ip];
417 461
					}
418
					if (upos != null && uposValues[p].equals("_")) {
419
						uposValues[p] = upos[ip];
462
					if (uposs != null && uposValues[p].equals("_")) {
463
						uposValues[p] = uposs[ip];
420 464
					}
421
					if (xpos != null && xposValues[p].equals("_")) {
422
						xposValues[p] = xpos[ip];
465
					if (xposs != null && xposValues[p].equals("_")) {
466
						xposValues[p] = xposs[ip];
423 467
					}
468
					if (feats != null && featsValues[p].equals("_")) {
469
						featsValues[p] = feats[ip];
470
					}
471
					if (heads != null && headValues[p] < 0) {
472
						headValues[p] = Integer.parseInt(heads[ip]);
473
					}
474
					if (deprels != null && deprelValues[p].equals("_")) {
475
						deprelValues[p] = deprels[ip];
476
					}
477
					if (depss != null && depsValues[p].equals("_")) {
478
						depsValues[p] = depss[ip];
479
					}
480
					if (miscs != null && miscValues[p].equals("_")) {
481
						miscValues[p] = miscs[ip];
482
					}
424 483
				}
425 484
				
426 485
				if (insertNoSpaceAfter) {
......
441 500
				
442 501
				// fixing sentence punct limits
443 502
				while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
444
					System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
503
					// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
445 504
					int p2 = sentence.remove(0);
446 505
					sentences.get(iSentence - 1).add(p2);
447 506
				}
448 507
				//
449 508
				while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
450
					System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
509
					// System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
451 510
					int p2 = sentence.remove(sentence.size() - 1);
452 511
					sentences.get(iSentence + 1).add(0, p2);
453 512
				}
......
539 598
					writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
540 599
				}
541 600
				
542
				
543 601
				if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the insertParagraphs option is not set
544 602
					writer.println("# newpar id = " + iParagraph);
545 603
					iParagraph++;
......
572 630
	}
573 631
	
574 632
	private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
575
		String[] upos = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
576
		for (int iupos = 0; iupos < upos.length; iupos++) { // recode the || CQP multiple values to ud multiple values
577
			if (upos[iupos].length() > 2 && upos[iupos].startsWith("|") && upos[iupos].endsWith("|")) {
578
				upos[iupos] = upos[iupos].substring(1, upos[iupos].length() - 1);
633
		String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
634
		for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
635
			if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
636
				values[iupos] = values[iupos].substring(1, values[iupos].length() - 1);
579 637
			}
580 638
		}
581 639
		
582
		return upos;
640
		return values;
583 641
	}
584 642
	
585 643
	private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotationsFromDirectory.java (revision 2952)
32 32
import java.io.FileFilter;
33 33
import java.io.IOException;
34 34
import java.util.HashMap;
35
import java.util.HashSet;
35 36

  
36 37
import javax.xml.stream.XMLStreamException;
37 38

  
......
40 41
import org.eclipse.core.commands.ExecutionEvent;
41 42
import org.eclipse.core.commands.ExecutionException;
42 43
import org.eclipse.jface.viewers.IStructuredSelection;
44
import org.eclipse.osgi.util.NLS;
43 45
import org.eclipse.ui.handlers.HandlerUtil;
44 46
import org.kohsuke.args4j.Option;
45 47
import org.txm.core.messages.TXMCoreMessages;
......
68 70
	
69 71
	public static final String ID = ImportCONNLUAnnotationsFromDirectory.class.getName();
70 72
	
73
	public static final String[] UD_PROPERTY_NAMES = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
74
	
71 75
	@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory")
72 76
	File connluDirectory;
73 77
	
74 78
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
75 79
	String propertiesPrefix;
76 80
	
81
	@Option(name = "overwrite_cqp_properties", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
82
	Boolean overwrite_cqp_properties;
83
	
77 84
	/*
78 85
	 * (non-Javadoc)
79 86
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
......
101 108
		MainCorpus mainCorpus = corpus.getMainCorpus();
102 109
		
103 110
		try {
111
			HashSet<String> test = new HashSet<>();
112
			for (String p : UD_PROPERTY_NAMES) {
113
				if (mainCorpus.getProperty(propertiesPrefix + p) != null) {
114
					test.add(propertiesPrefix + p);
115
				}
116
			}
117
			if (test.size() > 0 && !overwrite_cqp_properties) {
118
				Log.warning(NLS.bind("Error: can't use the {0} prefix because some properties are already used: {1}", propertiesPrefix, test));
119
				return null;
120
			}
104 121
			return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix);
105 122
		}
106 123
		catch (Exception e) {
......
201 218
		if (textId == null || textId.length() == 0) { // no text id provided, using the connlu file name
202 219
			textId = coonluFile.getName().substring(0, coonluFile.getName().length() - 7);
203 220
		}
221
		
204 222
		Log.info("** processing text: " + textId);
205 223
		Text text = mainCorpus.getProject().getText(textId);
206 224
		if (text == null) {
......
216 234
		
217 235
		BufferedReader reader = IOUtils.getReader(coonluFile);
218 236
		String line = reader.readLine();
219
		String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
220 237
		
238
		
221 239
		int nWords2 = 0;
222 240
		int nLine = 0;
223 241
		String sent_id = null;
......
266 284
			
267 285
			HashMap<String, String> properties = new HashMap<>();
268 286
			for (int i = 0; i < split.length; i++) {
269
				properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#'
287
				properties.put("#" + propertiesPrefix + UD_PROPERTY_NAMES[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#'
270 288
			}
271 289
			
272 290
			if (sent_id != null) {

Also available in: Unified diff