Révision 3315

TXM/trunk/org.txm.conllu.rcp/.project (revision 3315)
1
<?xml version="1.0" encoding="UTF-8"?>
2
<projectDescription>
3
	<name>org.txm.conllu.rcp</name>
4
	<comment></comment>
5
	<projects>
6
	</projects>
7
	<buildSpec>
8
		<buildCommand>
9
			<name>org.eclipse.jdt.core.javabuilder</name>
10
			<arguments>
11
			</arguments>
12
		</buildCommand>
13
		<buildCommand>
14
			<name>org.eclipse.pde.ManifestBuilder</name>
15
			<arguments>
16
			</arguments>
17
		</buildCommand>
18
		<buildCommand>
19
			<name>org.eclipse.pde.SchemaBuilder</name>
20
			<arguments>
21
			</arguments>
22
		</buildCommand>
23
	</buildSpec>
24
	<natures>
25
		<nature>org.eclipse.pde.PluginNature</nature>
26
		<nature>org.eclipse.jdt.core.javanature</nature>
27
	</natures>
28
</projectDescription>
0 29

  
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 3315)
1
package org.txm.conllu.rcp.importsection;
2

  
3
import org.eclipse.swt.SWT;
4
import org.eclipse.swt.widgets.Button;
5
import org.eclipse.swt.widgets.Composite;
6
import org.eclipse.swt.widgets.Label;
7
import org.eclipse.swt.widgets.Text;
8
import org.eclipse.ui.forms.events.ExpansionAdapter;
9
import org.eclipse.ui.forms.events.ExpansionEvent;
10
import org.eclipse.ui.forms.widgets.FormToolkit;
11
import org.eclipse.ui.forms.widgets.ScrolledForm;
12
import org.eclipse.ui.forms.widgets.TableWrapData;
13
import org.eclipse.ui.forms.widgets.TableWrapLayout;
14
import org.osgi.service.prefs.Preferences;
15
import org.txm.conllu.core.preferences.UDPreferences;
16
import org.txm.objects.Project;
17
import org.txm.rcp.editors.imports.sections.ImportEditorSection;
18

  
19
public class CoNLLUSection extends ImportEditorSection {
20
	
21
	String ID = CoNLLUSection.class.getSimpleName();
22
	
23
	private static final int SECTION_SIZE = 1;
24
	
25
	Button useNewDocIdButton;
26
	Button keepWordContractionsButton;
27
	Text udPropertiesPrefixButton;
28
	private Text headPropertiesText;
29
	private Text depsPropertiesText;
30
	
31
	/**
32
	 * 
33
	 * @param toolkit2
34
	 * @param form2
35
	 * @param parent
36
	 * @param stylesave
37
	 * 
38
	 * @param moduleParams
39
	 * @param scriptName temporary parameter to detect if import module is xtzLoader.groovy
40
	 */
41
	public CoNLLUSection(FormToolkit toolkit2, ScrolledForm form2, Composite parent, int style) {
42
		
43
		super(toolkit2, form2, parent, style, "CoNLL-U");
44
		
45
		TableWrapLayout layout = new TableWrapLayout();
46
		layout.makeColumnsEqualWidth = true;
47
		layout.numColumns = 1;
48
		this.section.setLayout(layout);
49
		this.section.setLayoutData(getSectionGridData(SECTION_SIZE));
50
		this.section.setEnabled(false);
51
		
52
		this.section.addExpansionListener(new ExpansionAdapter() {
53
			
54
			@Override
55
			public void expansionStateChanged(ExpansionEvent e) {
56
				form.layout(true);
57
			}
58
		});
59
		
60
		Composite sectionClient = toolkit.createComposite(this.section);
61
		TableWrapLayout slayout = new TableWrapLayout();
62
		slayout.numColumns = 4;
63
		sectionClient.setLayout(slayout);
64
		this.section.setClient(sectionClient);
65
		
66
		useNewDocIdButton = toolkit.createButton(sectionClient, "Use new doc id when importing CoNLL-U files", SWT.CHECK);
67
		TableWrapData gdata2 = getButtonLayoutData();
68
		gdata2.colspan = 4; // one line
69
		useNewDocIdButton.setLayoutData(gdata2);
70
		
71
		keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep word contractions when importing CoNLL-U files", SWT.CHECK);
72
		gdata2 = getButtonLayoutData();
73
		gdata2.colspan = 4; // one line
74
		keepWordContractionsButton.setLayoutData(gdata2);
75
		
76
		udPropertiesPrefixButton = toolkit.createText(sectionClient, "UD properties prefix", SWT.CHECK);
77
		gdata2 = getButtonLayoutData();
78
		gdata2.colspan = 4; // one line
79
		udPropertiesPrefixButton.setLayoutData(gdata2);
80
		
81
		Label tmp4Label = toolkit.createLabel(sectionClient, "Head properties to project");
82
		tmp4Label.setToolTipText("Comma separated list of ud properties.");
83
		tmp4Label.setLayoutData(getLabelGridData());
84
		
85
		headPropertiesText = toolkit.createText(sectionClient, "UD head properties to project (comma separated list)", SWT.BORDER);
86
		gdata2 = getTextGridData();
87
		gdata2.colspan = 3; // one line
88
		headPropertiesText.setLayoutData(gdata2);
89
		
90
		tmp4Label = toolkit.createLabel(sectionClient, "Deps properties to project");
91
		tmp4Label.setToolTipText("Comma separated list of ud properties.");
92
		tmp4Label.setLayoutData(getLabelGridData());
93
		
94
		// build text edition or not button
95
		depsPropertiesText = toolkit.createText(sectionClient, "UD deps properties to project (comma separated list)", SWT.BORDER);
96
		gdata2 = getTextGridData();
97
		gdata2.colspan = 3; // one line
98
		depsPropertiesText.setLayoutData(gdata2);
99
	}
100
	
101
	@Override
102
	public void updateFields(Project project) {
103
		if (this.section.isDisposed()) return;
104
		if (project == null) return;
105
		
106
		Preferences customNode = project.getImportParameters().node("conllu");
107
		
108
		useNewDocIdButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, true)); //$NON-NLS-1$
109
		keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_CONTRACTIONS, true)); //$NON-NLS-1$
110
		udPropertiesPrefixButton.setText(customNode.get(UDPreferences.UDPREFIX, "")); //$NON-NLS-1$
111
		headPropertiesText.setText(customNode.get(UDPreferences.IMPORT_HEAD_TO_PROJECT, "upos,deprel")); //$NON-NLS-1$
112
		depsPropertiesText.setText(customNode.get(UDPreferences.IMPORT_DEPS_TO_PROJECT, "upos,deprel")); //$NON-NLS-1$
113
	}
114
	
115
	@Override
116
	public boolean saveFields(Project project) {
117
		if (this.section != null && !this.section.isDisposed()) {
118
			
119
			Preferences customNode = project.getImportParameters().node("conllu");
120
			customNode.putBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, useNewDocIdButton.getSelection()); //$NON-NLS-1$
121
			customNode.putBoolean(UDPreferences.KEEP_CONTRACTIONS, keepWordContractionsButton.getSelection()); //$NON-NLS-1$
122
			customNode.put(UDPreferences.UDPREFIX, udPropertiesPrefixButton.getText()); //$NON-NLS-1$
123
			customNode.put(UDPreferences.IMPORT_HEAD_TO_PROJECT, headPropertiesText.getText()); //$NON-NLS-1$
124
			customNode.put(UDPreferences.IMPORT_DEPS_TO_PROJECT, depsPropertiesText.getText()); //$NON-NLS-1$
125
		}
126
		return true;
127
	}
128
	
129
	@Override
130
	public boolean checkFields() {
131
		return true;
132
	}
133
	
134
	@Override
135
	public int getSectionSize() {
136
		return SECTION_SIZE;
137
	}
138
}
0 139

  
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUAnnotationSectionConfigurator.java (revision 3315)
1
package org.txm.conllu.rcp.importsection;
2

  
3
import org.txm.rcp.editors.imports.ImportEditorSectionConfigurator;
4
import org.txm.rcp.editors.imports.ImportModuleCustomization;
5

  
6

  
7
public class CoNLLUAnnotationSectionConfigurator extends ImportEditorSectionConfigurator {
8
	
9
	@Override
10
	public void installSections() {
11
		
12
		ImportModuleCustomization.additionalSections.put("conlluLoader.groovy", CoNLLUSection.class);
13
	}
14
}
0 15

  
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsCoNLLU.java (revision 3315)
1
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.conllu.rcp.commands;
29

  
30
import java.io.File;
31
import java.io.IOException;
32
import java.io.PrintWriter;
33
import java.util.ArrayList;
34
import java.util.HashMap;
35
import java.util.HashSet;
36

  
37
import org.apache.commons.lang.StringUtils;
38
import org.eclipse.core.commands.AbstractHandler;
39
import org.eclipse.core.commands.ExecutionEvent;
40
import org.eclipse.core.commands.ExecutionException;
41
import org.eclipse.jface.viewers.IStructuredSelection;
42
import org.eclipse.osgi.util.NLS;
43
import org.eclipse.ui.handlers.HandlerUtil;
44
import org.kohsuke.args4j.Option;
45
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
46
import org.txm.searchengine.cqp.CQPSearchEngine;
47
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
48
import org.txm.searchengine.cqp.clientExceptions.InvalidCqpIdException;
49
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException;
50
import org.txm.searchengine.cqp.corpus.CQPCorpus;
51
import org.txm.searchengine.cqp.corpus.MainCorpus;
52
import org.txm.searchengine.cqp.corpus.StructuralUnit;
53
import org.txm.searchengine.cqp.corpus.WordProperty;
54
import org.txm.searchengine.cqp.corpus.query.CQLQuery;
55
import org.txm.searchengine.cqp.serverException.CqiServerError;
56
import org.txm.utils.i18n.LangFormater;
57
import org.txm.utils.io.IOUtils;
58
import org.txm.utils.logger.Log;
59

  
60
/**
61
 * Export the conllu properties and CQP words into a conllu corpus of several files (one per text)
62
 * 
63
 * @author mdecorde.
64
 */
65
public class ExportCorpusAsCoNLLU extends AbstractHandler {
66
	
67
	public static final String ID = ExportCorpusAsCoNLLU.class.getName();
68
	
69
	@Option(name = "conlluResultDirectory", usage = "conlluResultDirectory", widget = "Folder", required = true, def = "conllu-result-directory")
70
	File conlluResultDirectory;
71
	
72
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
73
	String propertiesPrefix;
74
	
75
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties")
76
	Boolean separator = false;
77
	
78
	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the CoNLLU corpus", widget = "Boolean", required = true, def = "true")
79
	Boolean insertParagraphs = false;
80
	
81
	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
82
	Boolean detectGap = false;
83
	
84
	@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options")
85
	Boolean separator3 = false;
86
	
87
	@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the CoNLLU corpus", widget = "Boolean", required = true, def = "true")
88
	Boolean insertNoSpaceAfter = true;
89
	
90
	@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
91
	Boolean insertTokenWithoutUdAnnotations;
92
	
93
	// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
94
	@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties")
95
	Boolean separator_properties = false;
96
	
97
	@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
98
	String defaultFormPropertyName;
99
	
100
	@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
101
	String defaultLemmaPropertyName;
102
	
103
	@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
104
	String defaultUposPropertyName;
105
	
106
	@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
107
	String defaultXposPropertyName;
108
	
109
	@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "")
110
	String defaultFeatsPropertyName;
111
	
112
	@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "")
113
	String defaultHeadPropertyName;
114
	
115
	@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "")
116
	String defaultDeprelPropertyName;
117
	
118
	@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "")
119
	String defaultDepsPropertyName;
120
	
121
	@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "")
122
	String defaultMiscPropertyName;
123
	
124
	@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
125
	Boolean separator2 = false;
126
	
127
	@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]")
128
	String openingPunct;
129
	
130
	/**
131
	 * the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix
132
	 */
133
	public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
134
	
135
	/*
136
	 * (non-Javadoc)
137
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
138
	 */
139
	@Override
140
	public Object execute(final ExecutionEvent event) throws ExecutionException {
141
		
142
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
143
		
144
		Object s = selection.getFirstElement();
145
		if (!(s instanceof MainCorpus)) {
146
			Log.warning("Selection is not a corpus. Aborting.");
147
			return null;
148
		}
149
		
150
		if (!ParametersDialog.open(this)) {
151
			return null;
152
		}
153
		
154
		conlluResultDirectory.mkdirs();
155
		if (conlluResultDirectory == null || !conlluResultDirectory.exists() || !conlluResultDirectory.isDirectory()) {
156
			Log.warning("Error: conllu result directory does not exists: " + conlluResultDirectory);
157
			return null;
158
		}
159
		
160
		CQPCorpus corpus = (CQPCorpus) s;
161
		MainCorpus mainCorpus = corpus.getMainCorpus();
162
		
163
		try {
164
			return exportAnnotationsAsCorpus(mainCorpus, conlluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
165
					defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
166
					defaultFeatsPropertyName, defaultHeadPropertyName, defaultDeprelPropertyName, defaultDepsPropertyName,
167
					defaultMiscPropertyName,
168
					detectGap, insertParagraphs, insertNoSpaceAfter);
169
		}
170
		catch (Exception e) {
171
			Log.warning(e);
172
			Log.printStackTrace(e);
173
		}
174
		
175
		return null;
176
	}
177
	
178
	/**
179
	 * export the corpus in a directory of conllu files (one per text)
180
	 * 
181
	 * @param mainCorpus
182
	 * @param conlluResultDirectory
183
	 * @param prefix
184
	 * @param openingPunct
185
	 * @param insertTokenWithoutUdAnnotations
186
	 * @param defaultFormPropertyName
187
	 * @param defaultLemmaPropertyName
188
	 * @param defaultUposPropertyName
189
	 * @param defaultXposPropertyName
190
	 * @param detectGap
191
	 * @param insertParagraphs
192
	 * @param insertNoSpaceAfter
193
	 * @return the number of annotation exported
194
	 * @throws UnexpectedAnswerException
195
	 * @throws IOException
196
	 * @throws CqiServerError
197
	 * @throws CqiClientException
198
	 * @throws InvalidCqpIdException
199
	 */
200
	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations,
201
			String defaultFormPropertyName, String defaultLemmaPropertyName, String defaultUposPropertyName, String defaultXposPropertyName,
202
			String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName,
203
			String defaultMiscPropertyName,
204
			boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
205
			throws UnexpectedAnswerException,
206
			IOException,
207
			CqiServerError,
208
			CqiClientException, InvalidCqpIdException {
209
		
210
		if (!conlluResultDirectory.exists()) {
211
			conlluResultDirectory.mkdirs();
212
		}
213
		int numberOfWordsWritten = 0;
214
		int numberOfSentencesWritten = 0;
215
		int numberOfTextsWritten = 0;
216
		
217
		String[] textIds = mainCorpus.getCorpusTextIdsList();
218
		int[] start_limits = mainCorpus.getTextStartLimits();
219
		int[] end_limits = mainCorpus.getTextEndLimits();
220
		
221
		String lang = mainCorpus.getLang();
222
		// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang()));
223
		// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang()));
224
		
225
		for (String p : propNames) {
226
			WordProperty wp = mainCorpus.getProperty(prefix + p);
227
			if (wp == null) {
228
				Log.warning("Error: cannot find the Conllu property: " + prefix + p);
229
				return 0;
230
			}
231
		}
232
		
233
		if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) {
234
			Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true");
235
			return 0;
236
		}
237
		
238
		for (int iText = 0; iText < start_limits.length; iText++) {
239
			
240
			// Build corpus positions
241
			int[] positions = new int[end_limits[iText] - start_limits[iText] + 1];
242
			int tmp = 0;
243
			for (int n = start_limits[iText]; n <= end_limits[iText]; n++) {
244
				positions[tmp++] = n;
245
			}
246
			numberOfWordsWritten += positions.length;
247
			
248
			// Get UD properties
249
			WordProperty wp;
250
			wp = mainCorpus.getProperty(prefix + "id");
251
			String[] tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
252
			int[] idValues = new int[tmpValues.length];
253
			for (int iId = 0; iId < tmpValues.length; iId++) {
254
				if (tmpValues[iId].length() > 0 && !tmpValues[iId].equals("_") && !tmpValues[iId].equals("__UNDEF__")) {
255
					idValues[iId] = Integer.parseInt(tmpValues[iId]);
256
				}
257
				else {
258
					idValues[iId] = 0;
259
				}
260
			}
261
			tmpValues = null;
262
			
263
			WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form");
264
			String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions);
265
			fixUNDEFValues(formValues);
266
			
267
			wp = mainCorpus.getProperty(prefix + "lemma");
268
			String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
269
			fixUNDEFValues(lemmaValues);
270
			
271
			wp = mainCorpus.getProperty(prefix + "upos");
272
			String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
273
			fixUNDEFValues(uposValues);
274
			
275
			wp = mainCorpus.getProperty(prefix + "xpos");
276
			String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
277
			fixUNDEFValues(xposValues);
278
			
279
			wp = mainCorpus.getProperty(prefix + "feats");
280
			String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
281
			fixUNDEFValues(featsValues);
282
			
283
			wp = mainCorpus.getProperty(prefix + "head");
284
			// String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
285
			tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
286
			int[] headValues = new int[tmpValues.length];
287
			for (int iId = 0; iId < tmpValues.length; iId++) {
288
				if (tmpValues[iId].length() > 0 && !tmpValues[iId].equals("_") && !tmpValues[iId].equals("__UNDEF__")) {
289
					headValues[iId] = Integer.parseInt(tmpValues[iId]);
290
				}
291
				else {
292
					headValues[iId] = -1;
293
				}
294
			}
295
			tmpValues = null;
296
			
297
			wp = mainCorpus.getProperty(prefix + "deprel");
298
			String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
299
			fixUNDEFValues(deprelValues);
300
			
301
			wp = mainCorpus.getProperty(prefix + "deps");
302
			String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
303
			fixUNDEFValues(depsValues);
304
			
305
			wp = mainCorpus.getProperty(prefix + "misc");
306
			String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
307
			fixUNDEFValues(miscValues);
308
			
309
			HashSet<Integer> paragraphsStartPositions = new HashSet<>();
310
			if (insertParagraphs) {
311
				StructuralUnit p_struct = mainCorpus.getStructuralUnit("p");
312
				if (p_struct == null) {
313
					Log.warning(NLS.bind("** insertParagraph parameter is set, but there are no *p* structure (no paragraph) in the {0} CQP corpus. The insertParagraph parameter will be ignored.",
314
							mainCorpus));
315
				}
316
				else {
317
					for (int position : mainCorpus.query(new CQLQuery("<p> [_.text_id=\"" + textIds[iText] + "\"]"), "textParagraphPositions", false).getStarts()) {
318
						paragraphsStartPositions.add(position);
319
					}
320
				}
321
			}
322
			
323
			HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"sentid");
324
			HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"newdocid");
325
			
326
			// build sentence, first pass using UD word sentence positions
327
			ArrayList<ArrayList<Integer>> sentences = new ArrayList<>();
328
			ArrayList<Integer> tmpSentence = new ArrayList<>();
329
			for (int p = 0; p < positions.length; p++) {
330
				// System.out.println("p=" + p + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " feats="
331
				// + featsValues[p] + " head="
332
				// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
333
				if (sentidStartPositions.containsKey(p)) { // new ud sentence
334
					
335
					if (tmpSentence.size() > 0) {
336
						sentences.add(new ArrayList<>(tmpSentence));
337
					}
338
					
339
					// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + "
340
					// feats="
341
					// + featsValues[p] + " head="
342
					// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
343
					tmpSentence.clear();
344
				}
345
				
346
				if (insertTokenWithoutUdAnnotations) {
347
					tmpSentence.add(p); // insert all tokens
348
				}
349
				else if (idValues[p] != 0) {
350
					tmpSentence.add(p); // insert all tokens
351
				}
352
				
353
			}
354
			positions = null; // free memory
355
			
356
			// fixing sentences
357
			for (int s = 0; s < sentences.size(); s++) {
358
				
359
				// fix only ud sentences limits
360
				ArrayList<Integer> sentence = sentences.get(s);
361
				
362
				if (sentidStartPositions.get(sentence.get(0)) == null) {
363
					continue; // this is not a UD sentence
364
				}
365
				
366
				int max = -1;
367
				int imax = 0;
368
				for (int ip = 0; ip < sentence.size(); ip++) {
369
					int p = sentence.get(ip);
370
					if (max < idValues[p]) {
371
						max = idValues[p];
372
						imax = ip;
373
					}
374
				}
375
				
376
				ArrayList<Integer> newSentence = new ArrayList<>();
377
				for (int ip = imax + 1; ip < sentence.size(); ip++) {
378
					newSentence.add(sentence.get(ip));
379
					sentence.remove(ip);
380
					ip--;
381
				}
382
				if (newSentence.size() == 1) { // the new sentence size is 1, resinsert it
383
					sentence.addAll(newSentence);
384
					newSentence.clear();
385
				}
386
				if (newSentence.size() > 0) {
387
					//System.out.println("INSERT " + newSentence);
388
					sentences.add(s + 1, newSentence);
389
				}
390
			}
391
			
392
			if (tmpSentence.size() > 0) { // add last sentence
393
				sentences.add(new ArrayList<>(tmpSentence));
394
			}
395
			
396
			// fixing sentence __NULL__ ud properties
397
			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
398
				ArrayList<Integer> sentence = sentences.get(iSentence);
399
				
400
				int[] sentencePositions = new int[sentence.size()];
401
				for (int p = 0; p < sentence.size(); p++) {
402
					sentencePositions[p] = sentence.get(p);
403
				}
404
				
405
				// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
406
				String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
407
				
408
				String[] words = null;
409
				if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) {
410
					words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions);
411
				}
412
				String[] lemmas = null;
413
				if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) {
414
					lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions);
415
				}
416
				String[] uposs = null;
417
				if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) {
418
					uposs = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
419
				}
420
				String[] xposs = null;
421
				if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
422
					xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
423
				}
424
				
425
				String[] feats = null;
426
				if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) {
427
					feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions);
428
				}
429
				String[] heads = null;
430
				if (defaultHeadPropertyName != null && defaultHeadPropertyName.length() > 0) {
431
					heads = getDefaultValues(mainCorpus, defaultHeadPropertyName, sentencePositions);
432
				}
433
				String[] deprels = null;
434
				if (defaultDeprelPropertyName != null && defaultDeprelPropertyName.length() > 0) {
435
					deprels = getDefaultValues(mainCorpus, defaultDeprelPropertyName, sentencePositions);
436
				}
437
				String[] depss = null;
438
				if (defaultDepsPropertyName != null && defaultDepsPropertyName.length() > 0) {
439
					depss = getDefaultValues(mainCorpus, defaultDepsPropertyName, sentencePositions);
440
				}
441
				String[] miscs = null;
442
				if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) {
443
					miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions);
444
				}
445
				
446
				// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
447
				// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions);
448
				// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
449
				// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
450
				
451
				// fix ud properties using CQP values
452
				for (int ip = 0; ip < sentence.size(); ip++) {
453
					
454
					int p = sentence.get(ip);
455
					
456
					// new word
457
					if (miscValues[p].equals("_")) {
458
						miscValues[p] = "XmlId=" + ids[ip];
459
					}
460
					
461
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
462
					if (words != null && formValues[p].equals("_")) {
463
						formValues[p] = words[ip];
464
					}
465
					if (lemmas != null && lemmaValues[p].equals("_")) {
466
						lemmaValues[p] = lemmas[ip];
467
					}
468
					if (uposs != null && uposValues[p].equals("_")) {
469
						uposValues[p] = uposs[ip];
470
					}
471
					if (xposs != null && xposValues[p].equals("_")) {
472
						xposValues[p] = xposs[ip];
473
					}
474
					if (feats != null && featsValues[p].equals("_")) {
475
						featsValues[p] = feats[ip];
476
					}
477
					if (heads != null && headValues[p] < 0) {
478
						headValues[p] = Integer.parseInt(heads[ip]);
479
					}
480
					if (deprels != null && deprelValues[p].equals("_")) {
481
						deprelValues[p] = deprels[ip];
482
					}
483
					if (depss != null && depsValues[p].equals("_")) {
484
						depsValues[p] = depss[ip];
485
					}
486
					if (miscs != null && miscValues[p].equals("_")) {
487
						miscValues[p] = miscs[ip];
488
					}
489
				}
490
				
491
				if (insertNoSpaceAfter) {
492
					for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed
493
						int p = sentence.get(ip);
494
						// insertion activated
495
						if (!miscValues[p].contains("SpaceAfter=")) { // only update if not present
496
							if (LangFormater.isSpaceAfterNotNeeded(formValues[p], lang)) {
497
								miscValues[p] += "|SpaceAfter=No";
498
							}
499
							else if (formValues.length > (p + 1) && LangFormater.isSpaceBeforeNotNeeded(formValues[p + 1], lang)) {
500
								// if next token needs a space before, set SpaceAfter=Yes to the previous token
501
								miscValues[p] += "|SpaceAfter=No";
502
							}
503
						}
504
					}
505
				}
506
				
507
				// fixing sentence punct limits
508
				while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
509
					// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
510
					int p2 = sentence.remove(0);
511
					sentences.get(iSentence - 1).add(p2);
512
				}
513
				//
514
				while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
515
					// System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
516
					int p2 = sentence.remove(sentence.size() - 1);
517
					sentences.get(iSentence + 1).add(0, p2);
518
				}
519
				
520
				if (sentence.size() == 0) { // sentence was depleted after fixing it
521
					sentences.remove(iSentence);
522
					iSentence--;
523
					continue;
524
				}
525
			}
526
			
527
			for (int s = 0; s < sentences.size(); s++) {
528
				
529
				// fix only ud sentences limits
530
				ArrayList<Integer> sentence = sentences.get(s);
531
				HashMap<Integer, Integer> oldToNewIds = new HashMap<>();
532
				for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids
533
					int p = sentence.get(ip);
534
					
535
					if (idValues[p] != 0) { // store "old id -> new id"
536
						oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N
537
					}
538
				}
539
				
540
				// fixing head and set missing head to 0 and root
541
				for (int ip = 0; ip < sentence.size(); ip++) {
542
					int p = sentence.get(ip);
543
					
544
					// fixing id value
545
					idValues[p] = (ip + 1);  // from 1 to N
546
					
547
					// fixing head values
548
					if (oldToNewIds.containsKey(headValues[p])) {
549
						headValues[p] = oldToNewIds.get(headValues[p]);
550
					}
551
					else if (headValues[p] != 0) { // new word, set to default values
552
						headValues[p] = 0;
553
						deprelValues[p] = "_";
554
						depsValues[p] = "_";
555
					}
556
				}
557
			}
558
			
559
			// writing sentences
560
			File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu");
561
			PrintWriter writer = IOUtils.getWriter(resultConlluFile);
562
			
563
			int iParagraph = 1;
564
			
565
			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
566
				ArrayList<Integer> sentence = sentences.get(iSentence);
567
				
568
				int[] sentencePositions = new int[sentence.size()];
569
				for (int p = 0; p < sentence.size(); p++) {
570
					sentencePositions[p] = sentence.get(p);
571
				}
572
				
573
				String[] gap = null;
574
				if (detectGap && mainCorpus.getProperty("gap") != null) {
575
					gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
576
				}
577
				
578
				String[] tokens = new String[sentence.size()];
579
				for (int ip = 0; ip < sentence.size(); ip++) {
580
					tokens[ip] = formValues[sentence.get(ip)];
581
				}
582
				
583
				if (insertNoSpaceAfter) {
584
					writer.println("# text = " + LangFormater.format(StringUtils.join(tokens, " "), mainCorpus.getLang()));
585
				}
586
				else {
587
					writer.println("# text = " + StringUtils.join(tokens, " "));
588
				}
589
				
590
				if (newdocidStartPositions.containsKey(sentence.get(0))) {
591
					writer.println("# newdoc id = " + newdocidStartPositions.get(sentence.get(0)));
592
				}
593
				else {
594
					writer.println("# newdoc id = " + textIds[iText]);
595
				}
596
				
597
				boolean foundSentId = false;
598
				for (int ip : sentence) {
599
					if (!foundSentId && sentidStartPositions.containsKey(ip)) {
600
						writer.println("# sent_id = " + sentidStartPositions.get(ip));
601
						foundSentId = true;
602
					}
603
				}
604
				if (!foundSentId) { // no sent_id found
605
					writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
606
				}
607
				
608
				if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set
609
					writer.println("# newpar id = " + iParagraph);
610
					iParagraph++;
611
				}
612
				
613
				for (int ip = 0; ip < sentence.size(); ip++) {
614
					int p = sentence.get(ip);
615
					
616
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
617
					writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
618
							+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
619
							+ "\t" + depsValues[p] + "\t" + miscValues[p]);
620
					
621
					if (gap != null && gap[ip].equals("next")) {
622
						writer.println("# gap");
623
					}
624
				}
625
				writer.println("");
626
				numberOfSentencesWritten++;
627
			}
628
			writer.close();
629
			
630
			System.out.println(" Text done: " + resultConlluFile);
631
			numberOfTextsWritten++;
632
		}
633
		
634
		System.out.println("# words written: " + numberOfWordsWritten);
635
		System.out.println("# sentences written: " + numberOfSentencesWritten);
636
		System.out.println("# texts written: " + numberOfTextsWritten);
637
		
638
		return numberOfWordsWritten;
639
	}
640
	
641
	private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
642
		String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
643
		for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
644
			if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
645
				values[iupos] = values[iupos].substring(1, values[iupos].length() - 1);
646
			}
647
		}
648
		
649
		return values;
650
	}
651
	
652
	private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
653
		HashMap<Integer, String> sentidStartPositions = new HashMap<>();
654
		int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+");
655
		String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids);
656
		for (int iId = 0; iId < ids.length; iId++) {
657
			int id = ids[iId];
658
			int[] pp = CQPSearchEngine.getCqiClient().id2Cpos(mainCorpus.getProperty(property).getQualifiedName(), id);
659
			for (int p : pp) {
660
				sentidStartPositions.put(p, strs[iId]);
661
			}
662
		}
663
		
664
		return sentidStartPositions;
665
	}
666
	
667
	private static void fixUNDEFValues(String[] values) {
668
		for (int i = 0; i < values.length; i++) {
669
			if (values[i].equals("__UNDEF__") || values[i].equals("")) {
670
				values[i] = "_";
671
			}
672
		}
673
	}
674
}
675

  
676

  
0 677

  
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ImportCoNLLUAnnotationsFromDirectory.java (revision 3315)
1
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.conllu.rcp.commands;
29

  
30
import java.io.File;
31
import java.io.FileFilter;
32
import java.io.IOException;
33
import java.util.Arrays;
34
import java.util.HashSet;
35
import java.util.Set;
36

  
37
import javax.xml.stream.XMLStreamException;
38

  
39
import org.eclipse.core.commands.AbstractHandler;
40
import org.eclipse.core.commands.ExecutionEvent;
41
import org.eclipse.core.commands.ExecutionException;
42
import org.eclipse.jface.viewers.IStructuredSelection;
43
import org.eclipse.osgi.util.NLS;
44
import org.eclipse.ui.handlers.HandlerUtil;
45
import org.kohsuke.args4j.Option;
46
import org.txm.conllu.core.function.ImportCoNLLUAnnotations;
47
import org.txm.conllu.core.preferences.UDPreferences;
48
import org.txm.core.messages.TXMCoreMessages;
49
import org.txm.rcp.commands.workspace.UpdateCorpus;
50
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
51
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
52
import org.txm.searchengine.cqp.corpus.CQPCorpus;
53
import org.txm.searchengine.cqp.corpus.MainCorpus;
54
import org.txm.searchengine.cqp.serverException.CqiServerError;
55
import org.txm.utils.logger.Log;
56

  
57
/**
58
 * Import CoNLLU annotations into a TXM corpus
59
 * 
60
 * If the corpus already contains CoNLLU annotations, they are replaced
61
 * 
62
 * @author mdecorde.
63
 */
64
public class ImportCoNLLUAnnotationsFromDirectory extends AbstractHandler {
65
	
66
	public static final String ID = ImportCoNLLUAnnotationsFromDirectory.class.getName();
67
	
68
	@Option(name = "conlluDirectory", usage = "conlluDirectory", widget = "Folder", required = true, def = "conllu-directory")
69
	File conlluDirectory;
70
	
71
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
72
	String propertiesPrefix;
73
	
74
	@Option(name = "udPropertiesToImport", usage = "to create the ud properties", widget = "StringArrayMultiple", metaVar="form	lemma	upos	xpos	feats	head	deprel	deps	misc", required = true, def = "form,lemma,upos,xpos,feats,head,deprel,deps,misc")
75
	String udPropertiesToImport;
76
	
77
	@Option(name = "overwrite_cqp_properties", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
78
	Boolean overwrite_cqp_properties;
79
	
80
	@Option(name = "normalize_word_ids", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
81
	Boolean normalize_word_ids;
82
		
83
	@Option(name = "headPropertiesToProject", usage = "to create the head-XYZ properties from the word head", widget = "StringArrayMultiple", metaVar="form	lemma	upos	xpos	feats	head	deprel	deps	misc", required = true, def = "deprel,upos")
84
	String headPropertiesToProject;
85
	
86
	@Option(name = "depsPropertiesToProject", usage = "to create the dep-XYZ from the word dependancies", widget = "StringArrayMultiple", metaVar="form	lemma	upos	xpos	feats	head	deprel	deps	misc", required = true, def = "deprel,upos")
87
	String depsPropertiesToProject;
88
	
89
	/*
90
	 * (non-Javadoc)
91
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
92
	 */
93
	@Override
94
	public Object execute(final ExecutionEvent event) throws ExecutionException {
95
		
96
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
97
		
98
		Object s = selection.getFirstElement();
99
		if (!(s instanceof MainCorpus)) {
100
			Log.warning("Selection is not a corpus. Aborting.");
101
			return null;
102
		}
103
		
104
		if (!ParametersDialog.open(this)) {
105
			return null;
106
		}
107
		if (conlluDirectory == null || !conlluDirectory.exists() || !conlluDirectory.isDirectory() || conlluDirectory.listFiles().length == 0) {
108
			Log.warning("Error: conllu directory is empty: " + conlluDirectory);
109
			return null;
110
		}
111
		
112
		CQPCorpus corpus = (CQPCorpus) s;
113
		MainCorpus mainCorpus = corpus.getMainCorpus();
114
		
115
		try {
116
			HashSet<String> test = new HashSet<>();
117
			for (String p : ImportCoNLLUAnnotations.UD_PROPERTY_NAMES) {
118
				if (mainCorpus.getProperty(propertiesPrefix + p) != null) {
119
					test.add(propertiesPrefix + p);
120
				}
121
			}
122
			if (test.size() > 0 && !overwrite_cqp_properties) {
123
				Log.warning(NLS.bind("Error: can't use the {0} prefix because some properties are already used: {1}", propertiesPrefix, test));
124
				return null;
125
			}
126
			
127
			return importAnnotations(mainCorpus, conlluDirectory, propertiesPrefix, normalize_word_ids, 
128
					new HashSet<String>(Arrays.asList(headPropertiesToProject.split(","))), 
129
					new HashSet<String>(Arrays.asList(depsPropertiesToProject.split(","))),
130
					new HashSet<String>(Arrays.asList(udPropertiesToImport.split(","))));
131
		}
132
		catch (Exception e) {
133
			Log.warning(e);
134
			e.printStackTrace();
135
		}
136
		
137
		return null;
138
	}
139
	
140
	/**
141
	 * 
142
	 * if import CoNLLU annotations in the corpus with the same name already exists, it is replaced
143
	 * 
144
	 * @param corpus
145
	 * @param conlluDirectory
146
	 * @param propertiesPrefix
147
	 * @return the number of imported annotations
148
	 * @throws CqiClientException
149
	 * @throws CqiServerError
150
	 * @throws IOException
151
	 * @throws XMLStreamException
152
	 */
153
	public static int importAnnotations(MainCorpus mainCorpus, File conlluDirectory, String propertiesPrefix, Boolean normalizeWordIds, 
154
			Set<String> headPropertiesToProject, Set<String> depsPropertiesToProject, Set<String> udPropertiesToImport) throws IOException, CqiServerError, CqiClientException, XMLStreamException {
155
		
156
		Log.info(TXMCoreMessages.bind("Importing CONLL-u annotations of {0} in {1} using the ''{2}'' prefix...", conlluDirectory, mainCorpus, propertiesPrefix));
157
		
158
		File[] files = conlluDirectory.listFiles(new FileFilter() {
159
			@Override
160
			public boolean accept(File file) {
161
				return file.isFile() && file.getName().endsWith(".conllu");
162
			}
163
		});
164
		
165
		int nTextProcessed = 0;
166
		int nWordsInserted = 0;
167
		for (File coonluFile : files) {
168
			
169
			nWordsInserted += ImportCoNLLUAnnotations._importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null, normalizeWordIds, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
170
			nTextProcessed++;
171
		}
172
		
173
		if (nTextProcessed == 0) {
174
			Log.warning("** No text to process. Aborting.");
175
			return 0;
176
		}
177
		
178
		if (nWordsInserted == 0) {
179
			Log.warning("** No annotation imported. Aborting.");
180
			return 0;
181
		}
182
		
183
		Log.info("XML-TXM source files updated. Updating indexes...");
184
		
185
		UDPreferences.getInstance().setProjectPreferenceValue(mainCorpus.getProject(), UDPreferences.UDPREFIX, propertiesPrefix);
186
		
187
		UpdateCorpus.update(mainCorpus);
188
		
189
		Log.info("Done.");
190
		
191
		return nWordsInserted;
192
	}
193
	
194
	/**
195
	 * 
196
	 * if import CoNLLU annotations in the corpus with the same name already exists, it is replaced
197
	 * 
198
	 * @param corpus
199
	 * @param conlluFile
200
	 * @param propertiesPrefix
201
	 * @param normalize_word_ids
202
	 * @return the number of imported annotations
203
	 * @throws CqiClientException
204
	 * @throws CqiServerError
205
	 * @throws IOException
206
	 * @throws XMLStreamException
207
	 */
208
	public static int importAnnotationsFromCoNLLUFile(MainCorpus mainCorpus, File conlluFile, String propertiesPrefix, String textId, Boolean normalize_word_ids, 
209
			Set<String> headPropertiesToProject, Set<String> depsPropertiesToProject, Set<String> udPropertiesToImport) throws IOException,
210
			CqiServerError, CqiClientException, XMLStreamException {
211
		Log.info(TXMCoreMessages.bind("Importing CONLL-u annotations of {0} in {1} using the ''{2}'' prefix...", conlluFile, mainCorpus, propertiesPrefix));
212
		
213
		int nWordsInserted = ImportCoNLLUAnnotations._importAnnotations(conlluFile, mainCorpus, propertiesPrefix, textId, normalize_word_ids, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
214
		
215
		if (nWordsInserted == 0) {
216
			Log.warning("** No annotation imported. Aborting.");
217
			return 0;
218
		}
219
		
220
		Log.info("XML-TXM source files updated. Updating indexes...");
221
		
222
		UDPreferences.getInstance().setProjectPreferenceValue(mainCorpus.getProject(), UDPreferences.UDPREFIX, propertiesPrefix);
223
		
224
		UpdateCorpus.update(mainCorpus);
225
		
226
		Log.info("Done.");
227
		
228
		return nWordsInserted;
229
	}
230

  
231
}
0 232

  
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ImportCoNLLUAnnotationsFromFile.java (revision 3315)
1
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.conllu.rcp.commands;
29

  
30
import java.io.File;
31
import java.util.Arrays;
32
import java.util.HashSet;
33

  
34
import org.eclipse.core.commands.AbstractHandler;
35
import org.eclipse.core.commands.ExecutionEvent;
36
import org.eclipse.core.commands.ExecutionException;
37
import org.eclipse.jface.viewers.IStructuredSelection;
38
import org.eclipse.osgi.util.NLS;
39
import org.eclipse.ui.handlers.HandlerUtil;
40
import org.kohsuke.args4j.Option;
41
import org.txm.conllu.core.function.ImportCoNLLUAnnotations;
42
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
43
import org.txm.searchengine.cqp.corpus.CQPCorpus;
44
import org.txm.searchengine.cqp.corpus.MainCorpus;
45
import org.txm.utils.logger.Log;
46

  
47
/**
48
 * Import CoNLLU annotations into a TXM corpus
49
 * 
50
 * IF the corpus already contains CoNLLU annotations, they are replaced
51
 * 
52
 * @author mdecorde.
53
 */
54
public class ImportCoNLLUAnnotationsFromFile extends AbstractHandler {
55
	
56
	public static final String ID = ImportCoNLLUAnnotationsFromFile.class.getName();
57
	
58
	@Option(name = "conlluFile", usage = "CoNLL-U file", widget = "FileOpen", required = true, def = "file.conllu")
59
	File conlluFile;
60
	
61
	@Option(name = "textId", usage = "Identifier of the text to update", widget = "String", required = true, def = "text-id")
62
	String textId;
63
	
64
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
65
	String propertiesPrefix;
66
	
67
	@Option(name = "udPropertiesToImport", usage = "to create the ud properties", widget = "StringArrayMultiple", metaVar="form	lemma	upos	xpos	feats	head	deprel	deps	misc", required = true, def = "form,lemma,upos,xpos,feats,head,deprel,deps,misc")
68
	String udPropertiesToImport;
69
	
70
	@Option(name = "overwrite_cqp_properties", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
71
	Boolean overwrite_cqp_properties;
72
	
73
	@Option(name = "normalize_word_ids", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
74
	Boolean normalize_word_ids;
75
	
76
	@Option(name = "headPropertiesToProject", usage = "to create the headXYZ properties", widget = "StringArrayMultiple", metaVar="form	lemma	upos	xpos	feats	head	deprel	deps	misc", required = true, def = "deprel,upos")
77
	String headPropertiesToProject;
78
	
79
	@Option(name = "depsPropertiesToProject", usage = "to create the depXYZ and outdeprel properties", widget = "StringArrayMultiple", metaVar="form	lemma	upos	xpos	feats	head	deprel	deps	misc", required = true, def = "deprel,upos")
80
	String depsPropertiesToProject;
81
	
82
	/*
83
	 * (non-Javadoc)
84
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
85
	 */
86
	@Override
87
	public Object execute(final ExecutionEvent event) throws ExecutionException {
88
		
89
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
90
		
91
		Object s = selection.getFirstElement();
92
		if (!(s instanceof MainCorpus)) {
93
			Log.warning("Selection is not a corpus. Aborting.");
94
			return null;
95
		}
96
		
97
		if (!ParametersDialog.open(this)) {
98
			return null;
99
		}
100
		if (conlluFile == null || !conlluFile.exists() || !conlluFile.isFile()) {
101
			Log.warning("Error: cannot access to the conllu file: " + conlluFile);
102
			return null;
103
		}
104
		
105
		CQPCorpus corpus = (CQPCorpus) s;
106
		MainCorpus mainCorpus = corpus.getMainCorpus();
107
		
108
		try {
109
			HashSet<String> test = new HashSet<>();
110
			for (String p : ImportCoNLLUAnnotations.UD_PROPERTY_NAMES) {
111
				if (mainCorpus.getProperty(propertiesPrefix + p) != null) {
112
					test.add(propertiesPrefix + p);
113
				}
114
			}
115
			if (test.size() > 0 && !overwrite_cqp_properties) {
116
				Log.warning(NLS.bind("Error: can't use the {0} prefix because some properties are already used: {1}", propertiesPrefix, test));
117
				return null;
118
			}
119
			
120
			return ImportCoNLLUAnnotationsFromDirectory.importAnnotationsFromCoNLLUFile(mainCorpus, conlluFile, propertiesPrefix, textId, normalize_word_ids, 
121
					new HashSet<String>(Arrays.asList(headPropertiesToProject.split(","))), 
122
					new HashSet<String>(Arrays.asList(depsPropertiesToProject.split(","))),
123
					new HashSet<String>(Arrays.asList(udPropertiesToImport.split(","))));
124
		}
125
		catch (Exception e) {
126
			Log.warning(e);
127
			Log.printStackTrace(e);
128
		}
129
		
130
		return null;
131
	}
132
}
0 133

  
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/preferences/CoNLLUSearchPreferencePage.java (revision 3315)
1
package org.txm.conllu.rcp.preferences;
2

  
3
import org.eclipse.jface.preference.BooleanFieldEditor;
4
import org.eclipse.jface.preference.StringFieldEditor;
5
import org.eclipse.ui.IWorkbench;
6
import org.txm.conllu.core.preferences.UDPreferences;
7
import org.txm.rcp.IImageKeys;
8
import org.txm.rcp.preferences.TXMPreferencePage;
9
import org.txm.rcp.preferences.TXMPreferenceStore;
10

  
11
/**
12
 * UD preferences page
13
 * 
14
 * @author mdecorde
15
 *
16
 */
17
public class CoNLLUSearchPreferencePage extends TXMPreferencePage {
18

  
19
	@Override
20
	public void createFieldEditors() {
21
		this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_USE_NEW_DOC_ID, "Use new odc id when importing CoNLL-U files", this.getFieldEditorParent()));
22
		this.addField(new BooleanFieldEditor(UDPreferences.KEEP_CONTRACTIONS, "Keep word contractions when importing CoNLL-U files", this.getFieldEditorParent()));
23
		this.addField(new StringFieldEditor(UDPreferences.UDPREFIX, "UD properties prefix", this.getFieldEditorParent()));
24
		this.addField(new StringFieldEditor(UDPreferences.IMPORT_HEAD_TO_PROJECT, "UD head properties to project (comma separated list)", this.getFieldEditorParent()));
25
		this.addField(new StringFieldEditor(UDPreferences.IMPORT_DEPS_TO_PROJECT, "UD deps properties to project (comma separated list)", this.getFieldEditorParent()));
26
		//this.addField(new StringFieldEditor(UDPreferences.DEFAULT_TPROPERTY, "Default T property", this.getFieldEditorParent()));
27
		//this.addField(new StringFieldEditor(UDPreferences.DEFAULT_NTPROPERTY, "Default NT property", this.getFieldEditorParent()));
28
	}
29
	
30
	/*
31
	 * (non-Javadoc)
32
	 * 
33
	 * @see
34
	 * org.eclipse.ui.IWorkbenchPreferencePage#init(org.eclipse.ui.IWorkbench)
35
	 */
36
	@Override
37
	public void init(IWorkbench workbench) {
38
		this.setPreferenceStore(new TXMPreferenceStore(UDPreferences.getInstance().getPreferencesNodeQualifier()));
39
		this.setDescription("UD");
40
		this.setImageDescriptor(IImageKeys.getImageDescriptor(this.getClass(), "icons/functions/UD.png"));
41
	}
42
}
0 43

  
TXM/trunk/org.txm.conllu.rcp/build.properties (revision 3315)
1
source.. = src/
2
output.. = bin/
3
bin.includes = META-INF/,\
4
               .,\
5
               plugin.xml,\
6
               icons/
0 7

  
TXM/trunk/org.txm.conllu.rcp/plugin.xml (revision 3315)
1
<?xml version="1.0" encoding="UTF-8"?>
2
<?eclipse version="3.4"?>
3
<plugin>
4
   <extension
5
         point="org.eclipse.ui.commands">
6
      <command
7
            categoryId="TreeSearch4TXM.commands.category"
8
            defaultHandler="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromDirectory"
9
            id="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromDirectory"
10
            name="Import CONLL-u Annotations from directory...">
11
      </command>
12
      <command
13
            categoryId="TreeSearch4TXM.commands.category"
14
            defaultHandler="org.txm.conllu.rcp.commands.ExportCorpusAsCoNLLU"
15
            id="org.txm.conllu.rcp.commands.ExportCorpusAsCoNLLU"
16
            name="Export CONLL-u Annotations...">
17
      </command>
18
      <command
19
            categoryId="TreeSearch4TXM.commands.category"
20
            defaultHandler="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromFile"
21
            id="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromFile"
22
            name="Import CONLL-u Annotations from file...">
23
      </command>
24
   </extension>
25
   <extension
26
         point="org.eclipse.ui.menus">
27
      <menuContribution
28
            locationURI="menu:menu.file.import?before=menu.file.import.separator.software">
29
         <command
30
               commandId="org.txm.rcp.handlers.scripts.ExecuteImportScript"
31
               icon="icons/functions/UD.png"
32
               label="CoNLL-U + CSV"
33
               style="push">
34
            <parameter
35
                  name="org.txm.rcp.commands.commandParameter3"
36
                  value="conllu/conlluLoader.groovy">
37
            </parameter>
38
         </command>
39
      </menuContribution>
40
      <menuContribution
41
            locationURI="menu:menu.file.export">
42
         <command
43
               commandId="org.txm.conllu.rcp.commands.ExportCorpusAsCoNLLU"
44
               icon="icons/functions/UD.png"
45
               label="Export corpus as CONLL-U..."
46
               style="push">
47
            <visibleWhen
48
                  checkEnabled="false">
49
               <reference
50
                     definitionId="OneMainCorpusSelected">
51
               </reference>
52
            </visibleWhen>
53
         </command>
54
      </menuContribution>
55
      <menuContribution
56
            locationURI="menu:menu.corpus.import">
57
         <command
58
               commandId="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromDirectory"
59
               icon="icons/functions/UDplus.png"
60
               style="push">
61
            <visibleWhen
62
                  checkEnabled="false">
63
               <or>
64
                  <test
65
                        forcePluginActivation="true"
66
                        property="org.txm.rcp.testers.TreeSearchReady"
67
                        value="TreeSearchReady">
68
                  </test>
69
                  <reference
70
                        definitionId="OneMainCorpusSelected">
71
                  </reference>
72
               </or>
73
            </visibleWhen>
74
         </command>
75
         <command
76
               commandId="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromFile"
77
               icon="icons/functions/UDplus.png"
78
               style="push">
79
            <visibleWhen
80
                  checkEnabled="false">
81
               <or>
82
                  <test
83
                        forcePluginActivation="true"
84
                        property="org.txm.rcp.testers.TreeSearchReady"
85
                        value="TreeSearchReady">
86
                  </test>
87
                  <reference
88
                        definitionId="OneMainCorpusSelected">
89
                  </reference>
90
               </or>
91
            </visibleWhen>
92
         </command>
93
      </menuContribution>
94
   </extension>
95
   <extension
96
         point="org.eclipse.ui.preferencePages">
97
      <page
98
            category="org.txm.treesearch.preferences.TreeSearchPreferencePage"
99
            class="org.txm.conllu.rcp.preferences.CoNLLUSearchPreferencePage"
100
            id="org.txm.conllu.rcp.preferences.CoNLLUSearchPreferencePage"
101
            name="UD">
102
      </page>
103
   </extension>
104
   <extension
105
         point="org.txm.rcp.importsection">
106
      <importsectionconfiguration
107
            class="org.txm.conllu.rcp.importsection.CoNLLUAnnotationSectionConfigurator">
108
      </importsectionconfiguration>
109
   </extension>
110

  
111
</plugin>
0 112

  
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff