Révision 2941

tmp/org.txm.utils/src/org/txm/utils/i18n/LangFormater.java (revision 2941)
127 127
	 * @return the string
128 128
	 */
129 129
	public static String format(String str, String lang) {
130
		if (lang == null)
130
		if (lang == null) {
131 131
			lang = Locale.getDefault().getLanguage();
132
		}
133

  
132 134
		for (String punc : getNoSpaceAfter(lang)) {
133 135
			str = str.replace(punc + " ", punc); //$NON-NLS-1$
134 136
		}
......
140 142
		return str;
141 143
	}
142 144

  
145
	public static boolean isSpaceAfterNotNeeded(String str, String lang) {
146
		for (String t : getNoSpaceAfter(lang)) {
147
			if (str.endsWith(t)) {
148
				return true;
149
			}
150
		}
151
		return false;
152
	}
153

  
154
	public static boolean isSpaceBeforeNotNeeded(String str, String lang) {
155
		for (String t : getNoSpaceBefore(lang)) {
156
			if (str.startsWith(t)) {
157
				return true;
158
			}
159
		}
160
		return false;
161
	}
162

  
143 163
	/**
144 164
	 * Format.
145 165
	 *
tmp/org.txm.tigersearch.rcp/plugin.xml (revision 2941)
36 36
      </command>
37 37
      <command
38 38
            categoryId="TIGERSearch4TXM.commands.category"
39
            defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotations"
40
            id="org.txm.tigersearch.commands.ImportCONNLUAnnotations"
41
            name="Import CONNL-u Annotations...">
39
            defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
40
            id="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
41
            name="Import CONNL-u Annotations from a directory...">
42 42
      </command>
43 43
      <command
44 44
            categoryId="TIGERSearch4TXM.commands.category"
......
46 46
            id="org.txm.tigersearch.commands.ExportCorpusAsCONNLU"
47 47
            name="Export CONNL-u Annotations...">
48 48
      </command>
49
      <command
50
            categoryId="TIGERSearch4TXM.commands.category"
51
            defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile"
52
            id="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile"
53
            name="Import CONNL-u Annotations from a CoNNL-U file...">
54
      </command>
49 55
   </extension>
50 56
   <extension
51 57
         point="org.eclipse.core.expressions.propertyTesters">
......
208 214
               </visibleWhen>
209 215
            </command>
210 216
            <command
211
                  commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotations"
217
                  commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile"
212 218
                  icon="icons/functions/UDplus.png"
213 219
                  style="push">
214 220
               <visibleWhen
......
225 231
                  </or>
226 232
               </visibleWhen>
227 233
            </command>
234
            <command
235
                  commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
236
                  icon="icons/functions/UDplus.png"
237
                  style="push">
238
               <visibleWhen
239
                     checkEnabled="false">
240
                  <or>
241
                     <test
242
                           forcePluginActivation="true"
243
                           property="org.txm.rcp.testers.TIGERSearchReady"
244
                           value="TIGERSearchReady">
245
                     </test>
246
                     <reference
247
                           definitionId="OneMainCorpusSelected">
248
                     </reference>
249
                  </or>
250
               </visibleWhen>
251
            </command>
228 252
         </menu>
229 253
      </menuContribution>
230 254
      <menuContribution
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotations.java (revision 2941)
1
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.tigersearch.commands;
29

  
30
import java.io.BufferedReader;
31
import java.io.File;
32
import java.io.FileFilter;
33
import java.io.IOException;
34
import java.util.HashMap;
35

  
36
import javax.xml.stream.XMLStreamException;
37

  
38
import org.apache.commons.lang.StringUtils;
39
import org.eclipse.core.commands.AbstractHandler;
40
import org.eclipse.core.commands.ExecutionEvent;
41
import org.eclipse.core.commands.ExecutionException;
42
import org.eclipse.jface.viewers.IStructuredSelection;
43
import org.eclipse.ui.handlers.HandlerUtil;
44
import org.kohsuke.args4j.Option;
45
import org.txm.core.messages.TXMCoreMessages;
46
import org.txm.objects.Text;
47
import org.txm.rcp.commands.workspace.UpdateCorpus;
48
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
49
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
50
import org.txm.searchengine.cqp.corpus.CQPCorpus;
51
import org.txm.searchengine.cqp.corpus.MainCorpus;
52
import org.txm.searchengine.cqp.serverException.CqiServerError;
53
import org.txm.utils.io.FileCopy;
54
import org.txm.utils.io.IOUtils;
55
import org.txm.utils.logger.Log;
56
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
57

  
58
import cern.colt.Arrays;
59

  
60
/**
61
 * Import CONNLU annotations into a TXM corpus
62
 * 
63
 * IF the corpus already contains CONNLU annotations, they are replaced
64
 * 
65
 * @author mdecorde.
66
 */
67
public class ImportCONNLUAnnotations extends AbstractHandler {
68
	
69
	public static final String ID = ImportCONNLUAnnotations.class.getName();
70
	
71
	@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory")
72
	File connluDirectory;
73
	
74
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
75
	String propertiesPrefix;
76
	
77
	/*
78
	 * (non-Javadoc)
79
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
80
	 */
81
	@Override
82
	public Object execute(final ExecutionEvent event) throws ExecutionException {
83
		
84
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
85
		
86
		Object s = selection.getFirstElement();
87
		if (!(s instanceof MainCorpus)) {
88
			Log.warning("Selection is not a corpus. Aborting.");
89
			return null;
90
		}
91
		
92
		if (!ParametersDialog.open(this)) {
93
			return null;
94
		}
95
		if (connluDirectory == null || !connluDirectory.exists() || !connluDirectory.isDirectory() || connluDirectory.listFiles().length == 0) {
96
			Log.warning("Error: connlu directory is empty: " + connluDirectory);
97
			return null;
98
		}
99
		
100
		CQPCorpus corpus = (CQPCorpus) s;
101
		MainCorpus mainCorpus = corpus.getMainCorpus();
102
		
103
		try {
104
			return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix);
105
		}
106
		catch (Exception e) {
107
			Log.warning(e);
108
			e.printStackTrace();
109
		}
110
		
111
		return null;
112
	}
113
	
114
	/**
115
	 * 
116
	 * if import CONNLU annotations in the corpus with the same name already exists, it is replaced
117
	 * 
118
	 * @param corpus
119
	 * @param connluDirectory
120
	 * @param propertiesPrefix
121
	 * @return the number of imported annotations
122
	 * @throws CqiClientException
123
	 * @throws CqiServerError
124
	 * @throws IOException
125
	 * @throws XMLStreamException
126
	 */
127
	public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix) throws IOException,
128
			CqiServerError, CqiClientException, XMLStreamException {
129
		Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluDirectory, mainCorpus, propertiesPrefix));
130
		
131
		File[] files = connluDirectory.listFiles(new FileFilter() {
132
			
133
			@Override
134
			public boolean accept(File file) {
135
				return file.isFile() && file.getName().endsWith(".conllu");
136
			}
137
		});
138
		
139
		int nTextProcessed = 0;
140
		int nWords = 0;
141
		int nWordsInserted = 0;
142
		for (File coonluFile : files) {
143
			
144
			String textid = coonluFile.getName().substring(0, coonluFile.getName().length() - 7);
145
			Log.info("** processing text: " + textid);
146
			Text text = mainCorpus.getProject().getText(textid);
147
			if (text == null) {
148
				Log.warning("No text found with ID=" + textid);
149
				continue;
150
			}
151
			File xmltxmFile = mainCorpus.getProject().getText(textid).getXMLTXMFile();
152
			File xmltxmUpdatedFile = new File(System.getProperty("java.io.tmpdir"), xmltxmFile.getName());
153
			
154
			XMLTXMWordPropertiesInjection processor = new XMLTXMWordPropertiesInjection(xmltxmFile);
155
			HashMap<String, HashMap<String, String>> rules = new HashMap<>();
156
			processor.setProperties(rules);
157
			
158
			BufferedReader reader = IOUtils.getReader(coonluFile);
159
			String line = reader.readLine();
160
			String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
161
			
162
			int nWords2 = 0;
163
			int nLine = 0;
164
			String sent_id = null;
165
			String newpar_id = null;
166
			String newdoc_id = null;
167
			while (line != null) {
168
				nLine++;
169
				if (line.length() == 0) {
170
					line = reader.readLine();
171
					continue; // comment
172
				}
173
				
174
				if (line.startsWith("#")) {
175
					if (line.startsWith("# sent_id = ")) {
176
						sent_id = line.substring(12).trim();
177
					}
178
					else if (line.startsWith("# newdoc id = ")) {
179
						newdoc_id = line.substring(14).trim();
180
					}
181
					else if (line.startsWith("# newpar id = ")) {
182
						newpar_id = line.substring(14).trim();
183
					}
184
					else {
185
						// nothing for now
186
					}
187
					
188
					line = reader.readLine();
189
					continue; // comment
190
				}
191
				
192
				String[] split = line.split("\t", 10);
193
				if (split.length < 10) {
194
					Log.warning("Error: line " + nLine + " : " + line + " -> " + Arrays.toString(split) + " len=" + split.length);
195
					line = reader.readLine();
196
					continue; // comment
197
				}
198
				
199
				String id = split[9];
200
				int from = id.indexOf("XmlId=") + 6;
201
				if (from < 6) {
202
					Log.warning("Error: line " + nLine + " with no 'XmlId=': " + line);
203
					line = reader.readLine();
204
					continue;
205
				}
206
				id = id.substring(from);
207
				// System.out.println("ID=" + id);
208
				
209
				if (id.contains("-")) continue; // TODO to manage later
210
				
211
				HashMap<String, String> properties = new HashMap<>();
212
				for (int i = 0; i < split.length; i++) {
213
					properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#'
214
				}
215
				if (sent_id != null) {
216
					properties.put("#ud-sentid", sent_id);
217
					sent_id = ""; // reset value for next sentence
218
				} else {
219
					properties.put("#ud-sentid", "");
220
				}
221
				if (newdoc_id != null) {
222
					properties.put("#ud-newdocid", newdoc_id);
223
					newdoc_id = null; // reset value for next sentence
224
				} else {
225
					properties.put("#ud-newdocid", "");
226
				}
227
				if (newpar_id != null) {
228
					properties.put("#ud-newparid", newpar_id);
229
					newpar_id = null; // reset value for next sentence
230
				} else {
231
					properties.put("#ud-newparid", "");
232
				}
233
				processor.addProperty(id, properties);
234
				nWords2++;
235
				line = reader.readLine();
236
			}
237
			reader.close();
238
			
239
			if (nWords2 == 0) {
240
				Log.warning("** No annotation found in " + coonluFile);
241
			}
242
			
243
			nWords += nWords2;
244
			
245
			Log.info("** loading annotations from : " + coonluFile);
246
			
247
			if (processor.process(xmltxmUpdatedFile)) {
248
				if (xmltxmFile.delete() && FileCopy.copy(xmltxmUpdatedFile, xmltxmFile)) {
249
					if (processor.getNonActivatedRules().size() > 0) {
250
						Log.warning("Warning: some words were not imported: "+StringUtils.join(processor.getNonActivatedRules(), ", "));
251
					}
252
				}
253
				else {
254
					Log.warning("** Warning: annotation import failed for replace the corpus XML-TXM file: " + xmltxmFile + ". TEMP file: " + xmltxmUpdatedFile);
255
					return 0;
256
				}
257
			}
258
			else {
259
				Log.warning("** Warning: annotation import failed for text: " + textid);
260
				return 0;
261
			}
262
			
263
			if (processor.getNInsertions() == 0) {
264
				Log.warning("** No annotation imported in " + textid);
265
			}
266
			
267
			nWordsInserted += processor.getNInsertions();
268
			nTextProcessed++;
269
		}
270
		
271
		if (nTextProcessed == 0) {
272
			Log.warning("** No text to process. Aborting.");
273
			return 0;
274
		}
275
		
276
		if (nWords == 0) {
277
			Log.warning("** No annotation to import in corpus. Aborting.");
278
			return 0;
279
		}
280
		
281
		if (nWordsInserted == 0) {
282
			Log.warning("** No annotation imported. Aborting.");
283
			return 0;
284
		}
285
		
286
		Log.info("XML-TXM source files updated. Updating indexes...");
287
		
288
		UpdateCorpus.update(mainCorpus);
289
		
290
		Log.info("Done.");
291
		
292
		return 0;
293
	}
294
}
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotationsFromFile.java (revision 2941)
1
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.tigersearch.commands;
29

  
30
import java.io.BufferedReader;
31
import java.io.File;
32
import java.io.FileFilter;
33
import java.io.IOException;
34
import java.util.HashMap;
35

  
36
import javax.xml.stream.XMLStreamException;
37

  
38
import org.apache.commons.lang.StringUtils;
39
import org.eclipse.core.commands.AbstractHandler;
40
import org.eclipse.core.commands.ExecutionEvent;
41
import org.eclipse.core.commands.ExecutionException;
42
import org.eclipse.jface.viewers.IStructuredSelection;
43
import org.eclipse.ui.handlers.HandlerUtil;
44
import org.kohsuke.args4j.Option;
45
import org.txm.core.messages.TXMCoreMessages;
46
import org.txm.objects.Text;
47
import org.txm.rcp.commands.workspace.UpdateCorpus;
48
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
49
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
50
import org.txm.searchengine.cqp.corpus.CQPCorpus;
51
import org.txm.searchengine.cqp.corpus.MainCorpus;
52
import org.txm.searchengine.cqp.serverException.CqiServerError;
53
import org.txm.utils.io.FileCopy;
54
import org.txm.utils.io.IOUtils;
55
import org.txm.utils.logger.Log;
56
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
57

  
58
import cern.colt.Arrays;
59

  
60
/**
61
 * Import CONNLU annotations into a TXM corpus
62
 * 
63
 * IF the corpus already contains CONNLU annotations, they are replaced
64
 * 
65
 * @author mdecorde.
66
 */
67
public class ImportCONNLUAnnotationsFromFile extends AbstractHandler {
68
	
69
	public static final String ID = ImportCONNLUAnnotationsFromFile.class.getName();
70
	
71
	@Option(name = "connluFile", usage = "CoNLL-U file", widget = "FileOpen", required = true, def = "file.conllu")
72
	File connluFile;
73
	
74
	@Option(name = "textId", usage = "Identifier of the text to update", widget = "String", required = true, def = "text-id")
75
	String textId;
76
	
77
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
78
	String propertiesPrefix;
79
	
80
	/*
81
	 * (non-Javadoc)
82
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
83
	 */
84
	@Override
85
	public Object execute(final ExecutionEvent event) throws ExecutionException {
86
		
87
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
88
		
89
		Object s = selection.getFirstElement();
90
		if (!(s instanceof MainCorpus)) {
91
			Log.warning("Selection is not a corpus. Aborting.");
92
			return null;
93
		}
94
		
95
		if (!ParametersDialog.open(this)) {
96
			return null;
97
		}
98
		if (connluFile == null || !connluFile.exists() || !connluFile.isFile()) {
99
			Log.warning("Error: cannot access to the connlu file: " + connluFile);
100
			return null;
101
		}
102
		
103
		CQPCorpus corpus = (CQPCorpus) s;
104
		MainCorpus mainCorpus = corpus.getMainCorpus();
105
		
106
		try {
107
			return ImportCONNLUAnnotationsFromDirectory.importAnnotationsFromCoNNLUFile(mainCorpus, connluFile, propertiesPrefix, textId);
108
		}
109
		catch (Exception e) {
110
			Log.warning(e);
111
			e.printStackTrace();
112
		}
113
		
114
		return null;
115
	}
116
}
0 117

  
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ExportCorpusAsCONNLU.java (revision 2941)
73 73
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
74 74
	String propertiesPrefix;
75 75
	
76
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "options")
76
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comments options")
77 77
	Boolean separator = false;
78 78
	
79
	// @Option(name = "useUDForms", usage = "use the ud form property instead of CQP 'word' property", widget = "Boolean", required = true, def = "true")
80
	Boolean useUDForms = true;
79
	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
80
	Boolean insertParagraphs = false;
81 81
	
82
	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
83
	Boolean detectGap = false;
82 84
	
83
	// @Option(name = "transfertAllWords", usage = "Transfert word not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true")
84
	Boolean transfertAllWords = true;
85
	@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options")
86
	Boolean separator3 = false;
85 87
	
86 88
	@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true")
87 89
	Boolean insertNoSpaceAfter = true;
88 90
	
89
	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
90
	Boolean insertParagraphs = false;
91
	@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
92
	Boolean insertTokenWithoutUdAnnotations;
91 93
	
92
	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
93
	Boolean detectGap = false;
94
	@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
95
	String defaultFormPropertyName;
94 96
	
95
	@Option(name = "formCorrPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
96
	String formCorrPropertyName;
97
	@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
98
	String defaultLemmaPropertyName;
97 99
	
98
	@Option(name = "lemmaCorrPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
99
	String lemmaCorrPropertyName;
100
	@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
101
	String defaultUposPropertyName;
100 102
	
101
	@Option(name = "uposCorrPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
102
	String uposCorrPropertyName;
103
	@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
104
	String defaultXposPropertyName;
103 105
	
104
	@Option(name = "xposCorrPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
105
	String xposCorrPropertyName;
106
	
107
	@Option(name = "punctStrong", usage = "optional strong punct tag to fix sentence limits", widget = "String", required = true, def = "PONfrt")
108
	String punctStrong;
109
	
110 106
	@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
111 107
	Boolean separator2 = false;
112 108
	
113 109
	@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]")
114 110
	String openingPunct;
115 111
	
112
	/**
113
	 * the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix
114
	 */
116 115
	public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
117 116
	
118 117
	/*
......
144 143
		MainCorpus mainCorpus = corpus.getMainCorpus();
145 144
		
146 145
		try {
147
			return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct,
148
					formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName, punctStrong,
146
			return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
147
					defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
149 148
					detectGap, insertParagraphs, insertNoSpaceAfter);
150 149
		}
151 150
		catch (Exception e) {
......
157 156
	}
158 157
	
159 158
	/**
159
	 * export the corpus in a directory of conllu files (one per text)
160 160
	 * 
161
	 * if aTIGER corpus with the same name already exists, it is replaced
162
	 * 
163
	 * @param corpus
164
	 * @param tigerCorpusDirectory
165
	 * @return the number of imported annotations
166
	 * @throws IndexException
167
	 * @throws QueryIndexException
161
	 * @param mainCorpus
162
	 * @param conlluResultDirectory
163
	 * @param prefix
164
	 * @param openingPunct
165
	 * @param insertTokenWithoutUdAnnotations
166
	 * @param defaultFormPropertyName
167
	 * @param defaultLemmaPropertyName
168
	 * @param defaultUposPropertyName
169
	 * @param defaultXposPropertyName
170
	 * @param detectGap
171
	 * @param insertParagraphs
172
	 * @param insertNoSpaceAfter
173
	 * @return the number of annotations exported
174
	 * @throws UnexpectedAnswerException
175
	 * @throws IOException
176
	 * @throws CqiServerError
168 177
	 * @throws CqiClientException
169
	 * @throws CqiServerError
170
	 * @throws IOException
171
	 * @throws UnexpectedAnswerException
172 178
	 * @throws InvalidCqpIdException
173 179
	 */
174
	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, String formCorrPropertyName, String lemmaCorrPropertyName,
175
			String uposCorrPropertyName, String xposCorrPropertyName, String punctStrongRegex, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) throws UnexpectedAnswerException,
180
	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations,
181
			String defaultFormPropertyName, String defaultLemmaPropertyName,
182
			String defaultUposPropertyName, String defaultXposPropertyName, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
183
			throws UnexpectedAnswerException,
176 184
			IOException,
177 185
			CqiServerError,
178 186
			CqiClientException, InvalidCqpIdException {
......
188 196
		int[] start_limits = mainCorpus.getTextStartLimits();
189 197
		int[] end_limits = mainCorpus.getTextEndLimits();
190 198
		
191
		HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang()));
192
		HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang()));
199
		String lang = mainCorpus.getLang();
200
		// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang()));
201
		// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang()));
193 202
		
194 203
		for (String p : propNames) {
195 204
			WordProperty wp = mainCorpus.getProperty(prefix + p);
......
199 208
			}
200 209
		}
201 210
		
211
		if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) {
212
			Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true");
213
			return 0;
214
		}
215
		
202 216
		for (int iText = 0; iText < start_limits.length; iText++) {
203 217
			
204 218
			// Build corpus positions
......
280 294
			HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, "ud-sentid");
281 295
			HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, "ud-newdocid");
282 296
			
283
			
284 297
			// build sentence, first pass using UD word sentence positions
285 298
			ArrayList<ArrayList<Integer>> sentences = new ArrayList<>();
286 299
			ArrayList<Integer> tmpSentence = new ArrayList<>();
......
301 314
					tmpSentence.clear();
302 315
				}
303 316
				
304
				tmpSentence.add(p);
317
				if (insertTokenWithoutUdAnnotations) {
318
					tmpSentence.add(p); // insert all tokens
319
				}
320
				else if (idValues[p] != 0) {
321
					tmpSentence.add(p); // insert only the tokens whose UD id is set
322
				}
323
				
305 324
			}
306 325
			positions = null; // free memory
307 326
			
......
358 377
				String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
359 378
				
360 379
				String[] words = null;
361
				if (formCorrPropertyName != null && formCorrPropertyName.length() > 0) {
362
					words = getDefaultValues(mainCorpus, formCorrPropertyName, sentencePositions);
380
				if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) {
381
					words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions);
363 382
				}
364 383
				String[] lemmas = null;
365
				if (lemmaCorrPropertyName != null && lemmaCorrPropertyName.length() > 0) {
366
					lemmas = getDefaultValues(mainCorpus, lemmaCorrPropertyName, sentencePositions);
384
				if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) {
385
					lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions);
367 386
				}
368 387
				String[] upos = null;
369
				if (uposCorrPropertyName != null && uposCorrPropertyName.length() > 0) {
370
					upos = getDefaultValues(mainCorpus, uposCorrPropertyName, sentencePositions);
388
				if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) {
389
					upos = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
371 390
				}
372 391
				String[] xpos = null;
373
				if (xposCorrPropertyName != null && xposCorrPropertyName.length() > 0) {
374
					xpos = getDefaultValues(mainCorpus, xposCorrPropertyName, sentencePositions);
392
				if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
393
					xpos = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
375 394
				}
376 395
				
377 396
				// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
......
379 398
				// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
380 399
				// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
381 400
				
382
				HashMap<Integer, Integer> sentIds = new HashMap<>();
401
				// fix ud properties using CQP values
383 402
				for (int ip = 0; ip < sentence.size(); ip++) {
403
					
384 404
					int p = sentence.get(ip);
385 405
					
386
					if (idValues[p] != 0) { // store "old id -> new id"
387
						sentIds.put(idValues[p], (ip + 1)); // from 1 to N
388
					}
389
					
390 406
					// new word
391 407
					if (miscValues[p].equals("_")) {
392 408
						miscValues[p] = "XmlId=" + ids[ip];
......
405 421
					if (xpos != null && xposValues[p].equals("_")) {
406 422
						xposValues[p] = xpos[ip];
407 423
					}
408
					
409
					if (insertNoSpaceAfter) { // inserttino activated
424
				}
425
				
426
				if (insertNoSpaceAfter) {
427
					for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed
428
						int p = sentence.get(ip);
429
						// insertion activated
410 430
						if (!miscValues[p].contains("SpaceAfter=")) { // only update if not present
411
							if (afterSpacesRules.contains(formValues[p])) {
412
								miscValues[p] += " SpaceAfter=Yes";
431
							if (LangFormater.isSpaceAfterNotNeeded(formValues[p], lang)) {
432
								miscValues[p] += "|SpaceAfter=No";
413 433
							}
414
							else if (formValues.length < p + 1 && beforeSpacesRules.contains(formValues[p + 1])) {
415
								miscValues[p] += " SpaceAfter=Yes";
434
							else if (formValues.length > (p + 1) && LangFormater.isSpaceBeforeNotNeeded(formValues[p + 1], lang)) {
435
								// if the next token must not be preceded by a space, set SpaceAfter=No on the current token
436
								miscValues[p] += "|SpaceAfter=No";
416 437
							}
417 438
						}
418 439
					}
419
					
440
				}
420 441
				
442
				// fixing sentence punct limits
443
				while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
444
					System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
445
					int p2 = sentence.remove(0);
446
					sentences.get(iSentence - 1).add(p2);
421 447
				}
448
				//
449
				while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
450
					System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
451
					int p2 = sentence.remove(sentence.size() - 1);
452
					sentences.get(iSentence + 1).add(0, p2);
453
				}
422 454
				
423
				
424
				// fixing sentence punct limits
425
					
426
					
427
//					while (sentence.size() > 0 && iSentence > 0 && xpos[0].matches(punctStrongRegex)) {
428
//						System.out.println("FIXING: first punctStrong position " + xposValues[sentence.get(0)] + " in " + iSentence);
429
//						int p2 = sentence.remove(0);
430
//						sentences.get(iSentence - 1).add(p2);
431
//					}
432
					
433
					while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
434
						System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
435
						int p2 = sentence.remove(0);
436
						sentences.get(iSentence - 1).add(p2);
437
					}
438
//
439
					while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
440
						System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
441
						int p2 = sentence.remove(sentence.size() - 1);
442
						sentences.get(iSentence + 1).add(0, p2);
443
					}
444
		
445 455
				if (sentence.size() == 0) { // sentence was depleted after fixing it
446 456
					sentences.remove(iSentence);
447 457
					iSentence--;
448 458
					continue;
449 459
				}
460
			}
461
			
462
			for (int s = 0; s < sentences.size(); s++) {
450 463
				
464
				// fix only ud sentences limits
465
				ArrayList<Integer> sentence = sentences.get(s);
466
				HashMap<Integer, Integer> oldToNewIds = new HashMap<>();
467
				for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids
468
					int p = sentence.get(ip);
469
					
470
					if (idValues[p] != 0) { // store "old id -> new id"
471
						oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N
472
					}
473
				}
474
				
451 475
				// fixing head and set missing head to 0 and root
452 476
				for (int ip = 0; ip < sentence.size(); ip++) {
453 477
					int p = sentence.get(ip);
......
456 480
					idValues[p] = (ip + 1);  // from 1 to N
457 481
					
458 482
					// fixing head values
459
					if (sentIds.containsKey(headValues[p])) {
460
						headValues[p] = sentIds.get(headValues[p]);
483
					if (oldToNewIds.containsKey(headValues[p])) {
484
						headValues[p] = oldToNewIds.get(headValues[p]);
461 485
					}
462
					else { // new word, set to default values
463
						headValues[p] = -1;
486
					else if (headValues[p] != 0) { // new word, set to default values
487
						headValues[p] = 0;
464 488
						deprelValues[p] = "_";
465 489
						depsValues[p] = "_";
466 490
					}
......
485 509
					gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
486 510
				}
487 511
				
488
				
489
				
490 512
				String[] tokens = new String[sentence.size()];
491 513
				for (int ip = 0; ip < sentence.size(); ip++) {
492 514
					tokens[ip] = formValues[sentence.get(ip)];
......
506 528
					writer.println("# newdoc id = " + textIds[iText]);
507 529
				}
508 530
				
509
				if (sentidStartPositions.containsKey(sentence.get(0))) {
510
					writer.println("# sent_id = " + sentidStartPositions.get(sentence.get(0)));
531
				boolean foundSentId = false;
532
				for (int ip : sentence) {
533
					if (!foundSentId && sentidStartPositions.containsKey(ip)) {
534
						writer.println("# sent_id = " + sentidStartPositions.get(ip));
535
						foundSentId = true;
536
					}
511 537
				}
512
				else {
538
				if (!foundSentId) { // no sent_id found
513 539
					writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
514 540
				}
515 541
				
......
519 545
					iParagraph++;
520 546
				}
521 547
				
522
				for (int ip = 0 ; ip < sentence.size() ; ip++) {
548
				for (int ip = 0; ip < sentence.size(); ip++) {
523 549
					int p = sentence.get(ip);
524
				
550
					
525 551
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
526 552
					writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
527 553
							+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
......
538 564
			numberOfTextsWritten++;
539 565
		}
540 566
		
541
		System.out.println("N words written: " + numberOfWordsWritten);
542
		System.out.println("N sentences written: " + numberOfSentencesWritten);
543
		System.out.println("N texts written: " + numberOfTextsWritten);
567
		System.out.println("# words written: " + numberOfWordsWritten);
568
		System.out.println("# sentences written: " + numberOfSentencesWritten);
569
		System.out.println("# texts written: " + numberOfTextsWritten);
544 570
		
545 571
		return numberOfWordsWritten;
546 572
	}
......
579 605
		}
580 606
	}
581 607
}
608

  
609

  
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotationsFromDirectory.java (revision 2941)
1
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.tigersearch.commands;
29

  
30
import java.io.BufferedReader;
31
import java.io.File;
32
import java.io.FileFilter;
33
import java.io.IOException;
34
import java.util.HashMap;
35

  
36
import javax.xml.stream.XMLStreamException;
37

  
38
import org.apache.commons.lang.StringUtils;
39
import org.eclipse.core.commands.AbstractHandler;
40
import org.eclipse.core.commands.ExecutionEvent;
41
import org.eclipse.core.commands.ExecutionException;
42
import org.eclipse.jface.viewers.IStructuredSelection;
43
import org.eclipse.ui.handlers.HandlerUtil;
44
import org.kohsuke.args4j.Option;
45
import org.txm.core.messages.TXMCoreMessages;
46
import org.txm.objects.Text;
47
import org.txm.rcp.commands.workspace.UpdateCorpus;
48
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
49
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
50
import org.txm.searchengine.cqp.corpus.CQPCorpus;
51
import org.txm.searchengine.cqp.corpus.MainCorpus;
52
import org.txm.searchengine.cqp.serverException.CqiServerError;
53
import org.txm.utils.io.FileCopy;
54
import org.txm.utils.io.IOUtils;
55
import org.txm.utils.logger.Log;
56
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
57

  
58
import cern.colt.Arrays;
59

  
60
/**
61
 * Import CONNLU annotations into a TXM corpus
62
 * 
63
 * IF the corpus already contains CONNLU annotations, they are replaced
64
 * 
65
 * @author mdecorde.
66
 */
67
public class ImportCONNLUAnnotationsFromDirectory extends AbstractHandler {
68
	
69
	public static final String ID = ImportCONNLUAnnotationsFromDirectory.class.getName();
70
	
71
	@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory")
72
	File connluDirectory;
73
	
74
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
75
	String propertiesPrefix;
76
	
77
	/*
78
	 * (non-Javadoc)
79
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
80
	 */
81
	@Override
82
	public Object execute(final ExecutionEvent event) throws ExecutionException {
83
		
84
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
85
		
86
		Object s = selection.getFirstElement();
87
		if (!(s instanceof MainCorpus)) {
88
			Log.warning("Selection is not a corpus. Aborting.");
89
			return null;
90
		}
91
		
92
		if (!ParametersDialog.open(this)) {
93
			return null;
94
		}
95
		if (connluDirectory == null || !connluDirectory.exists() || !connluDirectory.isDirectory() || connluDirectory.listFiles().length == 0) {
96
			Log.warning("Error: connlu directory is empty: " + connluDirectory);
97
			return null;
98
		}
99
		
100
		CQPCorpus corpus = (CQPCorpus) s;
101
		MainCorpus mainCorpus = corpus.getMainCorpus();
102
		
103
		try {
104
			return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix);
105
		}
106
		catch (Exception e) {
107
			Log.warning(e);
108
			e.printStackTrace();
109
		}
110
		
111
		return null;
112
	}
113
	
114
	/**
115
	 * 
116
	 * if import CONNLU annotations in the corpus with the same name already exists, it is replaced
117
	 * 
118
	 * @param corpus
119
	 * @param connluDirectory
120
	 * @param propertiesPrefix
121
	 * @return the number of imported annotations
122
	 * @throws CqiClientException
123
	 * @throws CqiServerError
124
	 * @throws IOException
125
	 * @throws XMLStreamException
126
	 */
127
	public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix) throws IOException,
128
			CqiServerError, CqiClientException, XMLStreamException {
129
		Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluDirectory, mainCorpus, propertiesPrefix));
130
		
131
		File[] files = connluDirectory.listFiles(new FileFilter() {
132
			
133
			@Override
134
			public boolean accept(File file) {
135
				return file.isFile() && file.getName().endsWith(".conllu");
136
			}
137
		});
138
		
139
		int nTextProcessed = 0;
140
		int nWordsInserted = 0;
141
		for (File coonluFile : files) {
142
			
143
			nWordsInserted += _importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null);
144
			nTextProcessed++;
145
		}
146
		
147
		if (nTextProcessed == 0) {
148
			Log.warning("** No text to process. Aborting.");
149
			return 0;
150
		}
151
		
152
		if (nWordsInserted == 0) {
153
			Log.warning("** No annotation imported. Aborting.");
154
			return 0;
155
		}
156
		
157
		Log.info("XML-TXM source files updated. Updating indexes...");
158
		
159
		UpdateCorpus.update(mainCorpus);
160
		
161
		Log.info("Done.");
162
		
163
		return 0;
164
	}
165
	
166
	/**
167
	 * 
168
	 * if import CONNLU annotations in the corpus with the same name already exists, it is replaced
169
	 * 
170
	 * @param corpus
171
	 * @param connluFile
172
	 * @param propertiesPrefix
173
	 * @return the number of imported annotations
174
	 * @throws CqiClientException
175
	 * @throws CqiServerError
176
	 * @throws IOException
177
	 * @throws XMLStreamException
178
	 */
179
	public static int importAnnotationsFromCoNNLUFile(MainCorpus mainCorpus, File connluFile, String propertiesPrefix, String textId) throws IOException,
180
			CqiServerError, CqiClientException, XMLStreamException {
181
		Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluFile, mainCorpus, propertiesPrefix));
182
		
183
		
184
		int nWordsInserted = _importAnnotations(connluFile, mainCorpus, propertiesPrefix, textId);
185
		
186
		if (nWordsInserted == 0) {
187
			Log.warning("** No annotation imported. Aborting.");
188
			return 0;
189
		}
190
		
191
		Log.info("XML-TXM source files updated. Updating indexes...");
192
		
193
		UpdateCorpus.update(mainCorpus);
194
		
195
		Log.info("Done.");
196
		
197
		return 0;
198
	}
199
	
200
	private static int _importAnnotations(File coonluFile, MainCorpus mainCorpus, String propertiesPrefix, String textId) throws IOException, XMLStreamException {
201
		if (textId == null || textId.length() == 0) { // no text id provided, using the connlu file name
202
			textId = coonluFile.getName().substring(0, coonluFile.getName().length() - 7);
203
		}
204
		Log.info("** processing text: " + textId);
205
		Text text = mainCorpus.getProject().getText(textId);
206
		if (text == null) {
207
			Log.warning("No text found with ID=" + textId);
208
			return 0;
209
		}
210
		File xmltxmFile = mainCorpus.getProject().getText(textId).getXMLTXMFile();
211
		File xmltxmUpdatedFile = new File(System.getProperty("java.io.tmpdir"), xmltxmFile.getName());
212
		
213
		XMLTXMWordPropertiesInjection processor = new XMLTXMWordPropertiesInjection(xmltxmFile);
214
		HashMap<String, HashMap<String, String>> rules = new HashMap<>();
215
		processor.setProperties(rules);
216
		
217
		BufferedReader reader = IOUtils.getReader(coonluFile);
218
		String line = reader.readLine();
219
		String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
220
		
221
		int nWords2 = 0;
222
		int nLine = 0;
223
		String sent_id = null;
224
		String newpar_id = null;
225
		String newdoc_id = null;
226
		while (line != null) {
227
			nLine++;
228
			if (line.length() == 0) {
229
				line = reader.readLine();
230
				continue; // comment
231
			}
232
			
233
			if (line.startsWith("#")) {
234
				if (line.startsWith("# sent_id = ")) {
235
					sent_id = line.substring(12).trim();
236
				}
237
				else if (line.startsWith("# newdoc id = ")) {
238
					newdoc_id = line.substring(14).trim();
239
				}
240
				else if (line.startsWith("# newpar id = ")) {
241
					newpar_id = line.substring(14).trim();
242
				}
243
				else {
244
					// nothing for now
245
				}
246
				
247
				line = reader.readLine();
248
				continue; // comment
249
			}
250
			
251
			String[] split = line.split("\t", 10);
252
			if (split.length < 10) {
253
				Log.warning("Error: line " + nLine + " : " + line + " -> " + Arrays.toString(split) + " len=" + split.length);
254
				line = reader.readLine();
255
				continue; // comment
256
			}
257
			
258
			String misc = split[9];
259
			String[] miscValues = misc.split("\\|");
260
			String id = null;
261
			for (String miscValue : miscValues) {
262
				if (miscValue.startsWith("XmlId=")) {
263
					id = miscValue.substring(6);
264
				}
265
			}
266
			
267
			HashMap<String, String> properties = new HashMap<>();
268
			for (int i = 0; i < split.length; i++) {
269
				properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#'
270
			}
271
			
272
			if (sent_id != null) {
273
				properties.put("#ud-sentid", sent_id);
274
				sent_id = ""; // reset value for next sentence
275
			}
276
			else {
277
				properties.put("#ud-sentid", "");
278
			}
279
			
280
			if (newdoc_id != null) {
281
				properties.put("#ud-newdocid", newdoc_id);
282
				newdoc_id = null; // reset value for next sentence
283
			}
284
			else {
285
				properties.put("#ud-newdocid", "");
286
			}
287
			
288
			if (newpar_id != null) {
289
				properties.put("#ud-newparid", newpar_id);
290
				newpar_id = null; // reset value for next sentence
291
			}
292
			else {
293
				properties.put("#ud-newparid", "");
294
			}
295
			
296
			if (id == null) {
297
				Log.warning("No 'XmlId=' found for UD line: " + line);
298
			}
299
			else {
300
				processor.addProperty(id, properties);
301
				nWords2++;
302
			}
303
			line = reader.readLine();
304
		}
305
		reader.close();
306
		
307
		if (nWords2 == 0) {
308
			Log.warning("** No annotation to import in " + coonluFile);
309
			return 0;
310
		}
311
		
312
		Log.info("** loading annotations from : " + coonluFile);
313
		if (processor.process(xmltxmUpdatedFile)) {
314
			if (xmltxmFile.delete() && FileCopy.copy(xmltxmUpdatedFile, xmltxmFile)) {
315
				if (processor.getNonActivatedRules().size() > 0) {
316
					Log.warning("Warning: some words were not imported: " + StringUtils.join(processor.getNonActivatedRules(), ", "));
317
				}
318
			}
319
			else {
320
				Log.warning("** Warning: annotation import failed for replace the corpus XML-TXM file: " + xmltxmFile + ". TEMP file: " + xmltxmUpdatedFile);
321
				return 0;
322
			}
323
		}
324
		else {
325
			Log.warning("** Warning: annotation import failed for text: " + textId);
326
			return 0;
327
		}
328
		
329
		if (processor.getNInsertions() == 0) {
330
			Log.warning("** No annotation imported in " + textId);
331
		}
332
		
333
		return processor.getNInsertions();
334
	}
335
}
0 336

  

Formats disponibles : Unified diff