52 |
52 |
import org.txm.searchengine.cqp.corpus.CQPCorpus;
|
53 |
53 |
import org.txm.searchengine.cqp.corpus.MainCorpus;
|
54 |
54 |
import org.txm.searchengine.cqp.serverException.CqiServerError;
|
|
55 |
import org.txm.utils.AsciiUtils;
|
55 |
56 |
import org.txm.utils.io.FileCopy;
|
56 |
57 |
import org.txm.utils.io.IOUtils;
|
57 |
58 |
import org.txm.utils.logger.Log;
|
... | ... | |
81 |
82 |
/** Command option; per its usage text, when set the CQP properties are replaced. */
@Option(name = "overwrite_cqp_properties", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
|
82 |
83 |
Boolean overwrite_cqp_properties;
|
83 |
84 |
|
|
85 |
@Option(name = "normalize_word_ids", usage = "if set the CQP properties are replaced", widget = "Boolean", required = true, def = "false")
|
|
86 |
Boolean normalize_word_ids;
|
|
87 |
|
84 |
88 |
/*
|
85 |
89 |
* (non-Javadoc)
|
86 |
90 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
|
... | ... | |
118 |
122 |
Log.warning(NLS.bind("Error: can't use the {0} prefix because some properties are already used: {1}", propertiesPrefix, test));
|
119 |
123 |
return null;
|
120 |
124 |
}
|
121 |
|
return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix);
|
|
125 |
return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix, normalize_word_ids);
|
122 |
126 |
}
|
123 |
127 |
catch (Exception e) {
|
124 |
128 |
Log.warning(e);
|
... | ... | |
141 |
145 |
* @throws IOException
|
142 |
146 |
* @throws XMLStreamException
|
143 |
147 |
*/
|
144 |
|
public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix) throws IOException,
|
|
148 |
public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix, Boolean normalizeWordIds) throws IOException,
|
145 |
149 |
CqiServerError, CqiClientException, XMLStreamException {
|
146 |
150 |
Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluDirectory, mainCorpus, propertiesPrefix));
|
147 |
151 |
|
... | ... | |
157 |
161 |
int nWordsInserted = 0;
|
158 |
162 |
for (File coonluFile : files) {
|
159 |
163 |
|
160 |
|
nWordsInserted += _importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null);
|
|
164 |
nWordsInserted += _importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null, normalizeWordIds);
|
161 |
165 |
nTextProcessed++;
|
162 |
166 |
}
|
163 |
167 |
|
... | ... | |
187 |
191 |
* @param corpus
|
188 |
192 |
* @param connluFile
|
189 |
193 |
* @param propertiesPrefix
|
|
194 |
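* @param normalize_word_ids if true, word ids are normalized ("w_" prefix, then AsciiUtils.buildId) before the annotations are added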
|
190 |
195 |
* @return the number of imported annotations
|
191 |
196 |
* @throws CqiClientException
|
192 |
197 |
* @throws CqiServerError
|
193 |
198 |
* @throws IOException
|
194 |
199 |
* @throws XMLStreamException
|
195 |
200 |
*/
|
196 |
|
public static int importAnnotationsFromCoNNLUFile(MainCorpus mainCorpus, File connluFile, String propertiesPrefix, String textId) throws IOException,
|
|
201 |
public static int importAnnotationsFromCoNNLUFile(MainCorpus mainCorpus, File connluFile, String propertiesPrefix, String textId, Boolean normalize_word_ids) throws IOException,
|
197 |
202 |
CqiServerError, CqiClientException, XMLStreamException {
|
198 |
203 |
Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluFile, mainCorpus, propertiesPrefix));
|
199 |
204 |
|
200 |
205 |
|
201 |
|
int nWordsInserted = _importAnnotations(connluFile, mainCorpus, propertiesPrefix, textId);
|
|
206 |
int nWordsInserted = _importAnnotations(connluFile, mainCorpus, propertiesPrefix, textId, normalize_word_ids);
|
202 |
207 |
|
203 |
208 |
if (nWordsInserted == 0) {
|
204 |
209 |
Log.warning("** No annotation imported. Aborting.");
|
... | ... | |
214 |
219 |
return 0;
|
215 |
220 |
}
|
216 |
221 |
|
217 |
|
private static int _importAnnotations(File coonluFile, MainCorpus mainCorpus, String propertiesPrefix, String textId) throws IOException, XMLStreamException {
|
|
222 |
private static int _importAnnotations(File coonluFile, MainCorpus mainCorpus, String propertiesPrefix, String textId, Boolean normalize_word_ids) throws IOException, XMLStreamException {
|
218 |
223 |
if (textId == null || textId.length() == 0) { // no text id provided, using the connlu file name
|
219 |
224 |
textId = coonluFile.getName().substring(0, coonluFile.getName().length() - 7);
|
220 |
225 |
}
|
... | ... | |
235 |
240 |
BufferedReader reader = IOUtils.getReader(coonluFile);
|
236 |
241 |
String line = reader.readLine();
|
237 |
242 |
|
238 |
|
|
239 |
243 |
int nWords2 = 0;
|
240 |
244 |
int nLine = 0;
|
241 |
245 |
String sent_id = null;
|
... | ... | |
315 |
319 |
Log.warning("No 'XmlId=' found for UD line: " + line);
|
316 |
320 |
}
|
317 |
321 |
else {
|
|
322 |
|
|
323 |
if (normalize_word_ids) {
|
|
324 |
if (!id.startsWith("w_"))
|
|
325 |
id = "w_" + id.substring(1);
|
|
326 |
id = AsciiUtils.buildId(id);
|
|
327 |
}
|
|
328 |
|
318 |
329 |
processor.addProperty(id, properties);
|
319 |
330 |
nWords2++;
|
320 |
331 |
}
|