Révision 2941
tmp/org.txm.utils/src/org/txm/utils/i18n/LangFormater.java (revision 2941) | ||
---|---|---|
127 | 127 |
* @return the string |
128 | 128 |
*/ |
129 | 129 |
public static String format(String str, String lang) { |
130 |
if (lang == null) |
|
130 |
if (lang == null) {
|
|
131 | 131 |
lang = Locale.getDefault().getLanguage(); |
132 |
} |
|
133 |
|
|
132 | 134 |
for (String punc : getNoSpaceAfter(lang)) { |
133 | 135 |
str = str.replace(punc + " ", punc); //$NON-NLS-1$ |
134 | 136 |
} |
... | ... | |
140 | 142 |
return str; |
141 | 143 |
} |
142 | 144 |
|
145 |
public static boolean isSpaceAfterNotNeeded(String str, String lang) { |
|
146 |
for (String t : getNoSpaceAfter(lang)) { |
|
147 |
if (str.endsWith(t)) { |
|
148 |
return true; |
|
149 |
} |
|
150 |
} |
|
151 |
return false; |
|
152 |
} |
|
153 |
|
|
154 |
public static boolean isSpaceBeforeNotNeeded(String str, String lang) { |
|
155 |
for (String t : getNoSpaceBefore(lang)) { |
|
156 |
if (str.startsWith(t)) { |
|
157 |
return true; |
|
158 |
} |
|
159 |
} |
|
160 |
return false; |
|
161 |
} |
|
162 |
|
|
143 | 163 |
/** |
144 | 164 |
* Format. |
145 | 165 |
* |
tmp/org.txm.tigersearch.rcp/plugin.xml (revision 2941) | ||
---|---|---|
36 | 36 |
</command> |
37 | 37 |
<command |
38 | 38 |
categoryId="TIGERSearch4TXM.commands.category" |
39 |
defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotations" |
|
40 |
id="org.txm.tigersearch.commands.ImportCONNLUAnnotations" |
|
41 |
name="Import CONNL-u Annotations..."> |
|
39 |
defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
|
|
40 |
id="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
|
|
41 |
name="Import CONNL-u Annotations from a directory...">
|
|
42 | 42 |
</command> |
43 | 43 |
<command |
44 | 44 |
categoryId="TIGERSearch4TXM.commands.category" |
... | ... | |
46 | 46 |
id="org.txm.tigersearch.commands.ExportCorpusAsCONNLU" |
47 | 47 |
name="Export CONNL-u Annotations..."> |
48 | 48 |
</command> |
49 |
<command |
|
50 |
categoryId="TIGERSearch4TXM.commands.category" |
|
51 |
defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile" |
|
52 |
id="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile" |
|
53 |
name="Import CONNL-u Annotations from a CoNNL-U file..."> |
|
54 |
</command> |
|
49 | 55 |
</extension> |
50 | 56 |
<extension |
51 | 57 |
point="org.eclipse.core.expressions.propertyTesters"> |
... | ... | |
208 | 214 |
</visibleWhen> |
209 | 215 |
</command> |
210 | 216 |
<command |
211 |
commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotations" |
|
217 |
commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile"
|
|
212 | 218 |
icon="icons/functions/UDplus.png" |
213 | 219 |
style="push"> |
214 | 220 |
<visibleWhen |
... | ... | |
225 | 231 |
</or> |
226 | 232 |
</visibleWhen> |
227 | 233 |
</command> |
234 |
<command |
|
235 |
commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory" |
|
236 |
icon="icons/functions/UDplus.png" |
|
237 |
style="push"> |
|
238 |
<visibleWhen |
|
239 |
checkEnabled="false"> |
|
240 |
<or> |
|
241 |
<test |
|
242 |
forcePluginActivation="true" |
|
243 |
property="org.txm.rcp.testers.TIGERSearchReady" |
|
244 |
value="TIGERSearchReady"> |
|
245 |
</test> |
|
246 |
<reference |
|
247 |
definitionId="OneMainCorpusSelected"> |
|
248 |
</reference> |
|
249 |
</or> |
|
250 |
</visibleWhen> |
|
251 |
</command> |
|
228 | 252 |
</menu> |
229 | 253 |
</menuContribution> |
230 | 254 |
<menuContribution |
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotations.java (revision 2941) | ||
---|---|---|
1 |
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate:$ |
|
25 |
// $LastChangedRevision:$ |
|
26 |
// $LastChangedBy:$ |
|
27 |
// |
|
28 |
package org.txm.tigersearch.commands; |
|
29 |
|
|
30 |
import java.io.BufferedReader; |
|
31 |
import java.io.File; |
|
32 |
import java.io.FileFilter; |
|
33 |
import java.io.IOException; |
|
34 |
import java.util.HashMap; |
|
35 |
|
|
36 |
import javax.xml.stream.XMLStreamException; |
|
37 |
|
|
38 |
import org.apache.commons.lang.StringUtils; |
|
39 |
import org.eclipse.core.commands.AbstractHandler; |
|
40 |
import org.eclipse.core.commands.ExecutionEvent; |
|
41 |
import org.eclipse.core.commands.ExecutionException; |
|
42 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
43 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
44 |
import org.kohsuke.args4j.Option; |
|
45 |
import org.txm.core.messages.TXMCoreMessages; |
|
46 |
import org.txm.objects.Text; |
|
47 |
import org.txm.rcp.commands.workspace.UpdateCorpus; |
|
48 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
49 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
|
50 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
|
51 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
52 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
|
53 |
import org.txm.utils.io.FileCopy; |
|
54 |
import org.txm.utils.io.IOUtils; |
|
55 |
import org.txm.utils.logger.Log; |
|
56 |
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection; |
|
57 |
|
|
58 |
import cern.colt.Arrays; |
|
59 |
|
|
60 |
/** |
|
61 |
* Import CONNLU annotations into a TXM corpus |
|
62 |
* |
|
63 |
* IF the corpus already contains CONNLU annotations, they are replaced |
|
64 |
* |
|
65 |
* @author mdecorde. |
|
66 |
*/ |
|
67 |
public class ImportCONNLUAnnotations extends AbstractHandler { |
|
68 |
|
|
69 |
public static final String ID = ImportCONNLUAnnotations.class.getName(); |
|
70 |
|
|
71 |
@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory") |
|
72 |
File connluDirectory; |
|
73 |
|
|
74 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
|
75 |
String propertiesPrefix; |
|
76 |
|
|
77 |
/* |
|
78 |
* (non-Javadoc) |
|
79 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
|
80 |
*/ |
|
81 |
@Override |
|
82 |
public Object execute(final ExecutionEvent event) throws ExecutionException { |
|
83 |
|
|
84 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
|
85 |
|
|
86 |
Object s = selection.getFirstElement(); |
|
87 |
if (!(s instanceof MainCorpus)) { |
|
88 |
Log.warning("Selection is not a corpus. Aborting."); |
|
89 |
return null; |
|
90 |
} |
|
91 |
|
|
92 |
if (!ParametersDialog.open(this)) { |
|
93 |
return null; |
|
94 |
} |
|
95 |
if (connluDirectory == null || !connluDirectory.exists() || !connluDirectory.isDirectory() || connluDirectory.listFiles().length == 0) { |
|
96 |
Log.warning("Error: connlu directory is empty: " + connluDirectory); |
|
97 |
return null; |
|
98 |
} |
|
99 |
|
|
100 |
CQPCorpus corpus = (CQPCorpus) s; |
|
101 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
102 |
|
|
103 |
try { |
|
104 |
return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix); |
|
105 |
} |
|
106 |
catch (Exception e) { |
|
107 |
Log.warning(e); |
|
108 |
e.printStackTrace(); |
|
109 |
} |
|
110 |
|
|
111 |
return null; |
|
112 |
} |
|
113 |
|
|
114 |
/** |
|
115 |
* |
|
116 |
* If imported CONNLU annotations with the same name already exist in the corpus, they are replaced. |
|
117 |
* |
|
118 |
* @param corpus |
|
119 |
* @param connluDirectory |
|
120 |
* @param propertiesPrefix |
|
121 |
* @return always 0 in the current implementation; the number of imported annotations is not reported |
|
122 |
* @throws CqiClientException |
|
123 |
* @throws CqiServerError |
|
124 |
* @throws IOException |
|
125 |
* @throws XMLStreamException |
|
126 |
*/ |
|
127 |
public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix) throws IOException, |
|
128 |
CqiServerError, CqiClientException, XMLStreamException { |
|
129 |
Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluDirectory, mainCorpus, propertiesPrefix)); |
|
130 |
|
|
131 |
File[] files = connluDirectory.listFiles(new FileFilter() { |
|
132 |
|
|
133 |
@Override |
|
134 |
public boolean accept(File file) { |
|
135 |
return file.isFile() && file.getName().endsWith(".conllu"); |
|
136 |
} |
|
137 |
}); |
|
138 |
|
|
139 |
int nTextProcessed = 0; |
|
140 |
int nWords = 0; |
|
141 |
int nWordsInserted = 0; |
|
142 |
for (File coonluFile : files) { |
|
143 |
|
|
144 |
String textid = coonluFile.getName().substring(0, coonluFile.getName().length() - 7); |
|
145 |
Log.info("** processing text: " + textid); |
|
146 |
Text text = mainCorpus.getProject().getText(textid); |
|
147 |
if (text == null) { |
|
148 |
Log.warning("No text found with ID=" + textid); |
|
149 |
continue; |
|
150 |
} |
|
151 |
File xmltxmFile = mainCorpus.getProject().getText(textid).getXMLTXMFile(); |
|
152 |
File xmltxmUpdatedFile = new File(System.getProperty("java.io.tmpdir"), xmltxmFile.getName()); |
|
153 |
|
|
154 |
XMLTXMWordPropertiesInjection processor = new XMLTXMWordPropertiesInjection(xmltxmFile); |
|
155 |
HashMap<String, HashMap<String, String>> rules = new HashMap<>(); |
|
156 |
processor.setProperties(rules); |
|
157 |
|
|
158 |
BufferedReader reader = IOUtils.getReader(coonluFile); |
|
159 |
String line = reader.readLine(); |
|
160 |
String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
161 |
|
|
162 |
int nWords2 = 0; |
|
163 |
int nLine = 0; |
|
164 |
String sent_id = null; |
|
165 |
String newpar_id = null; |
|
166 |
String newdoc_id = null; |
|
167 |
while (line != null) { |
|
168 |
nLine++; |
|
169 |
if (line.length() == 0) { |
|
170 |
line = reader.readLine(); |
|
171 |
continue; // empty line |
|
172 |
} |
|
173 |
|
|
174 |
if (line.startsWith("#")) { |
|
175 |
if (line.startsWith("# sent_id = ")) { |
|
176 |
sent_id = line.substring(12).trim(); |
|
177 |
} |
|
178 |
else if (line.startsWith("# newdoc id = ")) { |
|
179 |
newdoc_id = line.substring(14).trim(); |
|
180 |
} |
|
181 |
else if (line.startsWith("# newpar id = ")) { |
|
182 |
newpar_id = line.substring(14).trim(); |
|
183 |
} |
|
184 |
else { |
|
185 |
// nothing for now |
|
186 |
} |
|
187 |
|
|
188 |
line = reader.readLine(); |
|
189 |
continue; // comment |
|
190 |
} |
|
191 |
|
|
192 |
String[] split = line.split("\t", 10); |
|
193 |
if (split.length < 10) { |
|
194 |
Log.warning("Error: line " + nLine + " : " + line + " -> " + Arrays.toString(split) + " len=" + split.length); |
|
195 |
line = reader.readLine(); |
|
196 |
continue; // malformed line (fewer than 10 columns) |
|
197 |
} |
|
198 |
|
|
199 |
String id = split[9]; |
|
200 |
int from = id.indexOf("XmlId=") + 6; |
|
201 |
if (from < 6) { |
|
202 |
Log.warning("Error: line " + nLine + " with no 'XmlId=': " + line); |
|
203 |
line = reader.readLine(); |
|
204 |
continue; |
|
205 |
} |
|
206 |
id = id.substring(from); |
|
207 |
// System.out.println("ID=" + id); |
|
208 |
|
|
209 |
if (id.contains("-")) continue; // TODO to manage later |
|
210 |
|
|
211 |
HashMap<String, String> properties = new HashMap<>(); |
|
212 |
for (int i = 0; i < split.length; i++) { |
|
213 |
properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#' |
|
214 |
} |
|
215 |
if (sent_id != null) { |
|
216 |
properties.put("#ud-sentid", sent_id); |
|
217 |
sent_id = ""; // reset value for next sentence |
|
218 |
} else { |
|
219 |
properties.put("#ud-sentid", ""); |
|
220 |
} |
|
221 |
if (newdoc_id != null) { |
|
222 |
properties.put("#ud-newdocid", newdoc_id); |
|
223 |
newdoc_id = null; // reset value for next sentence |
|
224 |
} else { |
|
225 |
properties.put("#ud-newdocid", ""); |
|
226 |
} |
|
227 |
if (newpar_id != null) { |
|
228 |
properties.put("#ud-newparid", newpar_id); |
|
229 |
newpar_id = null; // reset value for next sentence |
|
230 |
} else { |
|
231 |
properties.put("#ud-newparid", ""); |
|
232 |
} |
|
233 |
processor.addProperty(id, properties); |
|
234 |
nWords2++; |
|
235 |
line = reader.readLine(); |
|
236 |
} |
|
237 |
reader.close(); |
|
238 |
|
|
239 |
if (nWords2 == 0) { |
|
240 |
Log.warning("** No annotation found in " + coonluFile); |
|
241 |
} |
|
242 |
|
|
243 |
nWords += nWords2; |
|
244 |
|
|
245 |
Log.info("** loading annotations from : " + coonluFile); |
|
246 |
|
|
247 |
if (processor.process(xmltxmUpdatedFile)) { |
|
248 |
if (xmltxmFile.delete() && FileCopy.copy(xmltxmUpdatedFile, xmltxmFile)) { |
|
249 |
if (processor.getNonActivatedRules().size() > 0) { |
|
250 |
Log.warning("Warning: some words were not imported: "+StringUtils.join(processor.getNonActivatedRules(), ", ")); |
|
251 |
} |
|
252 |
} |
|
253 |
else { |
|
254 |
Log.warning("** Warning: annotation import failed for replace the corpus XML-TXM file: " + xmltxmFile + ". TEMP file: " + xmltxmUpdatedFile); |
|
255 |
return 0; |
|
256 |
} |
|
257 |
} |
|
258 |
else { |
|
259 |
Log.warning("** Warning: annotation import failed for text: " + textid); |
|
260 |
return 0; |
|
261 |
} |
|
262 |
|
|
263 |
if (processor.getNInsertions() == 0) { |
|
264 |
Log.warning("** No annotation imported in " + textid); |
|
265 |
} |
|
266 |
|
|
267 |
nWordsInserted += processor.getNInsertions(); |
|
268 |
nTextProcessed++; |
|
269 |
} |
|
270 |
|
|
271 |
if (nTextProcessed == 0) { |
|
272 |
Log.warning("** No text to process. Aborting."); |
|
273 |
return 0; |
|
274 |
} |
|
275 |
|
|
276 |
if (nWords == 0) { |
|
277 |
Log.warning("** No annotation to import in corpus. Aborting."); |
|
278 |
return 0; |
|
279 |
} |
|
280 |
|
|
281 |
if (nWordsInserted == 0) { |
|
282 |
Log.warning("** No annotation imported. Aborting."); |
|
283 |
return 0; |
|
284 |
} |
|
285 |
|
|
286 |
Log.info("XML-TXM source files updated. Updating indexes..."); |
|
287 |
|
|
288 |
UpdateCorpus.update(mainCorpus); |
|
289 |
|
|
290 |
Log.info("Done."); |
|
291 |
|
|
292 |
return 0; |
|
293 |
} |
|
294 |
} |
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotationsFromFile.java (revision 2941) | ||
---|---|---|
1 |
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate:$ |
|
25 |
// $LastChangedRevision:$ |
|
26 |
// $LastChangedBy:$ |
|
27 |
// |
|
28 |
package org.txm.tigersearch.commands; |
|
29 |
|
|
30 |
import java.io.BufferedReader; |
|
31 |
import java.io.File; |
|
32 |
import java.io.FileFilter; |
|
33 |
import java.io.IOException; |
|
34 |
import java.util.HashMap; |
|
35 |
|
|
36 |
import javax.xml.stream.XMLStreamException; |
|
37 |
|
|
38 |
import org.apache.commons.lang.StringUtils; |
|
39 |
import org.eclipse.core.commands.AbstractHandler; |
|
40 |
import org.eclipse.core.commands.ExecutionEvent; |
|
41 |
import org.eclipse.core.commands.ExecutionException; |
|
42 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
43 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
44 |
import org.kohsuke.args4j.Option; |
|
45 |
import org.txm.core.messages.TXMCoreMessages; |
|
46 |
import org.txm.objects.Text; |
|
47 |
import org.txm.rcp.commands.workspace.UpdateCorpus; |
|
48 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
49 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
|
50 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
|
51 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
52 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
|
53 |
import org.txm.utils.io.FileCopy; |
|
54 |
import org.txm.utils.io.IOUtils; |
|
55 |
import org.txm.utils.logger.Log; |
|
56 |
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection; |
|
57 |
|
|
58 |
import cern.colt.Arrays; |
|
59 |
|
|
60 |
/** |
|
61 |
* Import CONNLU annotations into a TXM corpus |
|
62 |
* |
|
63 |
* IF the corpus already contains CONNLU annotations, they are replaced |
|
64 |
* |
|
65 |
* @author mdecorde. |
|
66 |
*/ |
|
67 |
public class ImportCONNLUAnnotationsFromFile extends AbstractHandler { |
|
68 |
|
|
69 |
public static final String ID = ImportCONNLUAnnotationsFromFile.class.getName(); |
|
70 |
|
|
71 |
@Option(name = "connluFile", usage = "CoNLL-U file", widget = "FileOpen", required = true, def = "file.conllu") |
|
72 |
File connluFile; |
|
73 |
|
|
74 |
@Option(name = "textId", usage = "Identifier of the text to update", widget = "String", required = true, def = "text-id") |
|
75 |
String textId; |
|
76 |
|
|
77 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
|
78 |
String propertiesPrefix; |
|
79 |
|
|
80 |
/* |
|
81 |
* (non-Javadoc) |
|
82 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
|
83 |
*/ |
|
84 |
@Override |
|
85 |
public Object execute(final ExecutionEvent event) throws ExecutionException { |
|
86 |
|
|
87 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
|
88 |
|
|
89 |
Object s = selection.getFirstElement(); |
|
90 |
if (!(s instanceof MainCorpus)) { |
|
91 |
Log.warning("Selection is not a corpus. Aborting."); |
|
92 |
return null; |
|
93 |
} |
|
94 |
|
|
95 |
if (!ParametersDialog.open(this)) { |
|
96 |
return null; |
|
97 |
} |
|
98 |
if (connluFile == null || !connluFile.exists() || !connluFile.isFile()) { |
|
99 |
Log.warning("Error: cannot access to the connlu file: " + connluFile); |
|
100 |
return null; |
|
101 |
} |
|
102 |
|
|
103 |
CQPCorpus corpus = (CQPCorpus) s; |
|
104 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
105 |
|
|
106 |
try { |
|
107 |
return ImportCONNLUAnnotationsFromDirectory.importAnnotationsFromCoNNLUFile(mainCorpus, connluFile, propertiesPrefix, textId); |
|
108 |
} |
|
109 |
catch (Exception e) { |
|
110 |
Log.warning(e); |
|
111 |
e.printStackTrace(); |
|
112 |
} |
|
113 |
|
|
114 |
return null; |
|
115 |
} |
|
116 |
} |
|
0 | 117 |
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ExportCorpusAsCONNLU.java (revision 2941) | ||
---|---|---|
73 | 73 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
74 | 74 |
String propertiesPrefix; |
75 | 75 |
|
76 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "options") |
|
76 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comments options")
|
|
77 | 77 |
Boolean separator = false; |
78 | 78 |
|
79 |
// @Option(name = "useUDForms", usage = "use the ud form property instead of CQP 'word' property", widget = "Boolean", required = true, def = "true")
|
|
80 |
Boolean useUDForms = true;
|
|
79 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
|
|
80 |
Boolean insertParagraphs = false;
|
|
81 | 81 |
|
82 |
@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true") |
|
83 |
Boolean detectGap = false; |
|
82 | 84 |
|
83 |
// @Option(name = "transfertAllWords", usage = "Transfert word not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true")
|
|
84 |
Boolean transfertAllWords = true;
|
|
85 |
@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options")
|
|
86 |
Boolean separator3 = false;
|
|
85 | 87 |
|
86 | 88 |
@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true") |
87 | 89 |
Boolean insertNoSpaceAfter = true; |
88 | 90 |
|
89 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
|
|
90 |
Boolean insertParagraphs = false;
|
|
91 |
@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
|
|
92 |
Boolean insertTokenWithoutUdAnnotations;
|
|
91 | 93 |
|
92 |
@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
|
|
93 |
Boolean detectGap = false;
|
|
94 |
@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
|
|
95 |
String defaultFormPropertyName;
|
|
94 | 96 |
|
95 |
@Option(name = "formCorrPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
|
|
96 |
String formCorrPropertyName;
|
|
97 |
@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
|
|
98 |
String defaultLemmaPropertyName;
|
|
97 | 99 |
|
98 |
@Option(name = "lemmaCorrPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
|
|
99 |
String lemmaCorrPropertyName;
|
|
100 |
@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
|
|
101 |
String defaultUposPropertyName;
|
|
100 | 102 |
|
101 |
@Option(name = "uposCorrPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
|
|
102 |
String uposCorrPropertyName;
|
|
103 |
@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
|
|
104 |
String defaultXposPropertyName;
|
|
103 | 105 |
|
104 |
@Option(name = "xposCorrPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "") |
|
105 |
String xposCorrPropertyName; |
|
106 |
|
|
107 |
@Option(name = "punctStrong", usage = "optional strong punct tag to fix sentence limits", widget = "String", required = true, def = "PONfrt") |
|
108 |
String punctStrong; |
|
109 |
|
|
110 | 106 |
@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options") |
111 | 107 |
Boolean separator2 = false; |
112 | 108 |
|
113 | 109 |
@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]") |
114 | 110 |
String openingPunct; |
115 | 111 |
|
112 |
/** |
|
113 |
* the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix |
|
114 |
*/ |
|
116 | 115 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
117 | 116 |
|
118 | 117 |
/* |
... | ... | |
144 | 143 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
145 | 144 |
|
146 | 145 |
try { |
147 |
return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct, |
|
148 |
formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName, punctStrong,
|
|
146 |
return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
|
|
147 |
defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
|
|
149 | 148 |
detectGap, insertParagraphs, insertNoSpaceAfter); |
150 | 149 |
} |
151 | 150 |
catch (Exception e) { |
... | ... | |
157 | 156 |
} |
158 | 157 |
|
159 | 158 |
/** |
159 |
* export the corpus in a directory of conllu files (one per text) |
|
160 | 160 |
* |
161 |
* if aTIGER corpus with the same name already exists, it is replaced |
|
162 |
* |
|
163 |
* @param corpus |
|
164 |
* @param tigerCorpusDirectory |
|
165 |
* @return the number of imported annotations |
|
166 |
* @throws IndexException |
|
167 |
* @throws QueryIndexException |
|
161 |
* @param mainCorpus |
|
162 |
* @param conlluResultDirectory |
|
163 |
* @param prefix |
|
164 |
* @param openingPunct |
|
165 |
* @param insertTokenWithoutUdAnnotations |
|
166 |
* @param defaultFormPropertyName |
|
167 |
* @param defaultLemmaPropertyName |
|
168 |
* @param defaultUposPropertyName |
|
169 |
* @param defaultXposPropertyName |
|
170 |
* @param detectGap |
|
171 |
* @param insertParagraphs |
|
172 |
* @param insertNoSpaceAfter |
|
173 |
* @return the number of annotations exported |
|
174 |
* @throws UnexpectedAnswerException |
|
175 |
* @throws IOException |
|
176 |
* @throws CqiServerError |
|
168 | 177 |
* @throws CqiClientException |
169 |
* @throws CqiServerError |
|
170 |
* @throws IOException |
|
171 |
* @throws UnexpectedAnswerException |
|
172 | 178 |
* @throws InvalidCqpIdException |
173 | 179 |
*/ |
174 |
public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, String formCorrPropertyName, String lemmaCorrPropertyName, |
|
175 |
String uposCorrPropertyName, String xposCorrPropertyName, String punctStrongRegex, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) throws UnexpectedAnswerException, |
|
180 |
public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations, |
|
181 |
String defaultFormPropertyName, String defaultLemmaPropertyName, |
|
182 |
String defaultUposPropertyName, String defaultXposPropertyName, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) |
|
183 |
throws UnexpectedAnswerException, |
|
176 | 184 |
IOException, |
177 | 185 |
CqiServerError, |
178 | 186 |
CqiClientException, InvalidCqpIdException { |
... | ... | |
188 | 196 |
int[] start_limits = mainCorpus.getTextStartLimits(); |
189 | 197 |
int[] end_limits = mainCorpus.getTextEndLimits(); |
190 | 198 |
|
191 |
HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang())); |
|
192 |
HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang())); |
|
199 |
String lang = mainCorpus.getLang(); |
|
200 |
// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang())); |
|
201 |
// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang())); |
|
193 | 202 |
|
194 | 203 |
for (String p : propNames) { |
195 | 204 |
WordProperty wp = mainCorpus.getProperty(prefix + p); |
... | ... | |
199 | 208 |
} |
200 | 209 |
} |
201 | 210 |
|
211 |
if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) { |
|
212 |
Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true"); |
|
213 |
return 0; |
|
214 |
} |
|
215 |
|
|
202 | 216 |
for (int iText = 0; iText < start_limits.length; iText++) { |
203 | 217 |
|
204 | 218 |
// Build corpus positions |
... | ... | |
280 | 294 |
HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, "ud-sentid"); |
281 | 295 |
HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, "ud-newdocid"); |
282 | 296 |
|
283 |
|
|
284 | 297 |
// build sentence, first pass using UD word sentence positions |
285 | 298 |
ArrayList<ArrayList<Integer>> sentences = new ArrayList<>(); |
286 | 299 |
ArrayList<Integer> tmpSentence = new ArrayList<>(); |
... | ... | |
301 | 314 |
tmpSentence.clear(); |
302 | 315 |
} |
303 | 316 |
|
304 |
tmpSentence.add(p); |
|
317 |
if (insertTokenWithoutUdAnnotations) { |
|
318 |
tmpSentence.add(p); // insert all tokens |
|
319 |
} |
|
320 |
else if (idValues[p] != 0) { |
|
321 |
tmpSentence.add(p); // insert only tokens whose id value is not 0 |
|
322 |
} |
|
323 |
|
|
305 | 324 |
} |
306 | 325 |
positions = null; // free memory |
307 | 326 |
|
... | ... | |
358 | 377 |
String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions); |
359 | 378 |
|
360 | 379 |
String[] words = null; |
361 |
if (formCorrPropertyName != null && formCorrPropertyName.length() > 0) {
|
|
362 |
words = getDefaultValues(mainCorpus, formCorrPropertyName, sentencePositions);
|
|
380 |
if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) {
|
|
381 |
words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions);
|
|
363 | 382 |
} |
364 | 383 |
String[] lemmas = null; |
365 |
if (lemmaCorrPropertyName != null && lemmaCorrPropertyName.length() > 0) {
|
|
366 |
lemmas = getDefaultValues(mainCorpus, lemmaCorrPropertyName, sentencePositions);
|
|
384 |
if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) {
|
|
385 |
lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions);
|
|
367 | 386 |
} |
368 | 387 |
String[] upos = null; |
369 |
if (uposCorrPropertyName != null && uposCorrPropertyName.length() > 0) {
|
|
370 |
upos = getDefaultValues(mainCorpus, uposCorrPropertyName, sentencePositions);
|
|
388 |
if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) {
|
|
389 |
upos = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
|
|
371 | 390 |
} |
372 | 391 |
String[] xpos = null; |
373 |
if (xposCorrPropertyName != null && xposCorrPropertyName.length() > 0) {
|
|
374 |
xpos = getDefaultValues(mainCorpus, xposCorrPropertyName, sentencePositions);
|
|
392 |
if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
|
|
393 |
xpos = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
|
|
375 | 394 |
} |
376 | 395 |
|
377 | 396 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions); |
... | ... | |
379 | 398 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
380 | 399 |
// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
381 | 400 |
|
382 |
HashMap<Integer, Integer> sentIds = new HashMap<>();
|
|
401 |
// fix ud properties using CQP values
|
|
383 | 402 |
for (int ip = 0; ip < sentence.size(); ip++) { |
403 |
|
|
384 | 404 |
int p = sentence.get(ip); |
385 | 405 |
|
386 |
if (idValues[p] != 0) { // store "old id -> new id" |
|
387 |
sentIds.put(idValues[p], (ip + 1)); // from 1 to N |
|
388 |
} |
|
389 |
|
|
390 | 406 |
// new word |
391 | 407 |
if (miscValues[p].equals("_")) { |
392 | 408 |
miscValues[p] = "XmlId=" + ids[ip]; |
... | ... | |
405 | 421 |
if (xpos != null && xposValues[p].equals("_")) { |
406 | 422 |
xposValues[p] = xpos[ip]; |
407 | 423 |
} |
408 |
|
|
409 |
if (insertNoSpaceAfter) { // inserttino activated |
|
424 |
} |
|
425 |
|
|
426 |
if (insertNoSpaceAfter) { |
|
427 |
for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed |
|
428 |
int p = sentence.get(ip); |
|
429 |
// insertion activated |
|
410 | 430 |
if (!miscValues[p].contains("SpaceAfter=")) { // only update if not present |
411 |
if (afterSpacesRules.contains(formValues[p])) {
|
|
412 |
miscValues[p] += " SpaceAfter=Yes";
|
|
431 |
if (LangFormater.isSpaceAfterNotNeeded(formValues[p], lang)) {
|
|
432 |
miscValues[p] += "|SpaceAfter=No";
|
|
413 | 433 |
} |
414 |
else if (formValues.length < p + 1 && beforeSpacesRules.contains(formValues[p + 1])) { |
|
415 |
miscValues[p] += " SpaceAfter=Yes"; |
|
434 |
else if (formValues.length > (p + 1) && LangFormater.isSpaceBeforeNotNeeded(formValues[p + 1], lang)) { |
|
435 |
// if the next token must not be preceded by a space, set SpaceAfter=No on the current token |
|
436 |
miscValues[p] += "|SpaceAfter=No"; |
|
416 | 437 |
} |
417 | 438 |
} |
418 | 439 |
} |
419 |
|
|
440 |
} |
|
420 | 441 |
|
442 |
// fixing sentence punct limits |
|
443 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) { |
|
444 |
System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence); |
|
445 |
int p2 = sentence.remove(0); |
|
446 |
sentences.get(iSentence - 1).add(p2); |
|
421 | 447 |
} |
448 |
// |
|
449 |
while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) { |
|
450 |
System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence); |
|
451 |
int p2 = sentence.remove(sentence.size() - 1); |
|
452 |
sentences.get(iSentence + 1).add(0, p2); |
|
453 |
} |
|
422 | 454 |
|
423 |
|
|
424 |
// fixing sentence punct limits |
|
425 |
|
|
426 |
|
|
427 |
// while (sentence.size() > 0 && iSentence > 0 && xpos[0].matches(punctStrongRegex)) { |
|
428 |
// System.out.println("FIXING: first punctStrong position " + xposValues[sentence.get(0)] + " in " + iSentence); |
|
429 |
// int p2 = sentence.remove(0); |
|
430 |
// sentences.get(iSentence - 1).add(p2); |
|
431 |
// } |
|
432 |
|
|
433 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) { |
|
434 |
System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence); |
|
435 |
int p2 = sentence.remove(0); |
|
436 |
sentences.get(iSentence - 1).add(p2); |
|
437 |
} |
|
438 |
// |
|
439 |
while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) { |
|
440 |
System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence); |
|
441 |
int p2 = sentence.remove(sentence.size() - 1); |
|
442 |
sentences.get(iSentence + 1).add(0, p2); |
|
443 |
} |
|
444 |
|
|
445 | 455 |
if (sentence.size() == 0) { // sentence was depleted after fixing it |
446 | 456 |
sentences.remove(iSentence); |
447 | 457 |
iSentence--; |
448 | 458 |
continue; |
449 | 459 |
} |
460 |
} |
|
461 |
|
|
462 |
for (int s = 0; s < sentences.size(); s++) { |
|
450 | 463 |
|
464 |
// fix only ud sentences limits |
|
465 |
ArrayList<Integer> sentence = sentences.get(s); |
|
466 |
HashMap<Integer, Integer> oldToNewIds = new HashMap<>(); |
|
467 |
for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids |
|
468 |
int p = sentence.get(ip); |
|
469 |
|
|
470 |
if (idValues[p] != 0) { // store "old id -> new id" |
|
471 |
oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N |
|
472 |
} |
|
473 |
} |
|
474 |
|
|
451 | 475 |
// fixing head and set missing head to 0 and root |
452 | 476 |
for (int ip = 0; ip < sentence.size(); ip++) { |
453 | 477 |
int p = sentence.get(ip); |
... | ... | |
456 | 480 |
idValues[p] = (ip + 1); // from 1 to N |
457 | 481 |
|
458 | 482 |
// fixing head values |
459 |
if (sentIds.containsKey(headValues[p])) {
|
|
460 |
headValues[p] = sentIds.get(headValues[p]);
|
|
483 |
if (oldToNewIds.containsKey(headValues[p])) {
|
|
484 |
headValues[p] = oldToNewIds.get(headValues[p]);
|
|
461 | 485 |
} |
462 |
else { // new word, set to default values |
|
463 |
headValues[p] = -1;
|
|
486 |
else if (headValues[p] != 0) { // new word, set to default values
|
|
487 |
headValues[p] = 0;
|
|
464 | 488 |
deprelValues[p] = "_"; |
465 | 489 |
depsValues[p] = "_"; |
466 | 490 |
} |
... | ... | |
485 | 509 |
gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions); |
486 | 510 |
} |
487 | 511 |
|
488 |
|
|
489 |
|
|
490 | 512 |
String[] tokens = new String[sentence.size()]; |
491 | 513 |
for (int ip = 0; ip < sentence.size(); ip++) { |
492 | 514 |
tokens[ip] = formValues[sentence.get(ip)]; |
... | ... | |
506 | 528 |
writer.println("# newdoc id = " + textIds[iText]); |
507 | 529 |
} |
508 | 530 |
|
509 |
if (sentidStartPositions.containsKey(sentence.get(0))) { |
|
510 |
writer.println("# sent_id = " + sentidStartPositions.get(sentence.get(0))); |
|
531 |
boolean foundSentId = false; |
|
532 |
for (int ip : sentence) { |
|
533 |
if (!foundSentId && sentidStartPositions.containsKey(ip)) { |
|
534 |
writer.println("# sent_id = " + sentidStartPositions.get(ip)); |
|
535 |
foundSentId = true; |
|
536 |
} |
|
511 | 537 |
} |
512 |
else {
|
|
538 |
if (!foundSentId) { // no sent_id found
|
|
513 | 539 |
writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new"); |
514 | 540 |
} |
515 | 541 |
|
... | ... | |
519 | 545 |
iParagraph++; |
520 | 546 |
} |
521 | 547 |
|
522 |
for (int ip = 0 ; ip < sentence.size() ; ip++) {
|
|
548 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
|
523 | 549 |
int p = sentence.get(ip); |
524 |
|
|
550 |
|
|
525 | 551 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
526 | 552 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p] |
527 | 553 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p] |
... | ... | |
538 | 564 |
numberOfTextsWritten++; |
539 | 565 |
} |
540 | 566 |
|
541 |
System.out.println("N words written: " + numberOfWordsWritten);
|
|
542 |
System.out.println("N sentences written: " + numberOfSentencesWritten);
|
|
543 |
System.out.println("N texts written: " + numberOfTextsWritten);
|
|
567 |
System.out.println("# words written: " + numberOfWordsWritten);
|
|
568 |
System.out.println("# sentences written: " + numberOfSentencesWritten);
|
|
569 |
System.out.println("# texts written: " + numberOfTextsWritten);
|
|
544 | 570 |
|
545 | 571 |
return numberOfWordsWritten; |
546 | 572 |
} |
... | ... | |
579 | 605 |
} |
580 | 606 |
} |
581 | 607 |
} |
608 |
|
|
609 |
|
tmp/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportCONNLUAnnotationsFromDirectory.java (revision 2941) | ||
---|---|---|
1 |
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate:$ |
|
25 |
// $LastChangedRevision:$ |
|
26 |
// $LastChangedBy:$ |
|
27 |
// |
|
28 |
package org.txm.tigersearch.commands; |
|
29 |
|
|
30 |
import java.io.BufferedReader; |
|
31 |
import java.io.File; |
|
32 |
import java.io.FileFilter; |
|
33 |
import java.io.IOException; |
|
34 |
import java.util.HashMap; |
|
35 |
|
|
36 |
import javax.xml.stream.XMLStreamException; |
|
37 |
|
|
38 |
import org.apache.commons.lang.StringUtils; |
|
39 |
import org.eclipse.core.commands.AbstractHandler; |
|
40 |
import org.eclipse.core.commands.ExecutionEvent; |
|
41 |
import org.eclipse.core.commands.ExecutionException; |
|
42 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
43 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
44 |
import org.kohsuke.args4j.Option; |
|
45 |
import org.txm.core.messages.TXMCoreMessages; |
|
46 |
import org.txm.objects.Text; |
|
47 |
import org.txm.rcp.commands.workspace.UpdateCorpus; |
|
48 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
49 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
|
50 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
|
51 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
52 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
|
53 |
import org.txm.utils.io.FileCopy; |
|
54 |
import org.txm.utils.io.IOUtils; |
|
55 |
import org.txm.utils.logger.Log; |
|
56 |
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection; |
|
57 |
|
|
58 |
import cern.colt.Arrays; |
|
59 |
|
|
60 |
/** |
|
61 |
* Import CONNLU annotations into a TXM corpus |
|
62 |
* |
|
63 |
* IF the corpus already contains CONNLU annotations, they are replaced |
|
64 |
* |
|
65 |
* @author mdecorde. |
|
66 |
*/ |
|
67 |
public class ImportCONNLUAnnotationsFromDirectory extends AbstractHandler { |
|
68 |
|
|
69 |
public static final String ID = ImportCONNLUAnnotationsFromDirectory.class.getName(); |
|
70 |
|
|
71 |
@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory") |
|
72 |
File connluDirectory; |
|
73 |
|
|
74 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
|
75 |
String propertiesPrefix; |
|
76 |
|
|
77 |
/* |
|
78 |
* (non-Javadoc) |
|
79 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
|
80 |
*/ |
|
81 |
@Override |
|
82 |
public Object execute(final ExecutionEvent event) throws ExecutionException { |
|
83 |
|
|
84 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
|
85 |
|
|
86 |
Object s = selection.getFirstElement(); |
|
87 |
if (!(s instanceof MainCorpus)) { |
|
88 |
Log.warning("Selection is not a corpus. Aborting."); |
|
89 |
return null; |
|
90 |
} |
|
91 |
|
|
92 |
if (!ParametersDialog.open(this)) { |
|
93 |
return null; |
|
94 |
} |
|
95 |
if (connluDirectory == null || !connluDirectory.exists() || !connluDirectory.isDirectory() || connluDirectory.listFiles().length == 0) { |
|
96 |
Log.warning("Error: connlu directory is empty: " + connluDirectory); |
|
97 |
return null; |
|
98 |
} |
|
99 |
|
|
100 |
CQPCorpus corpus = (CQPCorpus) s; |
|
101 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
102 |
|
|
103 |
try { |
|
104 |
return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix); |
|
105 |
} |
|
106 |
catch (Exception e) { |
|
107 |
Log.warning(e); |
|
108 |
e.printStackTrace(); |
|
109 |
} |
|
110 |
|
|
111 |
return null; |
|
112 |
} |
|
113 |
|
|
114 |
/** |
|
115 |
* |
|
116 |
* if import CONNLU annotations in the corpus with the same name already exists, it is replaced |
|
117 |
* |
|
118 |
* @param corpus |
|
119 |
* @param connluDirectory |
|
120 |
* @param propertiesPrefix |
|
121 |
* @return the number of imported annotations |
|
122 |
* @throws CqiClientException |
|
123 |
* @throws CqiServerError |
|
124 |
* @throws IOException |
|
125 |
* @throws XMLStreamException |
|
126 |
*/ |
|
127 |
public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix) throws IOException, |
|
128 |
CqiServerError, CqiClientException, XMLStreamException { |
|
129 |
Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluDirectory, mainCorpus, propertiesPrefix)); |
|
130 |
|
|
131 |
File[] files = connluDirectory.listFiles(new FileFilter() { |
|
132 |
|
|
133 |
@Override |
|
134 |
public boolean accept(File file) { |
|
135 |
return file.isFile() && file.getName().endsWith(".conllu"); |
|
136 |
} |
|
137 |
}); |
|
138 |
|
|
139 |
int nTextProcessed = 0; |
|
140 |
int nWordsInserted = 0; |
|
141 |
for (File coonluFile : files) { |
|
142 |
|
|
143 |
nWordsInserted += _importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null); |
|
144 |
nTextProcessed++; |
|
145 |
} |
|
146 |
|
|
147 |
if (nTextProcessed == 0) { |
|
148 |
Log.warning("** No text to process. Aborting."); |
|
149 |
return 0; |
|
150 |
} |
|
151 |
|
|
152 |
if (nWordsInserted == 0) { |
|
153 |
Log.warning("** No annotation imported. Aborting."); |
|
154 |
return 0; |
|
155 |
} |
|
156 |
|
|
157 |
Log.info("XML-TXM source files updated. Updating indexes..."); |
|
158 |
|
|
159 |
UpdateCorpus.update(mainCorpus); |
|
160 |
|
|
161 |
Log.info("Done."); |
|
162 |
|
|
163 |
return 0; |
|
164 |
} |
|
165 |
|
|
166 |
/** |
|
167 |
* |
|
168 |
* if import CONNLU annotations in the corpus with the same name already exists, it is replaced |
|
169 |
* |
|
170 |
* @param corpus |
|
171 |
* @param connluFile |
|
172 |
* @param propertiesPrefix |
|
173 |
* @return the number of imported annotations |
|
174 |
* @throws CqiClientException |
|
175 |
* @throws CqiServerError |
|
176 |
* @throws IOException |
|
177 |
* @throws XMLStreamException |
|
178 |
*/ |
|
179 |
public static int importAnnotationsFromCoNNLUFile(MainCorpus mainCorpus, File connluFile, String propertiesPrefix, String textId) throws IOException, |
|
180 |
CqiServerError, CqiClientException, XMLStreamException { |
|
181 |
Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluFile, mainCorpus, propertiesPrefix)); |
|
182 |
|
|
183 |
|
|
184 |
int nWordsInserted = _importAnnotations(connluFile, mainCorpus, propertiesPrefix, textId); |
|
185 |
|
|
186 |
if (nWordsInserted == 0) { |
|
187 |
Log.warning("** No annotation imported. Aborting."); |
|
188 |
return 0; |
|
189 |
} |
|
190 |
|
|
191 |
Log.info("XML-TXM source files updated. Updating indexes..."); |
|
192 |
|
|
193 |
UpdateCorpus.update(mainCorpus); |
|
194 |
|
|
195 |
Log.info("Done."); |
|
196 |
|
|
197 |
return 0; |
|
198 |
} |
|
199 |
|
|
200 |
private static int _importAnnotations(File coonluFile, MainCorpus mainCorpus, String propertiesPrefix, String textId) throws IOException, XMLStreamException { |
|
201 |
if (textId == null || textId.length() == 0) { // no text id provided, using the connlu file name |
|
202 |
textId = coonluFile.getName().substring(0, coonluFile.getName().length() - 7); |
|
203 |
} |
|
204 |
Log.info("** processing text: " + textId); |
|
205 |
Text text = mainCorpus.getProject().getText(textId); |
|
206 |
if (text == null) { |
|
207 |
Log.warning("No text found with ID=" + textId); |
|
208 |
return 0; |
|
209 |
} |
|
210 |
File xmltxmFile = mainCorpus.getProject().getText(textId).getXMLTXMFile(); |
|
211 |
File xmltxmUpdatedFile = new File(System.getProperty("java.io.tmpdir"), xmltxmFile.getName()); |
|
212 |
|
|
213 |
XMLTXMWordPropertiesInjection processor = new XMLTXMWordPropertiesInjection(xmltxmFile); |
|
214 |
HashMap<String, HashMap<String, String>> rules = new HashMap<>(); |
|
215 |
processor.setProperties(rules); |
|
216 |
|
|
217 |
BufferedReader reader = IOUtils.getReader(coonluFile); |
|
218 |
String line = reader.readLine(); |
|
219 |
String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
220 |
|
|
221 |
int nWords2 = 0; |
|
222 |
int nLine = 0; |
|
223 |
String sent_id = null; |
|
224 |
String newpar_id = null; |
|
225 |
String newdoc_id = null; |
|
226 |
while (line != null) { |
|
227 |
nLine++; |
|
228 |
if (line.length() == 0) { |
|
229 |
line = reader.readLine(); |
|
230 |
continue; // comment |
|
231 |
} |
|
232 |
|
|
233 |
if (line.startsWith("#")) { |
|
234 |
if (line.startsWith("# sent_id = ")) { |
|
235 |
sent_id = line.substring(12).trim(); |
|
236 |
} |
|
237 |
else if (line.startsWith("# newdoc id = ")) { |
|
238 |
newdoc_id = line.substring(14).trim(); |
|
239 |
} |
|
240 |
else if (line.startsWith("# newpar id = ")) { |
|
241 |
newpar_id = line.substring(14).trim(); |
|
242 |
} |
|
243 |
else { |
|
244 |
// nothing for now |
|
245 |
} |
|
246 |
|
|
247 |
line = reader.readLine(); |
|
248 |
continue; // comment |
|
249 |
} |
|
250 |
|
|
251 |
String[] split = line.split("\t", 10); |
|
252 |
if (split.length < 10) { |
|
253 |
Log.warning("Error: line " + nLine + " : " + line + " -> " + Arrays.toString(split) + " len=" + split.length); |
|
254 |
line = reader.readLine(); |
|
255 |
continue; // comment |
|
256 |
} |
|
257 |
|
|
258 |
String misc = split[9]; |
|
259 |
String[] miscValues = misc.split("\\|"); |
|
260 |
String id = null; |
|
261 |
for (String miscValue : miscValues) { |
|
262 |
if (miscValue.startsWith("XmlId=")) { |
|
263 |
id = miscValue.substring(6); |
|
264 |
} |
|
265 |
} |
|
266 |
|
|
267 |
HashMap<String, String> properties = new HashMap<>(); |
|
268 |
for (int i = 0; i < split.length; i++) { |
|
269 |
properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#' |
|
270 |
} |
|
271 |
|
|
272 |
if (sent_id != null) { |
|
273 |
properties.put("#ud-sentid", sent_id); |
|
274 |
sent_id = ""; // reset value for next sentence |
|
275 |
} |
|
276 |
else { |
|
277 |
properties.put("#ud-sentid", ""); |
|
278 |
} |
|
279 |
|
|
280 |
if (newdoc_id != null) { |
|
281 |
properties.put("#ud-newdocid", newdoc_id); |
|
282 |
newdoc_id = null; // reset value for next sentence |
|
283 |
} |
|
284 |
else { |
|
285 |
properties.put("#ud-newdocid", ""); |
|
286 |
} |
|
287 |
|
|
288 |
if (newpar_id != null) { |
|
289 |
properties.put("#ud-newparid", newpar_id); |
|
290 |
newpar_id = null; // reset value for next sentence |
|
291 |
} |
|
292 |
else { |
|
293 |
properties.put("#ud-newparid", ""); |
|
294 |
} |
|
295 |
|
|
296 |
if (id == null) { |
|
297 |
Log.warning("No 'XmlId=' found for UD line: " + line); |
|
298 |
} |
|
299 |
else { |
|
300 |
processor.addProperty(id, properties); |
|
301 |
nWords2++; |
|
302 |
} |
|
303 |
line = reader.readLine(); |
|
304 |
} |
|
305 |
reader.close(); |
|
306 |
|
|
307 |
if (nWords2 == 0) { |
|
308 |
Log.warning("** No annotation to import in " + coonluFile); |
|
309 |
return 0; |
|
310 |
} |
|
311 |
|
|
312 |
Log.info("** loading annotations from : " + coonluFile); |
|
313 |
if (processor.process(xmltxmUpdatedFile)) { |
|
314 |
if (xmltxmFile.delete() && FileCopy.copy(xmltxmUpdatedFile, xmltxmFile)) { |
|
315 |
if (processor.getNonActivatedRules().size() > 0) { |
|
316 |
Log.warning("Warning: some words were not imported: " + StringUtils.join(processor.getNonActivatedRules(), ", ")); |
|
317 |
} |
|
318 |
} |
|
319 |
else { |
|
320 |
Log.warning("** Warning: annotation import failed for replace the corpus XML-TXM file: " + xmltxmFile + ". TEMP file: " + xmltxmUpdatedFile); |
|
321 |
return 0; |
|
322 |
} |
|
323 |
} |
|
324 |
else { |
|
325 |
Log.warning("** Warning: annotation import failed for text: " + textId); |
|
326 |
return 0; |
|
327 |
} |
|
328 |
|
|
329 |
if (processor.getNInsertions() == 0) { |
|
330 |
Log.warning("** No annotation imported in " + textId); |
|
331 |
} |
|
332 |
|
|
333 |
return processor.getNInsertions(); |
|
334 |
} |
|
335 |
} |
|
0 | 336 |
Formats disponibles : Unified diff