Révision 3731
TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/commands/function/WordPropertiesFromTable.java (revision 3731) | ||
---|---|---|
30 | 30 |
|
31 | 31 |
MainCorpus corpus = (MainCorpus)first; |
32 | 32 |
|
33 |
File script = new File(Toolbox.getTxmHomePath(), "/scripts/groovy/user/org/txm/macro/annotation/ImportWordPropertiesFromTableMacro.groovy"); //$NON-NLS-1$
|
|
33 |
File script = new File(Toolbox.getTxmHomePath(), "/scripts/groovy/user/org/txm/macro/annotation/ImportWordPropertiesFromTableMacro.groovy"); //$NON-NLS-1$ |
|
34 | 34 |
//File parametersFile = new File(Toolbox.getTxmHomePath(), "/scripts/groovy/user/org/txm/macro/annotation/ImportWordPropertiesFromTableMacro.properties"); |
35 | 35 |
|
36 | 36 |
HashMap<String, Object> defaultParameters = new HashMap<String, Object>(); |
... | ... | |
38 | 38 |
defaultParameters.put("csvFile", corpus.getName()+"_annotations.tsv"); //$NON-NLS-1$ //$NON-NLS-2$ |
39 | 39 |
|
40 | 40 |
ExecuteGroovyMacro.execute(script.getAbsolutePath(), part, selection, null, null, defaultParameters); //$NON-NLS-1$ |
41 |
|
|
42 |
|
|
41 | 43 |
return null; |
42 | 44 |
} |
43 | 45 |
|
TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/commands/workspace/UpdateCorpus.java (revision 3731) | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.File; |
4 | 4 |
import java.io.FileFilter; |
5 |
import java.util.Date; |
|
5 | 6 |
|
6 | 7 |
import org.eclipse.core.commands.AbstractHandler; |
7 | 8 |
import org.eclipse.core.commands.ExecutionEvent; |
... | ... | |
15 | 16 |
import org.eclipse.osgi.util.NLS; |
16 | 17 |
import org.eclipse.swt.widgets.Display; |
17 | 18 |
import org.eclipse.ui.handlers.HandlerUtil; |
19 |
import org.txm.Toolbox; |
|
18 | 20 |
import org.txm.core.preferences.TBXPreferences; |
19 | 21 |
import org.txm.objects.Project; |
20 | 22 |
import org.txm.rcp.commands.CloseEditorsUsing; |
... | ... | |
144 | 146 |
try { |
145 | 147 |
if (project.compute(monitor, true)) { // TODO children should be recomputed later only when the user needs it |
146 | 148 |
|
149 |
project.appendToHistory("Updated"); |
|
150 |
|
|
147 | 151 |
this.syncExec(new Runnable() { |
148 | 152 |
|
149 | 153 |
@Override |
TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/swt/widget/QueryWidget.java (revision 3731) | ||
---|---|---|
78 | 78 |
|
79 | 79 |
h = null; |
80 | 80 |
|
81 |
// if (this.project != null) { |
|
82 |
// h = this.project.getFirstChild(QueryHistory.class); |
|
83 |
// |
|
84 |
// if (h == null) { |
|
85 |
// h = new QueryHistory(project); |
|
86 |
// } |
|
87 |
// } |
|
88 |
// |
|
89 |
// try { // load history from queries.txt file |
|
90 |
// h.compute(false); |
|
91 |
// } catch (InterruptedException e) { |
|
92 |
// // TODO Auto-generated catch block |
|
93 |
// e.printStackTrace(); |
|
94 |
// } |
|
81 |
if (this.project != null) { |
|
82 |
h = this.project.getFirstChild(QueryHistory.class); |
|
83 |
|
|
84 |
if (h == null) { |
|
85 |
h = new QueryHistory(project); |
|
86 |
} |
|
87 |
} |
|
95 | 88 |
|
89 |
try { // load history from queries.txt file |
|
90 |
h.compute(false); |
|
91 |
} catch (InterruptedException e) { |
|
92 |
// TODO Auto-generated catch block |
|
93 |
e.printStackTrace(); |
|
94 |
} |
|
95 |
|
|
96 | 96 |
setHistoryItems(); |
97 | 97 |
} |
98 | 98 |
|
TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/swt/widget/parameters/SeparatorField.java (revision 3731) | ||
---|---|---|
1 | 1 |
package org.txm.rcp.swt.widget.parameters; |
2 | 2 |
|
3 | 3 |
import org.eclipse.swt.SWT; |
4 |
import org.eclipse.swt.graphics.Font; |
|
5 |
import org.eclipse.swt.graphics.FontData; |
|
6 | 4 |
import org.eclipse.swt.layout.GridData; |
7 | 5 |
import org.eclipse.swt.layout.GridLayout; |
8 | 6 |
import org.eclipse.swt.widgets.Composite; |
9 | 7 |
import org.eclipse.swt.widgets.Label; |
10 |
import org.eclipse.swt.widgets.Text; |
|
11 | 8 |
import org.kohsuke.args4j.NamedOptionDef; |
12 | 9 |
|
13 | 10 |
/** |
... | ... | |
37 | 34 |
l.setLayoutData(gd); |
38 | 35 |
l.setText(str); |
39 | 36 |
l.setToolTipText(getWidgetUsage()); |
40 |
Font f = parent.getFont(); |
|
41 |
l.setFont(new Font(f.getDevice(), new FontData(f.getFontData()[0].getName(), f.getFontData()[0].getHeight(), f.getFontData()[0].getStyle()|SWT.BOLD)));
|
|
37 |
// Font f = parent.getFont();
|
|
38 |
// l.setFont(new Font(Display.getCurrent(), new FontData(f.getFontData()[0].getName(), f.getFontData()[0].getHeight(), f.getFontData()[0].getStyle()|SWT.BOLD)));
|
|
42 | 39 |
Label dt = new Label(this, SWT.SEPARATOR | SWT.HORIZONTAL); |
43 | 40 |
dt.setLayoutData(new GridData(SWT.FILL, SWT.END, true, false)); |
44 | 41 |
dt.setToolTipText(getWidgetUsage()); |
TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/ApplicationWorkbenchAdvisor.java (revision 3731) | ||
---|---|---|
513 | 513 |
*/ |
514 | 514 |
@Override |
515 | 515 |
public void postShutdown() { |
516 |
|
|
516 | 517 |
callPreStopScript(); |
517 | 518 |
|
518 | 519 |
Toolbox.shutdown(); |
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportTIGERAnnotations.java (revision 3731) | ||
---|---|---|
232 | 232 |
Log.info("Finalizing TIGERSearch corpus"); |
233 | 233 |
if (numberOfWordsAnnotated > 0) { // copy the TIGERcorpus to import |
234 | 234 |
FileCopy.copyFiles(tigerCorpusDirectory, tigerCorpusExistingDirectory); |
235 |
|
|
236 |
corpus.getProject().appendToHistory("TIGER Annotations imported from "+tigerDirectory); |
|
237 |
|
|
235 | 238 |
Log.info("Done. " + numberOfWordsAnnotated + " words annotated."); |
236 | 239 |
} |
237 | 240 |
else { |
TXM/trunk/bundles/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/ImportGlozzAnnotations.java (revision 3731) | ||
---|---|---|
95 | 95 |
Log.info(Messages.ImportGlozzAnnotations_9); |
96 | 96 |
URSCorpora.saveCorpus(analecCorpus); |
97 | 97 |
|
98 |
mainCorpus.getProject().appendToHistory("URS Annotations imported from "+aafile+", "+aamfile+" and "+acfile); |
|
99 |
|
|
98 | 100 |
Log.info(Messages.ImportGlozzAnnotations_10); |
99 | 101 |
return true; |
100 | 102 |
} |
TXM/trunk/bundles/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/SaveCorpus.java (revision 3731) | ||
---|---|---|
87 | 87 |
Log.warning(Messages.SaveCorpus_5); |
88 | 88 |
return false; |
89 | 89 |
} |
90 |
|
|
91 |
mainCorpus.getProject().appendToHistory("URS annotations saved"); |
|
90 | 92 |
|
91 | 93 |
mainCorpus.setIsModified(false); |
92 | 94 |
if (event != null) { |
TXM/trunk/bundles/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/ImportTEIAnnotations.java (revision 3731) | ||
---|---|---|
76 | 76 |
Log.warning(Messages.ImportTEIAnnotations_2); |
77 | 77 |
return Status.CANCEL_STATUS; |
78 | 78 |
} else { |
79 |
mainCorpus.getProject().appendToHistory("URS annotations imported from "+ directory +" : "+analecCorpus.getStructure().toString()); |
|
79 | 80 |
return Status.OK_STATUS; |
80 | 81 |
} |
81 | 82 |
} catch (Throwable e) { |
... | ... | |
136 | 137 |
if (ret) { |
137 | 138 |
Log.info(TXMCoreMessages.bind(Messages.ImportTEIAnnotations_5, analecCorpus.getToutesUnites().size(), analecCorpus.getToutesRelations().size(), analecCorpus.getTousSchemas().size())); |
138 | 139 |
mainCorpus.setIsModified(true); |
140 |
|
|
141 |
mainCorpus.getProject().appendToHistory("URS Annotations imported from TEI files of "+annotationDirectory); |
|
142 |
|
|
139 | 143 |
CorporaView.refreshObject(mainCorpus); |
140 | 144 |
} |
141 | 145 |
return ret; |
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsFullCoNLLU.java (revision 3731) | ||
---|---|---|
42 | 42 |
import org.eclipse.osgi.util.NLS; |
43 | 43 |
import org.eclipse.ui.handlers.HandlerUtil; |
44 | 44 |
import org.kohsuke.args4j.Option; |
45 |
import org.txm.conllu.core.function.ImportCoNLLUAnnotations; |
|
45 | 46 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
46 | 47 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
47 | 48 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
... | ... | |
63 | 64 |
* @author mdecorde. |
64 | 65 |
*/ |
65 | 66 |
public class ExportCorpusAsFullCoNLLU extends AbstractHandler { |
66 |
|
|
67 |
|
|
67 | 68 |
public static final String ID = ExportCorpusAsFullCoNLLU.class.getName(); |
68 |
|
|
69 |
|
|
69 | 70 |
@Option(name = "conlluResultDirectory", usage = "conlluResultDirectory", widget = "Folder", required = true, def = "conllu-result-directory") |
70 | 71 |
File conlluResultDirectory; |
71 |
|
|
72 |
|
|
72 | 73 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
73 | 74 |
String propertiesPrefix; |
74 |
|
|
75 |
|
|
75 | 76 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties") |
76 | 77 |
Boolean separator = false; |
77 |
|
|
78 |
|
|
78 | 79 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the CoNLLU corpus", widget = "Boolean", required = true, def = "true") |
79 | 80 |
Boolean insertParagraphs = false; |
80 |
|
|
81 |
|
|
81 | 82 |
@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true") |
82 | 83 |
Boolean detectGap = false; |
83 |
|
|
84 |
|
|
84 | 85 |
@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options") |
85 | 86 |
Boolean separator3 = false; |
86 |
|
|
87 |
|
|
87 | 88 |
@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the CoNLLU corpus", widget = "Boolean", required = true, def = "true") |
88 | 89 |
Boolean insertNoSpaceAfter = true; |
89 |
|
|
90 |
|
|
90 | 91 |
@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false") |
91 | 92 |
Boolean insertTokenWithoutUdAnnotations; |
92 |
|
|
93 |
|
|
93 | 94 |
// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
94 | 95 |
@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties") |
95 | 96 |
Boolean separator_properties = false; |
96 |
|
|
97 |
|
|
97 | 98 |
@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "") |
98 | 99 |
String defaultFormPropertyName; |
99 |
|
|
100 |
|
|
100 | 101 |
@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "") |
101 | 102 |
String defaultLemmaPropertyName; |
102 |
|
|
103 |
|
|
103 | 104 |
@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "") |
104 | 105 |
String defaultUposPropertyName; |
105 |
|
|
106 |
|
|
106 | 107 |
@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "") |
107 | 108 |
String defaultXposPropertyName; |
108 |
|
|
109 |
|
|
109 | 110 |
@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "") |
110 | 111 |
String defaultFeatsPropertyName; |
111 |
|
|
112 |
|
|
112 | 113 |
@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "") |
113 | 114 |
String defaultHeadPropertyName; |
114 |
|
|
115 |
|
|
115 | 116 |
@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "") |
116 | 117 |
String defaultDeprelPropertyName; |
117 |
|
|
118 |
|
|
118 | 119 |
@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "") |
119 | 120 |
String defaultDepsPropertyName; |
120 |
|
|
121 |
|
|
121 | 122 |
@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "") |
122 | 123 |
String defaultMiscPropertyName; |
123 |
|
|
124 |
|
|
124 | 125 |
@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options") |
125 | 126 |
Boolean separator2 = false; |
126 |
|
|
127 |
|
|
127 | 128 |
@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]") |
128 | 129 |
String openingPunct; |
129 |
|
|
130 |
|
|
130 | 131 |
/** |
131 | 132 |
* the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix |
132 | 133 |
*/ |
133 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
134 |
|
|
134 |
public static String[] propNames = ImportCoNLLUAnnotations.UD_PROPERTY_NAMES;
|
|
135 |
|
|
135 | 136 |
/* |
136 | 137 |
* (non-Javadoc) |
137 | 138 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
138 | 139 |
*/ |
139 | 140 |
@Override |
140 | 141 |
public Object execute(final ExecutionEvent event) throws ExecutionException { |
141 |
|
|
142 |
|
|
142 | 143 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
143 |
|
|
144 |
|
|
144 | 145 |
Object s = selection.getFirstElement(); |
145 | 146 |
if (!(s instanceof MainCorpus)) { |
146 | 147 |
Log.warning("Selection is not a corpus. Aborting."); |
147 | 148 |
return null; |
148 | 149 |
} |
149 |
|
|
150 |
|
|
150 | 151 |
if (!ParametersDialog.open(this)) { |
151 | 152 |
return null; |
152 | 153 |
} |
153 |
|
|
154 |
|
|
154 | 155 |
conlluResultDirectory.mkdirs(); |
155 | 156 |
if (conlluResultDirectory == null || !conlluResultDirectory.exists() || !conlluResultDirectory.isDirectory()) { |
156 | 157 |
Log.warning("Error: conllu result directory does not exists: " + conlluResultDirectory); |
157 | 158 |
return null; |
158 | 159 |
} |
159 |
|
|
160 |
|
|
160 | 161 |
CQPCorpus corpus = (CQPCorpus) s; |
161 | 162 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
162 |
|
|
163 |
|
|
163 | 164 |
try { |
164 | 165 |
return exportAnnotationsAsCorpus(mainCorpus, conlluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations, |
165 | 166 |
defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName, |
... | ... | |
171 | 172 |
Log.warning(e); |
172 | 173 |
Log.printStackTrace(e); |
173 | 174 |
} |
174 |
|
|
175 |
|
|
175 | 176 |
return null; |
176 | 177 |
} |
177 |
|
|
178 |
|
|
178 | 179 |
/** |
179 | 180 |
* export the corpus in a directory of conllu files (one per text) |
180 | 181 |
* |
... | ... | |
202 | 203 |
String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName, |
203 | 204 |
String defaultMiscPropertyName, |
204 | 205 |
boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) |
205 |
throws UnexpectedAnswerException, |
|
206 |
IOException, |
|
207 |
CqiServerError, |
|
208 |
CqiClientException, InvalidCqpIdException { |
|
209 |
|
|
206 |
throws UnexpectedAnswerException,
|
|
207 |
IOException,
|
|
208 |
CqiServerError,
|
|
209 |
CqiClientException, InvalidCqpIdException {
|
|
210 |
|
|
210 | 211 |
if (!conlluResultDirectory.exists()) { |
211 | 212 |
conlluResultDirectory.mkdirs(); |
212 | 213 |
} |
213 | 214 |
int numberOfWordsWritten = 0; |
214 | 215 |
int numberOfSentencesWritten = 0; |
215 | 216 |
int numberOfTextsWritten = 0; |
216 |
|
|
217 |
|
|
217 | 218 |
String[] textIds = mainCorpus.getCorpusTextIdsList(); |
218 | 219 |
int[] start_limits = mainCorpus.getTextStartLimits(); |
219 | 220 |
int[] end_limits = mainCorpus.getTextEndLimits(); |
220 |
|
|
221 |
|
|
221 | 222 |
String lang = mainCorpus.getLang(); |
222 | 223 |
// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang())); |
223 | 224 |
// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang())); |
224 |
|
|
225 |
|
|
225 | 226 |
for (String p : propNames) { |
226 | 227 |
WordProperty wp = mainCorpus.getProperty(prefix + p); |
227 | 228 |
if (wp == null) { |
228 |
Log.warning("Error: cannot find the Conllu property: " + prefix + p);
|
|
229 |
return 0; |
|
229 |
Log.warning("Warning: cannot find the Conllu property: " + prefix + p);
|
|
230 |
//return 0;
|
|
230 | 231 |
} |
231 | 232 |
} |
232 |
|
|
233 |
|
|
233 | 234 |
if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) { |
234 | 235 |
Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true"); |
235 | 236 |
return 0; |
236 | 237 |
} |
237 |
|
|
238 |
|
|
238 | 239 |
for (int iText = 0; iText < start_limits.length; iText++) { |
239 |
|
|
240 |
|
|
240 | 241 |
// Build corpus positions |
241 | 242 |
int[] positions = new int[end_limits[iText] - start_limits[iText] + 1]; |
242 | 243 |
int tmp = 0; |
... | ... | |
244 | 245 |
positions[tmp++] = n; |
245 | 246 |
} |
246 | 247 |
numberOfWordsWritten += positions.length; |
247 |
|
|
248 |
|
|
248 | 249 |
// Get UD properties |
249 | 250 |
WordProperty wp; |
250 | 251 |
wp = mainCorpus.getProperty(prefix + "id"); |
... | ... | |
259 | 260 |
} |
260 | 261 |
} |
261 | 262 |
tmpValues = null; |
262 |
|
|
263 |
WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form"); |
|
264 |
String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions); |
|
265 |
fixUNDEFValues(formValues); |
|
266 |
|
|
263 |
|
|
264 |
String[] emptyvalues = new String[positions.length]; |
|
265 |
for (int i = 0 ; i < emptyvalues.length ; i++) { |
|
266 |
emptyvalues[i] = "_"; |
|
267 |
} |
|
268 |
|
|
269 |
wp = mainCorpus.getProperty(prefix + "form"); |
|
270 |
String[] formValues = null; |
|
271 |
if (wp != null) { |
|
272 |
formValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
273 |
fixUNDEFValues(formValues); |
|
274 |
} else { |
|
275 |
formValues = emptyvalues; |
|
276 |
} |
|
277 |
|
|
278 |
|
|
267 | 279 |
wp = mainCorpus.getProperty(prefix + "lemma"); |
268 |
String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
269 |
fixUNDEFValues(lemmaValues); |
|
270 |
|
|
280 |
String[] lemmaValues = null; |
|
281 |
if (wp != null) { |
|
282 |
lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
283 |
fixUNDEFValues(lemmaValues); |
|
284 |
} else { |
|
285 |
lemmaValues = emptyvalues; |
|
286 |
} |
|
287 |
|
|
271 | 288 |
wp = mainCorpus.getProperty(prefix + "upos"); |
272 |
String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
273 |
fixUNDEFValues(uposValues); |
|
274 |
|
|
289 |
String[] uposValues = null; |
|
290 |
if (wp != null) { |
|
291 |
uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
292 |
fixUNDEFValues(uposValues); |
|
293 |
} else { |
|
294 |
uposValues = emptyvalues; |
|
295 |
} |
|
296 |
|
|
275 | 297 |
wp = mainCorpus.getProperty(prefix + "xpos"); |
276 |
String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
277 |
fixUNDEFValues(xposValues); |
|
278 |
|
|
298 |
String[] xposValues = null; |
|
299 |
if (wp != null) { |
|
300 |
xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
301 |
fixUNDEFValues(xposValues); |
|
302 |
} else { |
|
303 |
xposValues = emptyvalues; |
|
304 |
} |
|
305 |
|
|
279 | 306 |
wp = mainCorpus.getProperty(prefix + "feats"); |
280 |
String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
281 |
fixUNDEFValues(featsValues); |
|
282 |
|
|
307 |
String[] featsValues = null; |
|
308 |
if (wp != null) { |
|
309 |
featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
310 |
fixUNDEFValues(featsValues); |
|
311 |
} else { |
|
312 |
featsValues = emptyvalues; |
|
313 |
} |
|
314 |
|
|
283 | 315 |
wp = mainCorpus.getProperty(prefix + "head"); |
284 | 316 |
// String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
285 | 317 |
tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
... | ... | |
293 | 325 |
} |
294 | 326 |
} |
295 | 327 |
tmpValues = null; |
296 |
|
|
328 |
|
|
297 | 329 |
wp = mainCorpus.getProperty(prefix + "deprel"); |
298 |
String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
299 |
fixUNDEFValues(deprelValues); |
|
300 |
|
|
330 |
String[] deprelValues = null; |
|
331 |
if (wp != null) { |
|
332 |
deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
333 |
fixUNDEFValues(deprelValues); |
|
334 |
} else { |
|
335 |
deprelValues = emptyvalues; |
|
336 |
} |
|
337 |
|
|
301 | 338 |
wp = mainCorpus.getProperty(prefix + "deps"); |
302 |
String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
303 |
fixUNDEFValues(depsValues); |
|
304 |
|
|
339 |
String[] depsValues = null; |
|
340 |
if (wp != null) { |
|
341 |
depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
342 |
fixUNDEFValues(depsValues); |
|
343 |
} else { |
|
344 |
depsValues = emptyvalues; |
|
345 |
} |
|
346 |
|
|
305 | 347 |
wp = mainCorpus.getProperty(prefix + "misc"); |
306 |
String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
307 |
fixUNDEFValues(miscValues); |
|
308 |
|
|
348 |
String[] miscValues = null; |
|
349 |
if (wp != null) { |
|
350 |
miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
351 |
fixUNDEFValues(miscValues); |
|
352 |
} else { |
|
353 |
miscValues = emptyvalues; |
|
354 |
} |
|
355 |
|
|
309 | 356 |
HashSet<Integer> paragraphsStartPositions = new HashSet<>(); |
310 | 357 |
if (insertParagraphs) { |
311 | 358 |
StructuralUnit p_struct = mainCorpus.getStructuralUnit("p"); |
... | ... | |
319 | 366 |
} |
320 | 367 |
} |
321 | 368 |
} |
322 |
|
|
369 |
|
|
323 | 370 |
HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"sentid"); |
324 | 371 |
HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"newdocid"); |
325 |
|
|
372 |
|
|
326 | 373 |
// build sentence, first pass using UD word sentence positions |
327 | 374 |
ArrayList<ArrayList<Integer>> sentences = new ArrayList<>(); |
328 | 375 |
ArrayList<Integer> tmpSentence = new ArrayList<>(); |
... | ... | |
331 | 378 |
// + featsValues[p] + " head=" |
332 | 379 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
333 | 380 |
if (sentidStartPositions.containsKey(p)) { // new ud sentence |
334 |
|
|
381 |
|
|
335 | 382 |
if (tmpSentence.size() > 0) { |
336 | 383 |
sentences.add(new ArrayList<>(tmpSentence)); |
337 | 384 |
} |
338 |
|
|
385 |
|
|
339 | 386 |
// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " |
340 | 387 |
// feats=" |
341 | 388 |
// + featsValues[p] + " head=" |
342 | 389 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
343 | 390 |
tmpSentence.clear(); |
344 | 391 |
} |
345 |
|
|
392 |
|
|
346 | 393 |
if (insertTokenWithoutUdAnnotations) { |
347 | 394 |
tmpSentence.add(p); // insert all tokens |
348 | 395 |
} |
349 | 396 |
else if (idValues[p] != 0) { |
350 | 397 |
tmpSentence.add(p); // insert all tokens |
351 | 398 |
} |
352 |
|
|
399 |
|
|
353 | 400 |
} |
354 | 401 |
positions = null; // free memory |
355 |
|
|
402 |
|
|
356 | 403 |
// fixing sentences |
357 | 404 |
for (int s = 0; s < sentences.size(); s++) { |
358 |
|
|
405 |
|
|
359 | 406 |
// fix only ud sentences limits |
360 | 407 |
ArrayList<Integer> sentence = sentences.get(s); |
361 |
|
|
408 |
|
|
362 | 409 |
if (sentidStartPositions.get(sentence.get(0)) == null) { |
363 | 410 |
continue; // this is not a UD sentence |
364 | 411 |
} |
365 |
|
|
412 |
|
|
366 | 413 |
int max = -1; |
367 | 414 |
int imax = 0; |
368 | 415 |
for (int ip = 0; ip < sentence.size(); ip++) { |
... | ... | |
372 | 419 |
imax = ip; |
373 | 420 |
} |
374 | 421 |
} |
375 |
|
|
422 |
|
|
376 | 423 |
ArrayList<Integer> newSentence = new ArrayList<>(); |
377 | 424 |
for (int ip = imax + 1; ip < sentence.size(); ip++) { |
378 | 425 |
newSentence.add(sentence.get(ip)); |
... | ... | |
388 | 435 |
sentences.add(s + 1, newSentence); |
389 | 436 |
} |
390 | 437 |
} |
391 |
|
|
438 |
|
|
392 | 439 |
if (tmpSentence.size() > 0) { // add last sentence |
393 | 440 |
sentences.add(new ArrayList<>(tmpSentence)); |
394 | 441 |
} |
395 |
|
|
442 |
|
|
396 | 443 |
// fixing sentence __NULL__ ud properties |
397 | 444 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
398 | 445 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
399 |
|
|
446 |
|
|
400 | 447 |
int[] sentencePositions = new int[sentence.size()]; |
401 | 448 |
for (int p = 0; p < sentence.size(); p++) { |
402 | 449 |
sentencePositions[p] = sentence.get(p); |
403 | 450 |
} |
404 |
|
|
451 |
|
|
405 | 452 |
// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" |
406 | 453 |
String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions); |
407 |
|
|
454 |
|
|
408 | 455 |
String[] words = null; |
409 | 456 |
if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) { |
410 | 457 |
words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions); |
... | ... | |
421 | 468 |
if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) { |
422 | 469 |
xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions); |
423 | 470 |
} |
424 |
|
|
471 |
|
|
425 | 472 |
String[] feats = null; |
426 | 473 |
if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) { |
427 | 474 |
feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions); |
... | ... | |
442 | 489 |
if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) { |
443 | 490 |
miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions); |
444 | 491 |
} |
445 |
|
|
492 |
|
|
446 | 493 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions); |
447 | 494 |
// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions); |
448 | 495 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
449 | 496 |
// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
450 |
|
|
497 |
|
|
451 | 498 |
// fix ud properties using CQP values |
452 | 499 |
for (int ip = 0; ip < sentence.size(); ip++) { |
453 |
|
|
500 |
|
|
454 | 501 |
int p = sentence.get(ip); |
455 |
|
|
502 |
|
|
456 | 503 |
// new word |
457 | 504 |
if (miscValues[p].equals("_")) { |
458 | 505 |
miscValues[p] = "XmlId=" + ids[ip]; |
459 | 506 |
} |
460 |
|
|
507 |
|
|
461 | 508 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
462 | 509 |
if (words != null && formValues[p].equals("_")) { |
463 | 510 |
formValues[p] = words[ip]; |
... | ... | |
487 | 534 |
miscValues[p] = miscs[ip]; |
488 | 535 |
} |
489 | 536 |
} |
490 |
|
|
537 |
|
|
491 | 538 |
if (insertNoSpaceAfter) { |
492 | 539 |
for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed |
493 | 540 |
int p = sentence.get(ip); |
... | ... | |
503 | 550 |
} |
504 | 551 |
} |
505 | 552 |
} |
506 |
|
|
553 |
|
|
507 | 554 |
// fixing sentence punct limits |
508 | 555 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) { |
509 | 556 |
// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence); |
... | ... | |
516 | 563 |
int p2 = sentence.remove(sentence.size() - 1); |
517 | 564 |
sentences.get(iSentence + 1).add(0, p2); |
518 | 565 |
} |
519 |
|
|
566 |
|
|
520 | 567 |
if (sentence.size() == 0) { // sentence was depleted after fixing it |
521 | 568 |
sentences.remove(iSentence); |
522 | 569 |
iSentence--; |
523 | 570 |
continue; |
524 | 571 |
} |
525 | 572 |
} |
526 |
|
|
573 |
|
|
527 | 574 |
for (int s = 0; s < sentences.size(); s++) { |
528 |
|
|
575 |
|
|
529 | 576 |
// fix only ud sentences limits |
530 | 577 |
ArrayList<Integer> sentence = sentences.get(s); |
531 | 578 |
HashMap<Integer, Integer> oldToNewIds = new HashMap<>(); |
532 | 579 |
for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids |
533 | 580 |
int p = sentence.get(ip); |
534 |
|
|
581 |
|
|
535 | 582 |
if (idValues[p] != 0) { // store "old id -> new id" |
536 | 583 |
oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N |
537 | 584 |
} |
538 | 585 |
} |
539 |
|
|
586 |
|
|
540 | 587 |
// fixing head and set missing head to 0 and root |
541 | 588 |
for (int ip = 0; ip < sentence.size(); ip++) { |
542 | 589 |
int p = sentence.get(ip); |
543 |
|
|
590 |
|
|
544 | 591 |
// fixing id value |
545 | 592 |
idValues[p] = (ip + 1); // from 1 to N |
546 |
|
|
593 |
|
|
547 | 594 |
// fixing head values |
548 | 595 |
if (oldToNewIds.containsKey(headValues[p])) { |
549 | 596 |
headValues[p] = oldToNewIds.get(headValues[p]); |
... | ... | |
555 | 602 |
} |
556 | 603 |
} |
557 | 604 |
} |
558 |
|
|
605 |
|
|
559 | 606 |
// writing sentences |
560 | 607 |
File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu"); |
561 | 608 |
PrintWriter writer = IOUtils.getWriter(resultConlluFile); |
562 |
|
|
609 |
|
|
563 | 610 |
int iParagraph = 1; |
564 |
|
|
611 |
|
|
565 | 612 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
566 | 613 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
567 |
|
|
614 |
|
|
568 | 615 |
int[] sentencePositions = new int[sentence.size()]; |
569 | 616 |
for (int p = 0; p < sentence.size(); p++) { |
570 | 617 |
sentencePositions[p] = sentence.get(p); |
571 | 618 |
} |
572 |
|
|
619 |
|
|
573 | 620 |
String[] gap = null; |
574 | 621 |
if (detectGap && mainCorpus.getProperty("gap") != null) { |
575 | 622 |
gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions); |
576 | 623 |
} |
577 |
|
|
624 |
|
|
578 | 625 |
String[] tokens = new String[sentence.size()]; |
579 | 626 |
for (int ip = 0; ip < sentence.size(); ip++) { |
580 | 627 |
tokens[ip] = formValues[sentence.get(ip)]; |
581 | 628 |
} |
582 |
|
|
629 |
|
|
583 | 630 |
if (insertNoSpaceAfter) { |
584 | 631 |
writer.println("# text = " + LangFormater.format(StringUtils.join(tokens, " "), mainCorpus.getLang())); |
585 | 632 |
} |
586 | 633 |
else { |
587 | 634 |
writer.println("# text = " + StringUtils.join(tokens, " ")); |
588 | 635 |
} |
589 |
|
|
636 |
|
|
590 | 637 |
if (newdocidStartPositions.containsKey(sentence.get(0))) { |
591 | 638 |
writer.println("# newdoc id = " + newdocidStartPositions.get(sentence.get(0))); |
592 | 639 |
} |
593 | 640 |
else { |
594 | 641 |
writer.println("# newdoc id = " + textIds[iText]); |
595 | 642 |
} |
596 |
|
|
643 |
|
|
597 | 644 |
boolean foundSentId = false; |
598 | 645 |
for (int ip : sentence) { |
599 | 646 |
if (!foundSentId && sentidStartPositions.containsKey(ip)) { |
... | ... | |
604 | 651 |
if (!foundSentId) { // no sent_id found |
605 | 652 |
writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new"); |
606 | 653 |
} |
607 |
|
|
654 |
|
|
608 | 655 |
if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set |
609 | 656 |
writer.println("# newpar id = " + iParagraph); |
610 | 657 |
iParagraph++; |
611 | 658 |
} |
612 |
|
|
659 |
|
|
613 | 660 |
for (int ip = 0; ip < sentence.size(); ip++) { |
614 | 661 |
int p = sentence.get(ip); |
615 |
|
|
662 |
|
|
616 | 663 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
617 | 664 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p] |
618 | 665 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p] |
619 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]); |
|
620 |
|
|
666 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]);
|
|
667 |
|
|
621 | 668 |
if (gap != null && gap[ip].equals("next")) { |
622 | 669 |
writer.println("# gap"); |
623 | 670 |
} |
... | ... | |
626 | 673 |
numberOfSentencesWritten++; |
627 | 674 |
} |
628 | 675 |
writer.close(); |
629 |
|
|
676 |
|
|
630 | 677 |
System.out.println(" Text done: " + resultConlluFile); |
631 | 678 |
numberOfTextsWritten++; |
632 | 679 |
} |
633 |
|
|
680 |
|
|
634 | 681 |
System.out.println("# words written: " + numberOfWordsWritten); |
635 | 682 |
System.out.println("# sentences written: " + numberOfSentencesWritten); |
636 | 683 |
System.out.println("# texts written: " + numberOfTextsWritten); |
637 |
|
|
684 |
|
|
638 | 685 |
return numberOfWordsWritten; |
639 | 686 |
} |
640 |
|
|
687 |
|
|
641 | 688 |
private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException { |
642 |
|
|
689 |
|
|
643 | 690 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions); |
644 | 691 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values |
645 | 692 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) { |
646 | 693 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1); |
647 | 694 |
} |
648 | 695 |
} |
649 |
|
|
696 |
|
|
650 | 697 |
return values; |
651 | 698 |
} |
652 |
|
|
699 |
|
|
653 | 700 |
private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException { |
654 |
|
|
701 |
|
|
702 |
|
|
703 |
|
|
655 | 704 |
HashMap<Integer, String> sentidStartPositions = new HashMap<>(); |
656 |
int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+"); |
|
657 |
String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids); |
|
658 |
for (int iId = 0; iId < ids.length; iId++) { |
|
659 |
int id = ids[iId]; |
|
660 |
int[] pp = CQPSearchEngine.getCqiClient().id2Cpos(mainCorpus.getProperty(property).getQualifiedName(), id); |
|
661 |
for (int p : pp) { |
|
662 |
sentidStartPositions.put(p, strs[iId]); |
|
705 |
if (mainCorpus.getProperty(property) != null) { |
|
706 |
int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+"); |
|
707 |
String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids); |
|
708 |
for (int iId = 0; iId < ids.length; iId++) { |
|
709 |
int id = ids[iId]; |
|
710 |
int[] pp = CQPSearchEngine.getCqiClient().id2Cpos(mainCorpus.getProperty(property).getQualifiedName(), id); |
|
711 |
for (int p : pp) { |
|
712 |
sentidStartPositions.put(p, strs[iId]); |
|
713 |
} |
|
663 | 714 |
} |
664 | 715 |
} |
665 |
|
|
666 | 716 |
return sentidStartPositions; |
667 | 717 |
} |
668 |
|
|
718 |
|
|
669 | 719 |
private static void fixUNDEFValues(String[] values) { |
670 |
|
|
720 |
|
|
671 | 721 |
for (int i = 0; i < values.length; i++) { |
672 | 722 |
if (values[i].equals("__UNDEF__") || values[i].equals("") || values[i].equals("|_|")) { |
673 | 723 |
values[i] = "_"; |
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ToCoNLL2009.java (revision 3731) | ||
---|---|---|
12 | 12 |
import org.txm.searchengine.cqp.corpus.CorpusManager; |
13 | 13 |
import org.txm.searchengine.cqp.corpus.Property; |
14 | 14 |
import org.txm.searchengine.cqp.corpus.StructuralUnit; |
15 |
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty; |
|
16 | 15 |
import org.txm.searchengine.cqp.corpus.WordProperty; |
17 | 16 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery; |
18 | 17 |
import org.txm.searchengine.cqp.corpus.query.Match; |
19 | 18 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
20 | 19 |
import org.txm.utils.ConsoleProgressBar; |
20 |
import org.txm.utils.logger.Log; |
|
21 | 21 |
|
22 |
import jline.internal.Log; |
|
23 |
|
|
24 | 22 |
public class ToCoNLL2009 { |
25 | 23 |
|
26 | 24 |
boolean debug = false; |
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsCoNLLU.java (revision 3731) | ||
---|---|---|
42 | 42 |
import org.eclipse.osgi.util.NLS; |
43 | 43 |
import org.eclipse.ui.handlers.HandlerUtil; |
44 | 44 |
import org.kohsuke.args4j.Option; |
45 |
import org.txm.conllu.core.function.ImportCoNLLUAnnotations; |
|
45 | 46 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
46 | 47 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
47 | 48 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
... | ... | |
63 | 64 |
* @author mdecorde. |
64 | 65 |
*/ |
65 | 66 |
public class ExportCorpusAsCoNLLU extends AbstractHandler { |
66 |
|
|
67 |
|
|
67 | 68 |
public static final String ID = ExportCorpusAsCoNLLU.class.getName(); |
68 |
|
|
69 |
|
|
69 | 70 |
@Option(name="outputDirectory", usage="an example file", widget="Folder", required=true, def="outputDirectory") |
70 | 71 |
File outputDirectory; |
71 |
|
|
72 |
|
|
72 | 73 |
@Option(name="encoding", usage="sentenceProperty", widget="String", required=true, def="UTF-8") |
73 | 74 |
String encoding = "UTF-8"; |
74 | 75 |
|
75 |
@Option(name="sentenceStructure", usage="sentenceProperty", widget="String", required=true, def="s")
|
|
76 |
@Option(name="sentenceStructure", usage="sentenceProperty", widget="String", required=true, def="s") |
|
76 | 77 |
String sentenceStructure; |
77 |
|
|
78 |
|
|
78 | 79 |
@Option(name="posProperty", usage="if set posProperty used to fill the UPOS ud property", widget="String", required=true, def="frpos") |
79 | 80 |
String posProperty; |
80 |
|
|
81 |
|
|
81 | 82 |
@Option(name="lemmaProperty", usage="if set lemmaProperty used to fill the LEMMA ud property", widget="String", required=true, def="frlemma") |
82 | 83 |
String lemmaProperty; |
83 |
|
|
84 |
|
|
84 | 85 |
/** |
85 | 86 |
* the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix |
86 | 87 |
*/ |
87 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
88 |
|
|
88 |
public static String[] propNames = ImportCoNLLUAnnotations.UD_PROPERTY_NAMES;
|
|
89 |
|
|
89 | 90 |
/* |
90 | 91 |
* (non-Javadoc) |
91 | 92 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
92 | 93 |
*/ |
93 | 94 |
@Override |
94 | 95 |
public Object execute(final ExecutionEvent event) throws ExecutionException { |
95 |
|
|
96 |
|
|
96 | 97 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
97 |
|
|
98 |
|
|
98 | 99 |
Object s = selection.getFirstElement(); |
99 | 100 |
if (!(s instanceof MainCorpus)) { |
100 | 101 |
Log.warning("Selection is not a corpus. Aborting."); |
101 | 102 |
return null; |
102 | 103 |
} |
103 |
|
|
104 |
|
|
104 | 105 |
if (!ParametersDialog.open(this)) { |
105 | 106 |
return null; |
106 | 107 |
} |
107 |
|
|
108 |
|
|
108 | 109 |
outputDirectory.mkdirs(); |
109 | 110 |
if (outputDirectory == null || !outputDirectory.exists() || !outputDirectory.isDirectory()) { |
110 | 111 |
Log.warning("Error: conllu result directory does not exists: " + outputDirectory); |
111 | 112 |
return null; |
112 | 113 |
} |
113 |
|
|
114 |
|
|
114 | 115 |
CQPCorpus corpus = (CQPCorpus) s; |
115 | 116 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
116 |
|
|
117 |
|
|
117 | 118 |
return exportCorpus(mainCorpus, outputDirectory, sentenceStructure, lemmaProperty, posProperty, encoding); |
118 |
|
|
119 |
|
|
119 | 120 |
} |
120 |
|
|
121 |
|
|
121 | 122 |
public static boolean exportCorpus(MainCorpus mainCorpus, File outputDirectory, String sentenceStructure, String lemmaProperty, String posProperty, String encoding) { |
122 | 123 |
try { |
123 | 124 |
return new ToCoNLL2009().process(outputDirectory, mainCorpus, mainCorpus.getStructuralUnit(sentenceStructure), mainCorpus.getProperty("word"), mainCorpus.getProperty(lemmaProperty), mainCorpus.getProperty(posProperty), encoding); |
124 |
|
|
125 |
|
|
125 | 126 |
} catch (Exception e) { |
126 | 127 |
Log.warning(e); |
127 | 128 |
Log.printStackTrace(e); |
... | ... | |
156 | 157 |
String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName, |
157 | 158 |
String defaultMiscPropertyName, |
158 | 159 |
boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) |
159 |
throws UnexpectedAnswerException, |
|
160 |
IOException, |
|
161 |
CqiServerError, |
|
162 |
CqiClientException, InvalidCqpIdException { |
|
163 |
|
|
160 |
throws UnexpectedAnswerException,
|
|
161 |
IOException,
|
|
162 |
CqiServerError,
|
|
163 |
CqiClientException, InvalidCqpIdException {
|
|
164 |
|
|
164 | 165 |
if (!conlluResultDirectory.exists()) { |
165 | 166 |
conlluResultDirectory.mkdirs(); |
166 | 167 |
} |
167 | 168 |
int numberOfWordsWritten = 0; |
168 | 169 |
int numberOfSentencesWritten = 0; |
169 | 170 |
int numberOfTextsWritten = 0; |
170 |
|
|
171 |
|
|
171 | 172 |
String[] textIds = mainCorpus.getCorpusTextIdsList(); |
172 | 173 |
int[] start_limits = mainCorpus.getTextStartLimits(); |
173 | 174 |
int[] end_limits = mainCorpus.getTextEndLimits(); |
174 |
|
|
175 |
|
|
175 | 176 |
String lang = mainCorpus.getLang(); |
176 | 177 |
// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang())); |
177 | 178 |
// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang())); |
178 |
|
|
179 |
|
|
179 | 180 |
for (String p : propNames) { |
180 | 181 |
WordProperty wp = mainCorpus.getProperty(prefix + p); |
181 | 182 |
if (wp == null) { |
182 |
Log.warning("Error: cannot find the Conllu property: " + prefix + p);
|
|
183 |
Log.warning("Error: cannot find the CoNLLU property: " + prefix + p);
|
|
183 | 184 |
return 0; |
184 | 185 |
} |
185 | 186 |
} |
186 |
|
|
187 |
|
|
187 | 188 |
if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) { |
188 | 189 |
Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true"); |
189 | 190 |
return 0; |
190 | 191 |
} |
191 |
|
|
192 |
|
|
192 | 193 |
for (int iText = 0; iText < start_limits.length; iText++) { |
193 |
|
|
194 |
|
|
194 | 195 |
// Build corpus positions |
195 | 196 |
int[] positions = new int[end_limits[iText] - start_limits[iText] + 1]; |
196 | 197 |
int tmp = 0; |
... | ... | |
198 | 199 |
positions[tmp++] = n; |
199 | 200 |
} |
200 | 201 |
numberOfWordsWritten += positions.length; |
201 |
|
|
202 |
|
|
202 | 203 |
// Get UD properties |
203 | 204 |
WordProperty wp; |
204 | 205 |
wp = mainCorpus.getProperty(prefix + "id"); |
... | ... | |
213 | 214 |
} |
214 | 215 |
} |
215 | 216 |
tmpValues = null; |
216 |
|
|
217 |
|
|
217 | 218 |
WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form"); |
218 | 219 |
String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions); |
219 | 220 |
fixUNDEFValues(formValues); |
220 |
|
|
221 |
|
|
221 | 222 |
wp = mainCorpus.getProperty(prefix + "lemma"); |
222 | 223 |
String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
223 | 224 |
fixUNDEFValues(lemmaValues); |
224 |
|
|
225 |
|
|
225 | 226 |
wp = mainCorpus.getProperty(prefix + "upos"); |
226 | 227 |
String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
227 | 228 |
fixUNDEFValues(uposValues); |
228 |
|
|
229 |
|
|
229 | 230 |
wp = mainCorpus.getProperty(prefix + "xpos"); |
230 | 231 |
String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
231 | 232 |
fixUNDEFValues(xposValues); |
232 |
|
|
233 |
|
|
233 | 234 |
wp = mainCorpus.getProperty(prefix + "feats"); |
234 | 235 |
String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
235 | 236 |
fixUNDEFValues(featsValues); |
236 |
|
|
237 |
|
|
237 | 238 |
wp = mainCorpus.getProperty(prefix + "head"); |
238 | 239 |
// String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
239 | 240 |
tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
... | ... | |
247 | 248 |
} |
248 | 249 |
} |
249 | 250 |
tmpValues = null; |
250 |
|
|
251 |
|
|
251 | 252 |
wp = mainCorpus.getProperty(prefix + "deprel"); |
252 | 253 |
String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
253 | 254 |
fixUNDEFValues(deprelValues); |
254 |
|
|
255 |
|
|
255 | 256 |
wp = mainCorpus.getProperty(prefix + "deps"); |
256 | 257 |
String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
257 | 258 |
fixUNDEFValues(depsValues); |
258 |
|
|
259 |
|
|
259 | 260 |
wp = mainCorpus.getProperty(prefix + "misc"); |
260 | 261 |
String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
261 | 262 |
fixUNDEFValues(miscValues); |
262 |
|
|
263 |
|
|
263 | 264 |
HashSet<Integer> paragraphsStartPositions = new HashSet<>(); |
264 | 265 |
if (insertParagraphs) { |
265 | 266 |
StructuralUnit p_struct = mainCorpus.getStructuralUnit("p"); |
... | ... | |
273 | 274 |
} |
274 | 275 |
} |
275 | 276 |
} |
276 |
|
|
277 |
|
|
277 | 278 |
HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"sentid"); |
278 | 279 |
HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"newdocid"); |
279 |
|
|
280 |
|
|
280 | 281 |
// build sentence, first pass using UD word sentence positions |
281 | 282 |
ArrayList<ArrayList<Integer>> sentences = new ArrayList<>(); |
282 | 283 |
ArrayList<Integer> tmpSentence = new ArrayList<>(); |
283 | 284 |
for (int p = 0; p < positions.length; p++) { |
284 |
// System.out.println("p=" + p + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " feats=" |
|
285 |
// + featsValues[p] + " head=" |
|
286 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
|
287 | 285 |
if (sentidStartPositions.containsKey(p)) { // new ud sentence |
288 |
|
|
286 |
|
|
289 | 287 |
if (tmpSentence.size() > 0) { |
290 | 288 |
sentences.add(new ArrayList<>(tmpSentence)); |
291 | 289 |
} |
292 |
|
|
293 |
// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " |
|
294 |
// feats=" |
|
295 |
// + featsValues[p] + " head=" |
|
296 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
|
290 |
|
|
297 | 291 |
tmpSentence.clear(); |
298 | 292 |
} |
299 |
|
|
293 |
|
|
300 | 294 |
if (insertTokenWithoutUdAnnotations) { |
301 | 295 |
tmpSentence.add(p); // insert all tokens |
302 | 296 |
} |
303 | 297 |
else if (idValues[p] != 0) { |
304 | 298 |
tmpSentence.add(p); // insert all tokens |
305 | 299 |
} |
306 |
|
|
300 |
|
|
307 | 301 |
} |
308 | 302 |
positions = null; // free memory |
309 |
|
|
303 |
|
|
310 | 304 |
// fixing sentences |
311 | 305 |
for (int s = 0; s < sentences.size(); s++) { |
312 |
|
|
306 |
|
|
313 | 307 |
// fix only ud sentences limits |
314 | 308 |
ArrayList<Integer> sentence = sentences.get(s); |
315 |
|
|
309 |
|
|
316 | 310 |
if (sentidStartPositions.get(sentence.get(0)) == null) { |
317 | 311 |
continue; // this is not a UD sentence |
318 | 312 |
} |
319 |
|
|
313 |
|
|
320 | 314 |
int max = -1; |
321 | 315 |
int imax = 0; |
322 | 316 |
for (int ip = 0; ip < sentence.size(); ip++) { |
... | ... | |
326 | 320 |
imax = ip; |
327 | 321 |
} |
328 | 322 |
} |
329 |
|
|
323 |
|
|
330 | 324 |
ArrayList<Integer> newSentence = new ArrayList<>(); |
331 | 325 |
for (int ip = imax + 1; ip < sentence.size(); ip++) { |
332 | 326 |
newSentence.add(sentence.get(ip)); |
... | ... | |
342 | 336 |
sentences.add(s + 1, newSentence); |
343 | 337 |
} |
344 | 338 |
} |
345 |
|
|
339 |
|
|
346 | 340 |
if (tmpSentence.size() > 0) { // add last sentence |
347 | 341 |
sentences.add(new ArrayList<>(tmpSentence)); |
348 | 342 |
} |
349 |
|
|
343 |
|
|
350 | 344 |
// fixing sentence __NULL__ ud properties |
351 | 345 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
352 | 346 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
353 |
|
|
347 |
|
|
354 | 348 |
int[] sentencePositions = new int[sentence.size()]; |
355 | 349 |
for (int p = 0; p < sentence.size(); p++) { |
356 | 350 |
sentencePositions[p] = sentence.get(p); |
357 | 351 |
} |
358 |
|
|
352 |
|
|
359 | 353 |
// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" |
360 | 354 |
String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions); |
361 |
|
|
355 |
|
|
362 | 356 |
String[] words = null; |
363 | 357 |
if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) { |
364 | 358 |
words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions); |
... | ... | |
375 | 369 |
if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) { |
376 | 370 |
xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions); |
377 | 371 |
} |
378 |
|
|
372 |
|
|
379 | 373 |
String[] feats = null; |
380 | 374 |
if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) { |
381 | 375 |
feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions); |
... | ... | |
396 | 390 |
if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) { |
397 | 391 |
miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions); |
398 | 392 |
} |
399 |
|
|
393 |
|
|
400 | 394 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions); |
401 | 395 |
// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions); |
402 | 396 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
403 | 397 |
// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
404 |
|
|
398 |
|
|
405 | 399 |
// fix ud properties using CQP values |
406 | 400 |
for (int ip = 0; ip < sentence.size(); ip++) { |
407 |
|
|
401 |
|
|
408 | 402 |
int p = sentence.get(ip); |
409 |
|
|
403 |
|
|
410 | 404 |
// new word |
411 | 405 |
if (miscValues[p].equals("_")) { |
412 | 406 |
miscValues[p] = "XmlId=" + ids[ip]; |
413 | 407 |
} |
414 |
|
|
408 |
|
|
415 | 409 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
416 | 410 |
if (words != null && formValues[p].equals("_")) { |
417 | 411 |
formValues[p] = words[ip]; |
... | ... | |
441 | 435 |
miscValues[p] = miscs[ip]; |
442 | 436 |
} |
443 | 437 |
} |
444 |
|
|
438 |
|
|
445 | 439 |
if (insertNoSpaceAfter) { |
446 | 440 |
for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed |
447 | 441 |
int p = sentence.get(ip); |
... | ... | |
457 | 451 |
} |
458 | 452 |
} |
459 | 453 |
} |
460 |
|
|
454 |
|
|
461 | 455 |
// fixing sentence punct limits |
462 | 456 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) { |
463 | 457 |
// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence); |
... | ... | |
470 | 464 |
int p2 = sentence.remove(sentence.size() - 1); |
471 | 465 |
sentences.get(iSentence + 1).add(0, p2); |
472 | 466 |
} |
473 |
|
|
467 |
|
|
474 | 468 |
if (sentence.size() == 0) { // sentence was depleted after fixing it |
475 | 469 |
sentences.remove(iSentence); |
476 | 470 |
iSentence--; |
477 | 471 |
continue; |
478 | 472 |
} |
479 | 473 |
} |
480 |
|
|
474 |
|
|
481 | 475 |
for (int s = 0; s < sentences.size(); s++) { |
482 |
|
|
476 |
|
|
483 | 477 |
// fix only ud sentences limits |
484 | 478 |
ArrayList<Integer> sentence = sentences.get(s); |
485 | 479 |
HashMap<Integer, Integer> oldToNewIds = new HashMap<>(); |
486 | 480 |
for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids |
487 | 481 |
int p = sentence.get(ip); |
488 |
|
|
482 |
|
|
489 | 483 |
if (idValues[p] != 0) { // store "old id -> new id" |
490 | 484 |
oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N |
491 | 485 |
} |
492 | 486 |
} |
493 |
|
|
487 |
|
|
494 | 488 |
// fixing head and set missing head to 0 and root |
495 | 489 |
for (int ip = 0; ip < sentence.size(); ip++) { |
496 | 490 |
int p = sentence.get(ip); |
497 |
|
|
491 |
|
|
498 | 492 |
// fixing id value |
499 | 493 |
idValues[p] = (ip + 1); // from 1 to N |
500 |
|
|
494 |
|
|
501 | 495 |
// fixing head values |
502 | 496 |
if (oldToNewIds.containsKey(headValues[p])) { |
503 | 497 |
headValues[p] = oldToNewIds.get(headValues[p]); |
... | ... | |
509 | 503 |
} |
510 | 504 |
} |
511 | 505 |
} |
512 |
|
|
506 |
|
|
513 | 507 |
// writing sentences |
514 | 508 |
File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu"); |
515 | 509 |
PrintWriter writer = IOUtils.getWriter(resultConlluFile); |
516 |
|
|
510 |
|
|
517 | 511 |
int iParagraph = 1; |
518 |
|
|
512 |
|
|
519 | 513 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
520 | 514 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
521 |
|
|
515 |
|
|
522 | 516 |
int[] sentencePositions = new int[sentence.size()]; |
523 | 517 |
for (int p = 0; p < sentence.size(); p++) { |
524 | 518 |
sentencePositions[p] = sentence.get(p); |
525 | 519 |
} |
526 |
|
|
520 |
|
|
527 | 521 |
String[] gap = null; |
528 | 522 |
if (detectGap && mainCorpus.getProperty("gap") != null) { |
529 | 523 |
gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions); |
530 | 524 |
} |
531 |
|
|
525 |
|
|
532 | 526 |
String[] tokens = new String[sentence.size()]; |
533 | 527 |
for (int ip = 0; ip < sentence.size(); ip++) { |
534 | 528 |
tokens[ip] = formValues[sentence.get(ip)]; |
535 | 529 |
} |
536 |
|
|
530 |
|
|
537 | 531 |
if (insertNoSpaceAfter) { |
538 | 532 |
writer.println("# text = " + LangFormater.format(StringUtils.join(tokens, " "), mainCorpus.getLang())); |
539 | 533 |
} |
540 | 534 |
else { |
541 | 535 |
writer.println("# text = " + StringUtils.join(tokens, " ")); |
542 | 536 |
} |
543 |
|
|
537 |
|
|
544 | 538 |
if (newdocidStartPositions.containsKey(sentence.get(0))) { |
545 | 539 |
writer.println("# newdoc id = " + newdocidStartPositions.get(sentence.get(0))); |
546 | 540 |
} |
547 | 541 |
else { |
548 | 542 |
writer.println("# newdoc id = " + textIds[iText]); |
549 | 543 |
} |
550 |
|
|
544 |
|
|
551 | 545 |
boolean foundSentId = false; |
552 | 546 |
for (int ip : sentence) { |
553 | 547 |
if (!foundSentId && sentidStartPositions.containsKey(ip)) { |
... | ... | |
558 | 552 |
if (!foundSentId) { // no sent_id found |
559 | 553 |
writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new"); |
560 | 554 |
} |
561 |
|
|
555 |
|
|
562 | 556 |
if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set |
563 | 557 |
writer.println("# newpar id = " + iParagraph); |
564 | 558 |
iParagraph++; |
565 | 559 |
} |
566 |
|
|
560 |
|
|
567 | 561 |
for (int ip = 0; ip < sentence.size(); ip++) { |
568 | 562 |
int p = sentence.get(ip); |
569 |
|
|
563 |
|
|
570 | 564 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
571 | 565 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p] |
572 | 566 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p] |
573 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]); |
|
574 |
|
|
567 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]);
|
|
568 |
|
|
575 | 569 |
if (gap != null && gap[ip].equals("next")) { |
576 | 570 |
writer.println("# gap"); |
577 | 571 |
} |
... | ... | |
580 | 574 |
numberOfSentencesWritten++; |
581 | 575 |
} |
582 | 576 |
writer.close(); |
583 |
|
|
577 |
|
|
584 | 578 |
System.out.println(" Text done: " + resultConlluFile); |
585 | 579 |
numberOfTextsWritten++; |
586 | 580 |
} |
587 |
|
|
581 |
|
|
588 | 582 |
System.out.println("# words written: " + numberOfWordsWritten); |
589 | 583 |
System.out.println("# sentences written: " + numberOfSentencesWritten); |
590 | 584 |
System.out.println("# texts written: " + numberOfTextsWritten); |
591 |
|
|
585 |
|
|
592 | 586 |
return numberOfWordsWritten; |
593 | 587 |
} |
594 |
|
|
588 |
|
|
595 | 589 |
private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException { |
596 |
|
|
597 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
|
|
598 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
|
|
599 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
|
|
600 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1);
|
|
590 |
|
|
591 |
if (mainCorpus.getProperty(property) != null) {
|
|
592 |
String[] emptyvalues = new String[positions.length];
|
|
593 |
for (int i = 0 ; i < emptyvalues.length ; i++) {
|
|
594 |
emptyvalues[i] = "_";
|
|
601 | 595 |
} |
596 |
return emptyvalues; |
|
597 |
} else { |
|
598 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions); |
|
599 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values |
|
600 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) { |
|
601 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1); |
|
602 |
} |
|
603 |
} |
|
604 |
|
|
605 |
return values; |
|
602 | 606 |
} |
603 |
|
|
604 |
return values; |
|
605 | 607 |
} |
606 |
|
|
608 |
|
|
607 | 609 |
private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException { |
608 |
|
|
610 |
|
|
609 | 611 |
HashMap<Integer, String> sentidStartPositions = new HashMap<>(); |
610 | 612 |
int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+"); |
611 | 613 |
String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids); |
... | ... | |
616 | 618 |
sentidStartPositions.put(p, strs[iId]); |
617 | 619 |
} |
618 | 620 |
} |
619 |
|
|
621 |
|
|
620 | 622 |
return sentidStartPositions; |
621 | 623 |
} |
622 |
|
|
624 |
|
|
623 | 625 |
private static void fixUNDEFValues(String[] values) { |
624 |
|
|
626 |
|
|
625 | 627 |
for (int i = 0; i < values.length; i++) { |
626 | 628 |
if (values[i].equals("__UNDEF__") || values[i].equals("") || values[i].equals("|_|")) { |
627 | 629 |
values[i] = "_"; |
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ImportCoNLLUAnnotationsFromDirectory.java (revision 3731) | ||
---|---|---|
189 | 189 |
return 0; |
190 | 190 |
} |
191 | 191 |
|
192 |
mainCorpus.getProject().appendToHistory("CoNLL-U annotations imported from "+conlluDirectory+" : "+nTextProcessed+" texts and "+nWordsInserted+" words processed."); |
|
193 |
|
|
192 | 194 |
Log.info("XML-TXM source files updated. Updating indexes..."); |
193 | 195 |
|
194 | 196 |
UDPreferences.getInstance().setProjectPreferenceValue(mainCorpus.getProject(), UDPreferences.UDPREFIX, propertiesPrefix); |
... | ... | |
226 | 228 |
return 0; |
227 | 229 |
} |
228 | 230 |
|
231 |
mainCorpus.getProject().appendToHistory("CoNLL-U annotations imported from "+conlluFile+" texts and "+nWordsInserted+" words processed."); |
|
232 |
|
|
229 | 233 |
Log.info("XML-TXM source files updated. Updating indexes..."); |
230 | 234 |
|
231 | 235 |
UDPreferences.getInstance().setProjectPreferenceValue(mainCorpus.getProject(), UDPreferences.UDPREFIX, propertiesPrefix); |
TXM/trunk/bundles/org.txm.annotation.kr.rcp/src/org/txm/annotation/kr/rcp/commands/ImportTable.java (revision 3731) | ||
---|---|---|
130 | 130 |
} |
131 | 131 |
monitor.worked(30); |
132 | 132 |
|
133 |
corpus.getProject().appendToHistory("CQP Annotations imported from "+annotationsFile); |
|
133 | 134 |
Log.info("Done."); |
Formats disponibles : Unified diff