Révision 3540
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/misc/TXM2CoNLL2009Macro.groovy (revision 3540) | ||
---|---|---|
1 |
package org.txm.macro.misc |
|
2 |
|
|
3 |
import org.txm.scripts.export.conll2009.ToCoNLL2009 |
|
4 |
|
|
5 |
import java.io.File; |
|
6 |
import org.txm.searchengine.cqp.corpus.* |
|
7 |
import org.kohsuke.args4j.* |
|
8 |
import groovy.transform.Field |
|
9 |
import org.txm.rcp.swt.widget.parameters.* |
|
10 |
|
|
11 |
// Macro body: exports the selected CQP corpus (or sub-corpus) with ToCoNLL2009.
// The user chooses the output file, its encoding and the sentence/pos/lemma
// properties through the parameters dialog.

// The macro needs a CQPCorpus selection to work on.
if (!(corpusViewSelection instanceof CQPCorpus)) {
	println "Select a corpus or a sub-corpus"
	return;
}
def corpus = corpusViewSelection

@Field @Option(name="outputFile", usage="the result file", widget="File", required=true, def="conn_export.tsv")
File outputFile = new File("conn_export.tsv")
@Field @Option(name="encoding", usage="the result file encoding", widget="String", required=true, def="UTF-8")
String encoding = "UTF-8"

@Field @Option(name="sentenceProperty", usage="the sentence structure property, written <structure>_<property>", widget="String", required=true, def="s_id")
String sentenceProperty = "s_id"
// NOTE(review): the 'def' values below ("pos", "lemma") differ from the field
// initial values ("frpos", "frlemma") - confirm which default is intended.
@Field @Option(name="posProperty", usage="the word property holding the part of speech", widget="String", required=true, def="pos")
String posProperty = "frpos"
@Field @Option(name="lemmaProperty", usage="the word property holding the lemma", widget="String", required=true, def="lemma")
String lemmaProperty = "frlemma"

// Stop silently when the user cancels the dialog.
if (!ParametersDialog.open(this)) return;

outputFile = outputFile.getAbsoluteFile()

// sentenceProperty is "<structure>_<property>": split on the first underscore only.
def parts = sentenceProperty.split("_", 2)
if (parts.length < 2) { println "Error sentence property: $sentenceProperty"; return }

// The structure lookup may return null, so do not chain getProperty() on it directly.
def sentenceStructure = corpus.getStructuralUnit(parts[0])
StructuralUnitProperty s = (sentenceStructure == null) ? null : sentenceStructure.getProperty(parts[1])
Property word = corpus.getProperty("word")
Property lemma = corpus.getProperty(lemmaProperty)
Property pos = corpus.getProperty(posProperty)

if (s == null) { println "Error sentence property: $sentenceProperty"; return}
if (word == null) { println "Error no word property"; return}
if (pos == null) { println "Error pos property: $posProperty"; return}
if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}

ToCoNLL2009 processor = new ToCoNLL2009()
processor.process(outputFile, corpus, s, word, lemma, pos, encoding)

println "Done: $outputFile"
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/export/conll2009/ToCoNLL2009.groovy (revision 3540) | ||
---|---|---|
14 | 14 |
|
15 | 15 |
public class ToCoNLL2009 { |
16 | 16 |
|
17 |
boolean debug = true
|
|
17 |
boolean debug = false
|
|
18 | 18 |
/** |
19 | 19 |
* To CoNLL2009. |
20 | 20 |
* |
... | ... | |
80 | 80 |
writer.write("# newdoc id=$text\n"); |
81 | 81 |
writer.write("# sent_id=$match\n"); |
82 | 82 |
|
83 |
id = 0 // first position in sentence
|
|
84 |
for (i = match.start ; i < match.end ; i++) { // loop n times, n = mathc length |
|
83 |
id = 1 // first position in sentence
|
|
84 |
for (i = match.start ; i <= match.end ; i++) { // loop n times, n = mathc length
|
|
85 | 85 |
|
86 | 86 |
if (lemme != null) |
87 | 87 |
lemmestr = lemmes.get(iposition); |
... | ... | |
94 | 94 |
posstr = "_"; |
95 | 95 |
|
96 | 96 |
//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n"); |
97 |
writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\tXmlID="+idsList.get(iposition)+"\n");
|
|
97 |
writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\tXmlID="+idsList.get(iposition)+"\n"); |
|
98 | 98 |
|
99 | 99 |
id++; // next word |
100 | 100 |
iposition++ // nextposition |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 3540) | ||
---|---|---|
9 | 9 |
|
10 | 10 |
import org.txm.scripts.importer.StaxStackWriter; |
11 | 11 |
import org.eclipse.ui.part.PageSwitcher |
12 |
import org.txm.core.preferences.TBXPreferences |
|
12 | 13 |
import org.txm.importer.xtz.* |
13 | 14 |
|
14 | 15 |
public class XTZDefaultPagerStep { |
... | ... | |
659 | 660 |
endOfLastWord = lastword.subSequence(l-1, l) |
660 | 661 |
} |
661 | 662 |
|
662 |
String interpvalue = "- "+anaValues.entrySet().join("\n- ")+"\n- "+wordid |
|
663 |
|
|
663 |
// Build the word tooltip content: either every analysis value ("*") or only
// the properties listed (comma separated) in the default edition definition.
String interpvalue
def tooltipProperties = pager.project.getEditionDefinition("default").get(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, "*")
if (tooltipProperties.equals("*")) {
	interpvalue = "- "+anaValues.entrySet().join("\n- ")+"\n- "+wordid
} else {
	// Accumulate in a buffer: appending to a null String with += would
	// prefix the tooltip with the literal text "null".
	StringBuilder tooltipBuffer = new StringBuilder()
	for (String p : tooltipProperties.split(",")) {
		tooltipBuffer.append("- ").append(p).append("=").append(anaValues.get(p)).append("\n")
	}
	tooltipBuffer.append("- ").append(wordid)
	interpvalue = tooltipBuffer.toString()
}
|
674 |
|
|
664 | 675 |
if (NoSpaceBefore.contains(wordvalue) || |
665 | 676 |
NoSpaceAfter.contains(lastword) || |
666 | 677 |
wordvalue.startsWith("-") || |
TXM/trunk/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/MainCorpus.java (revision 3540) | ||
---|---|---|
29 | 29 |
|
30 | 30 |
import java.io.File; |
31 | 31 |
import java.io.IOException; |
32 |
import java.text.NumberFormat; |
|
32 | 33 |
import java.util.ArrayList; |
33 | 34 |
import java.util.Arrays; |
34 | 35 |
import java.util.HashMap; |
... | ... | |
368 | 369 |
@Override |
369 | 370 |
public String getDetails() { |
370 | 371 |
try { |
371 |
return this.getName() + " T=" + this.getSize();
|
|
372 |
return this.getName() + " T=" + NumberFormat.getInstance().format(this.getSize());
|
|
372 | 373 |
} |
373 | 374 |
catch (CqiClientException e) { |
374 | 375 |
return "corpus not ready."; |
TXM/trunk/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/Subcorpus.java (revision 3540) | ||
---|---|---|
29 | 29 |
|
30 | 30 |
import java.io.File; |
31 | 31 |
import java.io.IOException; |
32 |
import java.text.NumberFormat; |
|
32 | 33 |
import java.util.ArrayList; |
33 | 34 |
import java.util.Arrays; |
34 | 35 |
import java.util.HashMap; |
... | ... | |
765 | 766 |
return textids; |
766 | 767 |
} |
767 | 768 |
|
768 |
|
|
769 | 769 |
@Override |
770 | 770 |
public String getName() { |
771 | 771 |
return this.getCorpusParent().getSimpleName() + TXMPreferences.PARENT_NAME_SEPARATOR + this.getSimpleName(); |
... | ... | |
775 | 775 |
public String getDetails() { |
776 | 776 |
try { |
777 | 777 |
if (hasBeenComputedOnce()) { |
778 |
return "T " + this.getSize();
|
|
778 |
return "CQP ID "+this.pID+" T " + NumberFormat.getInstance().format(this.getSize());
|
|
779 | 779 |
} else { |
780 | 780 |
return this.getName(); |
781 | 781 |
} |
TXM/trunk/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/EditionSection.java (revision 3540) | ||
---|---|---|
19 | 19 |
import org.eclipse.ui.forms.widgets.TableWrapData; |
20 | 20 |
import org.eclipse.ui.forms.widgets.TableWrapLayout; |
21 | 21 |
import org.txm.Toolbox; |
22 |
import org.txm.core.preferences.TBXPreferences; |
|
23 |
import org.txm.core.preferences.TXMPreferences; |
|
22 | 24 |
import org.txm.objects.EditionDefinition; |
23 | 25 |
import org.txm.objects.Project; |
24 | 26 |
import org.txm.rcp.editors.imports.ImportFormEditor; |
... | ... | |
51 | 53 |
private Text defaultEditions; |
52 | 54 |
|
53 | 55 |
private Combo unmanagedElementsPolicyCombo; |
56 |
|
|
57 |
private Text defaultEditionTooltipPropertiesText; |
|
54 | 58 |
|
55 | 59 |
/** |
56 | 60 |
* |
... | ... | |
220 | 224 |
gdata = getTextGridData(); |
221 | 225 |
gdata.colspan = 2; |
222 | 226 |
defaultEditions.setLayoutData(gdata); |
227 |
|
|
228 |
tmpLabel = toolkit.createLabel(sectionClient, "Tooltip properties"); |
|
229 |
tmpLabel.setToolTipText("comma separated list (comma*=all)"); |
|
230 |
gdata = getLabelGridData(); |
|
231 |
gdata.indent = 20; |
|
232 |
tmpLabel.setLayoutData(gdata); |
|
233 |
|
|
234 |
defaultEditionTooltipPropertiesText = toolkit.createText(sectionClient, "", SWT.BORDER); //$NON-NLS-1$ |
|
235 |
gdata = getTextGridData(); |
|
236 |
gdata.colspan = 2; |
|
237 |
defaultEditionTooltipPropertiesText.setLayoutData(gdata); |
|
223 | 238 |
} |
224 | 239 |
|
225 | 240 |
//// First draft of interface to add editions to a corpus |
... | ... | |
323 | 338 |
} |
324 | 339 |
} |
325 | 340 |
|
341 |
if (defaultEditionTooltipPropertiesText != null) { |
|
342 |
defaultEditionTooltipPropertiesText.setText(project.getEditionDefinition("default").get(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, "*").replace(" ", "")); |
|
343 |
} |
|
344 |
|
|
326 | 345 |
EditionDefinition edition_params = project.getEditionDefinition("facs"); //$NON-NLS-1$ |
327 | 346 |
boolean bfe = edition_params.getBuildEdition(); |
328 | 347 |
if (buildFacsEditionCheckButton != null && edition_params != null) { |
... | ... | |
376 | 395 |
project.setTextualPlan("UnmanagedElementsPolicy", unmanagedElementsPolicyCombo.getText()); |
377 | 396 |
} |
378 | 397 |
|
398 |
if (defaultEditionTooltipPropertiesText != null) { |
|
399 |
project.getEditionDefinition("default").set(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, defaultEditionTooltipPropertiesText.getText().replace(" ", "")); |
|
400 |
} |
|
401 |
|
|
379 | 402 |
if (buildFacsEditionCheckButton != null) { |
380 | 403 |
|
381 | 404 |
String images_directory = ""; //$NON-NLS-1$ |
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsCoNLLU.java (revision 3540) | ||
---|---|---|
66 | 66 |
|
67 | 67 |
public static final String ID = ExportCorpusAsCoNLLU.class.getName(); |
68 | 68 |
|
69 |
@Option(name = "conlluResultDirectory", usage = "conlluResultDirectory", widget = "Folder", required = true, def = "conllu-result-directory")
|
|
70 |
File conlluResultDirectory;
|
|
69 |
@Option(name="outputDirectory", usage="an example file", widget="Folder", required=true, def="outputDirectory")
|
|
70 |
File outputDirectory;
|
|
71 | 71 |
|
72 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
|
73 |
String propertiesPrefix; |
|
72 |
@Option(name="encoding", usage="sentenceProperty", widget="String", required=true, def="UTF-8") |
|
73 |
String encoding = "UTF-8"; |
|
74 |
|
|
75 |
@Option(name="sentenceStructure", usage="sentenceProperty", widget="String", required=true, def="s") |
|
76 |
String sentenceStructure; |
|
74 | 77 |
|
75 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties")
|
|
76 |
Boolean separator = false;
|
|
78 |
@Option(name="posProperty", usage="if set posProperty used to fill the UPOS ud property", widget="String", required=true, def="frpos")
|
|
79 |
String posProperty;
|
|
77 | 80 |
|
78 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the CoNLLU corpus", widget = "Boolean", required = true, def = "true")
|
|
79 |
Boolean insertParagraphs = false;
|
|
81 |
@Option(name="lemmaProperty", usage="if set lemmaProperty used to fill the LEMMA ud property", widget="String", required=true, def="frlemma")
|
|
82 |
String lemmaProperty;
|
|
80 | 83 |
|
81 |
@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true") |
|
82 |
Boolean detectGap = false; |
|
83 |
|
|
84 |
@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options") |
|
85 |
Boolean separator3 = false; |
|
86 |
|
|
87 |
@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the CoNLLU corpus", widget = "Boolean", required = true, def = "true") |
|
88 |
Boolean insertNoSpaceAfter = true; |
|
89 |
|
|
90 |
@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false") |
|
91 |
Boolean insertTokenWithoutUdAnnotations; |
|
92 |
|
|
93 |
// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
94 |
@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties") |
|
95 |
Boolean separator_properties = false; |
|
96 |
|
|
97 |
@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "") |
|
98 |
String defaultFormPropertyName; |
|
99 |
|
|
100 |
@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "") |
|
101 |
String defaultLemmaPropertyName; |
|
102 |
|
|
103 |
@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "") |
|
104 |
String defaultUposPropertyName; |
|
105 |
|
|
106 |
@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "") |
|
107 |
String defaultXposPropertyName; |
|
108 |
|
|
109 |
@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "") |
|
110 |
String defaultFeatsPropertyName; |
|
111 |
|
|
112 |
@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "") |
|
113 |
String defaultHeadPropertyName; |
|
114 |
|
|
115 |
@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "") |
|
116 |
String defaultDeprelPropertyName; |
|
117 |
|
|
118 |
@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "") |
|
119 |
String defaultDepsPropertyName; |
|
120 |
|
|
121 |
@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "") |
|
122 |
String defaultMiscPropertyName; |
|
123 |
|
|
124 |
@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options") |
|
125 |
Boolean separator2 = false; |
|
126 |
|
|
127 |
@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]") |
|
128 |
String openingPunct; |
|
129 |
|
|
130 | 84 |
/** |
131 | 85 |
* the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix |
132 | 86 |
*/ |
... | ... | |
151 | 105 |
return null; |
152 | 106 |
} |
153 | 107 |
|
154 |
conlluResultDirectory.mkdirs();
|
|
155 |
if (conlluResultDirectory == null || !conlluResultDirectory.exists() || !conlluResultDirectory.isDirectory()) {
|
|
156 |
Log.warning("Error: conllu result directory does not exists: " + conlluResultDirectory);
|
|
108 |
outputDirectory.mkdirs();
|
|
109 |
if (outputDirectory == null || !outputDirectory.exists() || !outputDirectory.isDirectory()) {
|
|
110 |
Log.warning("Error: conllu result directory does not exists: " + outputDirectory);
|
|
157 | 111 |
return null; |
158 | 112 |
} |
159 | 113 |
|
... | ... | |
161 | 115 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
162 | 116 |
|
163 | 117 |
try { |
164 |
return exportAnnotationsAsCorpus(mainCorpus, conlluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations, |
|
165 |
defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName, |
|
166 |
defaultFeatsPropertyName, defaultHeadPropertyName, defaultDeprelPropertyName, defaultDepsPropertyName, |
|
167 |
defaultMiscPropertyName, |
|
168 |
detectGap, insertParagraphs, insertNoSpaceAfter); |
|
169 |
} |
|
170 |
catch (Exception e) { |
|
118 |
new ToCoNLL2009().process(outputDirectory, mainCorpus, corpus.getStructuralUnit(sentenceStructure), corpus.getProperty("word"), corpus.getProperty(lemmaProperty), corpus.getProperty(posProperty), encoding); |
|
119 |
} catch (Exception e) { |
|
171 | 120 |
Log.warning(e); |
172 | 121 |
Log.printStackTrace(e); |
173 | 122 |
} |
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsFullCoNLLU.java (revision 3540) | ||
---|---|---|
1 |
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate:$ |
|
25 |
// $LastChangedRevision:$ |
|
26 |
// $LastChangedBy:$ |
|
27 |
// |
|
28 |
package org.txm.conllu.rcp.commands; |
|
29 |
|
|
30 |
import java.io.File; |
|
31 |
import java.io.IOException; |
|
32 |
import java.io.PrintWriter; |
|
33 |
import java.util.ArrayList; |
|
34 |
import java.util.HashMap; |
|
35 |
import java.util.HashSet; |
|
36 |
|
|
37 |
import org.apache.commons.lang.StringUtils; |
|
38 |
import org.eclipse.core.commands.AbstractHandler; |
|
39 |
import org.eclipse.core.commands.ExecutionEvent; |
|
40 |
import org.eclipse.core.commands.ExecutionException; |
|
41 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
42 |
import org.eclipse.osgi.util.NLS; |
|
43 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
44 |
import org.kohsuke.args4j.Option; |
|
45 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
46 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
|
47 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
|
48 |
import org.txm.searchengine.cqp.clientExceptions.InvalidCqpIdException; |
|
49 |
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException; |
|
50 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
|
51 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
52 |
import org.txm.searchengine.cqp.corpus.StructuralUnit; |
|
53 |
import org.txm.searchengine.cqp.corpus.WordProperty; |
|
54 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery; |
|
55 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
|
56 |
import org.txm.utils.i18n.LangFormater; |
|
57 |
import org.txm.utils.io.IOUtils; |
|
58 |
import org.txm.utils.logger.Log; |
|
59 |
|
|
60 |
/** |
|
61 |
* Export the conllu properties and CQP words into a conllu corpus of several files (one per text) |
|
62 |
* |
|
63 |
* @author mdecorde. |
|
64 |
*/ |
|
65 |
public class ExportCorpusAsFullCoNLLU extends AbstractHandler { |
|
66 |
|
|
67 |
public static final String ID = ExportCorpusAsFullCoNLLU.class.getName(); |
|
68 |
|
|
69 |
@Option(name = "conlluResultDirectory", usage = "conlluResultDirectory", widget = "Folder", required = true, def = "conllu-result-directory") |
|
70 |
File conlluResultDirectory; |
|
71 |
|
|
72 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
|
73 |
String propertiesPrefix; |
|
74 |
|
|
75 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties") |
|
76 |
Boolean separator = false; |
|
77 |
|
|
78 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the CoNLLU corpus", widget = "Boolean", required = true, def = "true") |
|
79 |
Boolean insertParagraphs = false; |
|
80 |
|
|
81 |
@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true") |
|
82 |
Boolean detectGap = false; |
|
83 |
|
|
84 |
@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options") |
|
85 |
Boolean separator3 = false; |
|
86 |
|
|
87 |
@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the CoNLLU corpus", widget = "Boolean", required = true, def = "true") |
|
88 |
Boolean insertNoSpaceAfter = true; |
|
89 |
|
|
90 |
@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false") |
|
91 |
Boolean insertTokenWithoutUdAnnotations; |
|
92 |
|
|
93 |
// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
94 |
@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties") |
|
95 |
Boolean separator_properties = false; |
|
96 |
|
|
97 |
@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "") |
|
98 |
String defaultFormPropertyName; |
|
99 |
|
|
100 |
@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "") |
|
101 |
String defaultLemmaPropertyName; |
|
102 |
|
|
103 |
@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "") |
|
104 |
String defaultUposPropertyName; |
|
105 |
|
|
106 |
@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "") |
|
107 |
String defaultXposPropertyName; |
|
108 |
|
|
109 |
@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "") |
|
110 |
String defaultFeatsPropertyName; |
|
111 |
|
|
112 |
@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "") |
|
113 |
String defaultHeadPropertyName; |
|
114 |
|
|
115 |
@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "") |
|
116 |
String defaultDeprelPropertyName; |
|
117 |
|
|
118 |
@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "") |
|
119 |
String defaultDepsPropertyName; |
|
120 |
|
|
121 |
@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "") |
|
122 |
String defaultMiscPropertyName; |
|
123 |
|
|
124 |
@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options") |
|
125 |
Boolean separator2 = false; |
|
126 |
|
|
127 |
@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]") |
|
128 |
String openingPunct; |
|
129 |
|
|
130 |
/** |
|
131 |
* the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix |
|
132 |
*/ |
|
133 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
134 |
|
|
135 |
/* |
|
136 |
* (non-Javadoc) |
|
137 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
|
138 |
*/ |
|
139 |
@Override |
|
140 |
public Object execute(final ExecutionEvent event) throws ExecutionException { |
|
141 |
|
|
142 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
|
143 |
|
|
144 |
Object s = selection.getFirstElement(); |
|
145 |
if (!(s instanceof MainCorpus)) { |
|
146 |
Log.warning("Selection is not a corpus. Aborting."); |
|
147 |
return null; |
|
148 |
} |
|
149 |
|
|
150 |
if (!ParametersDialog.open(this)) { |
|
151 |
return null; |
|
152 |
} |
|
153 |
|
|
154 |
conlluResultDirectory.mkdirs(); |
|
155 |
if (conlluResultDirectory == null || !conlluResultDirectory.exists() || !conlluResultDirectory.isDirectory()) { |
|
156 |
Log.warning("Error: conllu result directory does not exists: " + conlluResultDirectory); |
|
157 |
return null; |
|
158 |
} |
|
159 |
|
|
160 |
CQPCorpus corpus = (CQPCorpus) s; |
|
161 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
162 |
|
|
163 |
try { |
|
164 |
return exportAnnotationsAsCorpus(mainCorpus, conlluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations, |
|
165 |
defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName, |
|
166 |
defaultFeatsPropertyName, defaultHeadPropertyName, defaultDeprelPropertyName, defaultDepsPropertyName, |
|
167 |
defaultMiscPropertyName, |
|
168 |
detectGap, insertParagraphs, insertNoSpaceAfter); |
|
169 |
} |
|
170 |
catch (Exception e) { |
|
171 |
Log.warning(e); |
|
172 |
Log.printStackTrace(e); |
|
173 |
} |
|
174 |
|
|
175 |
return null; |
|
176 |
} |
|
177 |
|
|
178 |
/** |
|
179 |
* export the corpus in a directory of conllu files (one per text) |
|
180 |
* |
|
181 |
* @param mainCorpus |
|
182 |
* @param conlluResultDirectory |
|
183 |
* @param prefix |
|
184 |
* @param openingPunct |
|
185 |
* @param insertTokenWithoutUdAnnotations |
|
186 |
* @param defaultFormPropertyName |
|
187 |
* @param defaultLemmaPropertyName |
|
188 |
* @param defaultUposPropertyName |
|
189 |
* @param defaultXposPropertyName |
|
190 |
* @param detectGap |
|
191 |
* @param insertParagraphs |
|
192 |
* @param insertNoSpaceAfter |
|
193 |
* @return the number of annotation exported |
|
194 |
* @throws UnexpectedAnswerException |
|
195 |
* @throws IOException |
|
196 |
* @throws CqiServerError |
|
197 |
* @throws CqiClientException |
|
198 |
* @throws InvalidCqpIdException |
|
199 |
*/ |
|
200 |
public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations, |
|
201 |
String defaultFormPropertyName, String defaultLemmaPropertyName, String defaultUposPropertyName, String defaultXposPropertyName, |
|
202 |
String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName, |
|
203 |
String defaultMiscPropertyName, |
|
204 |
boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) |
|
205 |
throws UnexpectedAnswerException, |
|
206 |
IOException, |
|
207 |
CqiServerError, |
|
208 |
CqiClientException, InvalidCqpIdException { |
|
209 |
|
|
210 |
if (!conlluResultDirectory.exists()) { |
|
211 |
conlluResultDirectory.mkdirs(); |
|
212 |
} |
|
213 |
int numberOfWordsWritten = 0; |
|
214 |
int numberOfSentencesWritten = 0; |
|
215 |
int numberOfTextsWritten = 0; |
|
216 |
|
|
217 |
String[] textIds = mainCorpus.getCorpusTextIdsList(); |
|
218 |
int[] start_limits = mainCorpus.getTextStartLimits(); |
|
219 |
int[] end_limits = mainCorpus.getTextEndLimits(); |
|
220 |
|
|
221 |
String lang = mainCorpus.getLang(); |
|
222 |
// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang())); |
|
223 |
// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang())); |
|
224 |
|
|
225 |
for (String p : propNames) { |
|
226 |
WordProperty wp = mainCorpus.getProperty(prefix + p); |
|
227 |
if (wp == null) { |
|
228 |
Log.warning("Error: cannot find the Conllu property: " + prefix + p); |
|
229 |
return 0; |
|
230 |
} |
|
231 |
} |
|
232 |
|
|
233 |
if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) { |
|
234 |
Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true"); |
|
235 |
return 0; |
|
236 |
} |
|
237 |
|
|
238 |
for (int iText = 0; iText < start_limits.length; iText++) { |
|
239 |
|
|
240 |
// Build corpus positions |
|
241 |
int[] positions = new int[end_limits[iText] - start_limits[iText] + 1]; |
|
242 |
int tmp = 0; |
|
243 |
for (int n = start_limits[iText]; n <= end_limits[iText]; n++) { |
|
244 |
positions[tmp++] = n; |
|
245 |
} |
|
246 |
numberOfWordsWritten += positions.length; |
|
247 |
|
|
248 |
// Get UD properties |
|
249 |
WordProperty wp; |
|
250 |
wp = mainCorpus.getProperty(prefix + "id"); |
|
251 |
String[] tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
252 |
int[] idValues = new int[tmpValues.length]; |
|
253 |
for (int iId = 0; iId < tmpValues.length; iId++) { |
|
254 |
if (tmpValues[iId].length() > 0 && !tmpValues[iId].equals("_") && !tmpValues[iId].equals("__UNDEF__")) { |
|
255 |
idValues[iId] = Integer.parseInt(tmpValues[iId]); |
|
256 |
} |
|
257 |
else { |
|
258 |
idValues[iId] = 0; |
|
259 |
} |
|
260 |
} |
|
261 |
tmpValues = null; |
|
262 |
|
|
263 |
WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form"); |
|
264 |
String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions); |
|
265 |
fixUNDEFValues(formValues); |
|
266 |
|
|
267 |
wp = mainCorpus.getProperty(prefix + "lemma"); |
|
268 |
String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
269 |
fixUNDEFValues(lemmaValues); |
|
270 |
|
|
271 |
wp = mainCorpus.getProperty(prefix + "upos"); |
|
272 |
String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
273 |
fixUNDEFValues(uposValues); |
|
274 |
|
|
275 |
wp = mainCorpus.getProperty(prefix + "xpos"); |
|
276 |
String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
277 |
fixUNDEFValues(xposValues); |
|
278 |
|
|
279 |
wp = mainCorpus.getProperty(prefix + "feats"); |
|
280 |
String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
281 |
fixUNDEFValues(featsValues); |
|
282 |
|
|
283 |
wp = mainCorpus.getProperty(prefix + "head"); |
|
284 |
// String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
285 |
tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
286 |
int[] headValues = new int[tmpValues.length]; |
|
287 |
for (int iId = 0; iId < tmpValues.length; iId++) { |
|
288 |
if (tmpValues[iId].length() > 0 && !tmpValues[iId].equals("_") && !tmpValues[iId].equals("__UNDEF__")) { |
|
289 |
headValues[iId] = Integer.parseInt(tmpValues[iId]); |
|
290 |
} |
|
291 |
else { |
|
292 |
headValues[iId] = -1; |
|
293 |
} |
|
294 |
} |
|
295 |
tmpValues = null; |
|
296 |
|
|
297 |
wp = mainCorpus.getProperty(prefix + "deprel"); |
|
298 |
String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
299 |
fixUNDEFValues(deprelValues); |
|
300 |
|
|
301 |
wp = mainCorpus.getProperty(prefix + "deps"); |
|
302 |
String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
303 |
fixUNDEFValues(depsValues); |
|
304 |
|
|
305 |
wp = mainCorpus.getProperty(prefix + "misc"); |
|
306 |
String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
307 |
fixUNDEFValues(miscValues); |
|
308 |
|
|
309 |
HashSet<Integer> paragraphsStartPositions = new HashSet<>(); |
|
310 |
if (insertParagraphs) { |
|
311 |
StructuralUnit p_struct = mainCorpus.getStructuralUnit("p"); |
|
312 |
if (p_struct == null) { |
|
313 |
Log.warning(NLS.bind("** insertParagraph parameter is set, but there are no *p* structure (no paragraph) in the {0} CQP corpus. The insertParagraph parameter will be ignored.", |
|
314 |
mainCorpus)); |
|
315 |
} |
|
316 |
else { |
|
317 |
for (int position : mainCorpus.query(new CQLQuery("<p> [_.text_id=\"" + textIds[iText] + "\"]"), "textParagraphPositions", false).getStarts()) { |
|
318 |
paragraphsStartPositions.add(position); |
|
319 |
} |
|
320 |
} |
|
321 |
} |
|
322 |
|
|
323 |
HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"sentid"); |
|
324 |
HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"newdocid"); |
|
325 |
|
|
326 |
// build sentence, first pass using UD word sentence positions |
|
327 |
ArrayList<ArrayList<Integer>> sentences = new ArrayList<>(); |
|
328 |
ArrayList<Integer> tmpSentence = new ArrayList<>(); |
|
329 |
for (int p = 0; p < positions.length; p++) { |
|
330 |
// System.out.println("p=" + p + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " feats=" |
|
331 |
// + featsValues[p] + " head=" |
|
332 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
|
333 |
if (sentidStartPositions.containsKey(p)) { // new ud sentence |
|
334 |
|
|
335 |
if (tmpSentence.size() > 0) { |
|
336 |
sentences.add(new ArrayList<>(tmpSentence)); |
|
337 |
} |
|
338 |
|
|
339 |
// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " |
|
340 |
// feats=" |
|
341 |
// + featsValues[p] + " head=" |
|
342 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
|
343 |
tmpSentence.clear(); |
|
344 |
} |
|
345 |
|
|
346 |
if (insertTokenWithoutUdAnnotations) { |
|
347 |
tmpSentence.add(p); // insert all tokens |
|
348 |
} |
|
349 |
else if (idValues[p] != 0) { |
|
350 |
tmpSentence.add(p); // insert all tokens |
|
351 |
} |
|
352 |
|
|
353 |
} |
|
354 |
positions = null; // free memory |
|
355 |
|
|
356 |
// fixing sentences |
|
357 |
for (int s = 0; s < sentences.size(); s++) { |
|
358 |
|
|
359 |
// fix only ud sentences limits |
|
360 |
ArrayList<Integer> sentence = sentences.get(s); |
|
361 |
|
|
362 |
if (sentidStartPositions.get(sentence.get(0)) == null) { |
|
363 |
continue; // this is not a UD sentence |
|
364 |
} |
|
365 |
|
|
366 |
int max = -1; |
|
367 |
int imax = 0; |
|
368 |
for (int ip = 0; ip < sentence.size(); ip++) { |
|
369 |
int p = sentence.get(ip); |
|
370 |
if (max < idValues[p]) { |
|
371 |
max = idValues[p]; |
|
372 |
imax = ip; |
|
373 |
} |
|
374 |
} |
|
375 |
|
|
376 |
ArrayList<Integer> newSentence = new ArrayList<>(); |
|
377 |
for (int ip = imax + 1; ip < sentence.size(); ip++) { |
|
378 |
newSentence.add(sentence.get(ip)); |
|
379 |
sentence.remove(ip); |
|
380 |
ip--; |
|
381 |
} |
|
382 |
if (newSentence.size() == 1) { // the new sentence size is 1, resinsert it |
|
383 |
sentence.addAll(newSentence); |
|
384 |
newSentence.clear(); |
|
385 |
} |
|
386 |
if (newSentence.size() > 0) { |
|
387 |
//System.out.println("INSERT " + newSentence); |
|
388 |
sentences.add(s + 1, newSentence); |
|
389 |
} |
|
390 |
} |
|
391 |
|
|
392 |
if (tmpSentence.size() > 0) { // add last sentence |
|
393 |
sentences.add(new ArrayList<>(tmpSentence)); |
|
394 |
} |
|
395 |
|
|
396 |
// fixing sentence __NULL__ ud properties |
|
397 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
|
398 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
|
399 |
|
|
400 |
int[] sentencePositions = new int[sentence.size()]; |
|
401 |
for (int p = 0; p < sentence.size(); p++) { |
|
402 |
sentencePositions[p] = sentence.get(p); |
|
403 |
} |
|
404 |
|
|
405 |
// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" |
|
406 |
String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions); |
|
407 |
|
|
408 |
String[] words = null; |
|
409 |
if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) { |
|
410 |
words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions); |
|
411 |
} |
|
412 |
String[] lemmas = null; |
|
413 |
if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) { |
|
414 |
lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions); |
|
415 |
} |
|
416 |
String[] uposs = null; |
|
417 |
if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) { |
|
418 |
uposs = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions); |
|
419 |
} |
|
420 |
String[] xposs = null; |
|
421 |
if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) { |
|
422 |
xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions); |
|
423 |
} |
|
424 |
|
|
425 |
String[] feats = null; |
|
426 |
if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) { |
|
427 |
feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions); |
|
428 |
} |
|
429 |
String[] heads = null; |
|
430 |
if (defaultHeadPropertyName != null && defaultHeadPropertyName.length() > 0) { |
|
431 |
heads = getDefaultValues(mainCorpus, defaultHeadPropertyName, sentencePositions); |
|
432 |
} |
|
433 |
String[] deprels = null; |
|
434 |
if (defaultDeprelPropertyName != null && defaultDeprelPropertyName.length() > 0) { |
|
435 |
deprels = getDefaultValues(mainCorpus, defaultDeprelPropertyName, sentencePositions); |
|
436 |
} |
|
437 |
String[] depss = null; |
|
438 |
if (defaultDepsPropertyName != null && defaultDepsPropertyName.length() > 0) { |
|
439 |
depss = getDefaultValues(mainCorpus, defaultDepsPropertyName, sentencePositions); |
|
440 |
} |
|
441 |
String[] miscs = null; |
|
442 |
if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) { |
|
443 |
miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions); |
|
444 |
} |
|
445 |
|
|
446 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions); |
|
447 |
// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions); |
|
448 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
|
449 |
// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
|
450 |
|
|
451 |
// fix ud properties using CQP values |
|
452 |
for (int ip = 0; ip < sentence.size(); ip++) { |
|
453 |
|
|
454 |
int p = sentence.get(ip); |
|
455 |
|
|
456 |
// new word |
|
457 |
if (miscValues[p].equals("_")) { |
|
458 |
miscValues[p] = "XmlId=" + ids[ip]; |
|
459 |
} |
|
460 |
|
|
461 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
462 |
if (words != null && formValues[p].equals("_")) { |
|
463 |
formValues[p] = words[ip]; |
|
464 |
} |
|
465 |
if (lemmas != null && lemmaValues[p].equals("_")) { |
|
466 |
lemmaValues[p] = lemmas[ip]; |
|
467 |
} |
|
468 |
if (uposs != null && uposValues[p].equals("_")) { |
|
469 |
uposValues[p] = uposs[ip]; |
|
470 |
} |
|
471 |
if (xposs != null && xposValues[p].equals("_")) { |
|
472 |
xposValues[p] = xposs[ip]; |
|
473 |
} |
|
474 |
if (feats != null && featsValues[p].equals("_")) { |
|
475 |
featsValues[p] = feats[ip]; |
|
476 |
} |
|
477 |
if (heads != null && headValues[p] < 0) { |
|
478 |
headValues[p] = Integer.parseInt(heads[ip]); |
|
479 |
} |
|
480 |
if (deprels != null && deprelValues[p].equals("_")) { |
|
481 |
deprelValues[p] = deprels[ip]; |
|
482 |
} |
|
483 |
if (depss != null && depsValues[p].equals("_")) { |
|
484 |
depsValues[p] = depss[ip]; |
|
485 |
} |
|
486 |
if (miscs != null && miscValues[p].equals("_")) { |
|
487 |
miscValues[p] = miscs[ip]; |
|
488 |
} |
|
489 |
} |
|
490 |
|
|
491 |
if (insertNoSpaceAfter) { |
|
492 |
for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed |
|
493 |
int p = sentence.get(ip); |
|
494 |
// insertion activated |
|
495 |
if (!miscValues[p].contains("SpaceAfter=")) { // only update if not present |
|
496 |
if (LangFormater.isSpaceAfterNotNeeded(formValues[p], lang)) { |
|
497 |
miscValues[p] += "|SpaceAfter=No"; |
|
498 |
} |
|
499 |
else if (formValues.length > (p + 1) && LangFormater.isSpaceBeforeNotNeeded(formValues[p + 1], lang)) { |
|
500 |
// if the next token does not need a space before it, set SpaceAfter=No on the current token |
|
501 |
miscValues[p] += "|SpaceAfter=No"; |
|
502 |
} |
|
503 |
} |
|
504 |
} |
|
505 |
} |
|
506 |
|
|
507 |
// fixing sentence punct limits |
|
508 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) { |
|
509 |
// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence); |
|
510 |
int p2 = sentence.remove(0); |
|
511 |
sentences.get(iSentence - 1).add(p2); |
|
512 |
} |
|
513 |
// |
|
514 |
while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) { |
|
515 |
// System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence); |
|
516 |
int p2 = sentence.remove(sentence.size() - 1); |
|
517 |
sentences.get(iSentence + 1).add(0, p2); |
|
518 |
} |
|
519 |
|
|
520 |
if (sentence.size() == 0) { // sentence was depleted after fixing it |
|
521 |
sentences.remove(iSentence); |
|
522 |
iSentence--; |
|
523 |
continue; |
|
524 |
} |
|
525 |
} |
|
526 |
|
|
527 |
for (int s = 0; s < sentences.size(); s++) { |
|
528 |
|
|
529 |
// fix only ud sentences limits |
|
530 |
ArrayList<Integer> sentence = sentences.get(s); |
|
531 |
HashMap<Integer, Integer> oldToNewIds = new HashMap<>(); |
|
532 |
for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids |
|
533 |
int p = sentence.get(ip); |
|
534 |
|
|
535 |
if (idValues[p] != 0) { // store "old id -> new id" |
|
536 |
oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N |
|
537 |
} |
|
538 |
} |
|
539 |
|
|
540 |
// fixing head and set missing head to 0 and root |
|
541 |
for (int ip = 0; ip < sentence.size(); ip++) { |
|
542 |
int p = sentence.get(ip); |
|
543 |
|
|
544 |
// fixing id value |
|
545 |
idValues[p] = (ip + 1); // from 1 to N |
|
546 |
|
|
547 |
// fixing head values |
|
548 |
if (oldToNewIds.containsKey(headValues[p])) { |
|
549 |
headValues[p] = oldToNewIds.get(headValues[p]); |
|
550 |
} |
|
551 |
else if (headValues[p] != 0) { // new word, set to default values |
|
552 |
headValues[p] = 0; |
|
553 |
deprelValues[p] = "_"; |
|
554 |
depsValues[p] = "_"; |
|
555 |
} |
|
556 |
} |
|
557 |
} |
|
558 |
|
|
559 |
// writing sentences |
|
560 |
File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu"); |
|
561 |
PrintWriter writer = IOUtils.getWriter(resultConlluFile); |
|
562 |
|
|
563 |
int iParagraph = 1; |
|
564 |
|
|
565 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
|
566 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
|
567 |
|
|
568 |
int[] sentencePositions = new int[sentence.size()]; |
|
569 |
for (int p = 0; p < sentence.size(); p++) { |
|
570 |
sentencePositions[p] = sentence.get(p); |
|
571 |
} |
|
572 |
|
|
573 |
String[] gap = null; |
|
574 |
if (detectGap && mainCorpus.getProperty("gap") != null) { |
|
575 |
gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions); |
|
576 |
} |
|
577 |
|
|
578 |
String[] tokens = new String[sentence.size()]; |
|
579 |
for (int ip = 0; ip < sentence.size(); ip++) { |
|
580 |
tokens[ip] = formValues[sentence.get(ip)]; |
|
581 |
} |
|
582 |
|
|
583 |
if (insertNoSpaceAfter) { |
|
584 |
writer.println("# text = " + LangFormater.format(StringUtils.join(tokens, " "), mainCorpus.getLang())); |
|
585 |
} |
|
586 |
else { |
|
587 |
writer.println("# text = " + StringUtils.join(tokens, " ")); |
|
588 |
} |
|
589 |
|
|
590 |
if (newdocidStartPositions.containsKey(sentence.get(0))) { |
|
591 |
writer.println("# newdoc id = " + newdocidStartPositions.get(sentence.get(0))); |
|
592 |
} |
|
593 |
else { |
|
594 |
writer.println("# newdoc id = " + textIds[iText]); |
|
595 |
} |
|
596 |
|
|
597 |
boolean foundSentId = false; |
|
598 |
for (int ip : sentence) { |
|
599 |
if (!foundSentId && sentidStartPositions.containsKey(ip)) { |
|
600 |
writer.println("# sent_id = " + sentidStartPositions.get(ip)); |
|
601 |
foundSentId = true; |
|
602 |
} |
|
603 |
} |
|
604 |
if (!foundSentId) { // no sent_id found |
|
605 |
writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new"); |
|
606 |
} |
|
607 |
|
|
608 |
if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set |
|
609 |
writer.println("# newpar id = " + iParagraph); |
|
610 |
iParagraph++; |
|
611 |
} |
|
612 |
|
|
613 |
for (int ip = 0; ip < sentence.size(); ip++) { |
|
614 |
int p = sentence.get(ip); |
|
615 |
|
|
616 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
617 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p] |
|
618 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p] |
|
619 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]); |
|
620 |
|
|
621 |
if (gap != null && gap[ip].equals("next")) { |
|
622 |
writer.println("# gap"); |
|
623 |
} |
|
624 |
} |
|
625 |
writer.println(""); |
|
626 |
numberOfSentencesWritten++; |
|
627 |
} |
|
628 |
writer.close(); |
|
629 |
|
|
630 |
System.out.println(" Text done: " + resultConlluFile); |
|
631 |
numberOfTextsWritten++; |
|
632 |
} |
|
633 |
|
|
634 |
System.out.println("# words written: " + numberOfWordsWritten); |
|
635 |
System.out.println("# sentences written: " + numberOfSentencesWritten); |
|
636 |
System.out.println("# texts written: " + numberOfTextsWritten); |
|
637 |
|
|
638 |
return numberOfWordsWritten; |
|
639 |
} |
|
640 |
|
|
641 |
private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException { |
|
642 |
|
|
643 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions); |
|
644 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values |
|
645 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) { |
|
646 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1); |
|
647 |
} |
|
648 |
} |
|
649 |
|
|
650 |
return values; |
|
651 |
} |
|
652 |
|
|
653 |
private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException { |
|
654 |
|
|
655 |
HashMap<Integer, String> sentidStartPositions = new HashMap<>(); |
|
656 |
int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+"); |
|
657 |
String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids); |
|
658 |
for (int iId = 0; iId < ids.length; iId++) { |
|
659 |
int id = ids[iId]; |
|
660 |
int[] pp = CQPSearchEngine.getCqiClient().id2Cpos(mainCorpus.getProperty(property).getQualifiedName(), id); |
|
661 |
for (int p : pp) { |
|
662 |
sentidStartPositions.put(p, strs[iId]); |
|
663 |
} |
|
664 |
} |
|
665 |
|
|
666 |
return sentidStartPositions; |
|
667 |
} |
|
668 |
|
|
669 |
private static void fixUNDEFValues(String[] values) { |
|
670 |
|
|
671 |
for (int i = 0; i < values.length; i++) { |
|
672 |
if (values[i].equals("__UNDEF__") || values[i].equals("") || values[i].equals("|_|")) { |
|
673 |
values[i] = "_"; |
|
674 |
} |
|
675 |
} |
|
676 |
} |
|
677 |
} |
|
678 |
|
|
679 |
|
|
0 | 680 |
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ToCoNLL2009.java (revision 3540) | ||
---|---|---|
1 |
package org.txm.conllu.rcp.commands; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.io.FileOutputStream; |
|
5 |
import java.io.IOException; |
|
6 |
import java.io.OutputStreamWriter; |
|
7 |
import java.io.Writer; |
|
8 |
import java.util.List; |
|
9 |
|
|
10 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
|
11 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
|
12 |
import org.txm.searchengine.cqp.corpus.CorpusManager; |
|
13 |
import org.txm.searchengine.cqp.corpus.Property; |
|
14 |
import org.txm.searchengine.cqp.corpus.StructuralUnit; |
|
15 |
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty; |
|
16 |
import org.txm.searchengine.cqp.corpus.WordProperty; |
|
17 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery; |
|
18 |
import org.txm.searchengine.cqp.corpus.query.Match; |
|
19 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
|
20 |
import org.txm.utils.ConsoleProgressBar; |
|
21 |
|
|
22 |
public class ToCoNLL2009 { |
|
23 |
|
|
24 |
boolean debug = false; |
|
25 |
/** |
|
26 |
* To CoNLL2009. |
|
27 |
* |
|
28 |
* @param outfile the outfile, where is result is saved |
|
29 |
* @param corpus the corpus or sub-corpus to export |
|
30 |
* @param sentenceUnit the StructuralUnitProperty that identify a sentence |
|
31 |
* @param word the word property |
|
32 |
* @param lemme the lemme property |
|
33 |
* @param pos the pos property |
|
34 |
* @param encoding the encoding of the outfile |
|
35 |
* @return true, if successful |
|
36 |
* @throws CqiServerError |
|
37 |
* @throws IOException |
|
38 |
* @throws CqiClientException |
|
39 |
*/ |
|
40 |
public boolean process(File outdir, CQPCorpus corpus, StructuralUnit sentenceUnit, Property word, Property lemme, Property pos, String encoding) throws CqiClientException, IOException, CqiServerError { |
|
41 |
|
|
42 |
String sstruct = ""; |
|
43 |
if (sentenceUnit != null) { |
|
44 |
sstruct = "expand to "+sentenceUnit.getName(); |
|
45 |
} |
|
46 |
|
|
47 |
String[] texts = corpus.getCorpusTextIdsList(); |
|
48 |
WordProperty idProperty = corpus.getProperty("id"); |
|
49 |
|
|
50 |
ConsoleProgressBar cpb = new ConsoleProgressBar(texts.length); |
|
51 |
|
|
52 |
for (String text : texts) { |
|
53 |
|
|
54 |
cpb.tick(); |
|
55 |
|
|
56 |
List<Match> matches = corpus.query(new CQLQuery("[_.text_id=\""+text+"\"]"+sstruct), "TMPEXPORTCONLL", false).getMatches(); |
|
57 |
if (debug) System.out.println(matches.size()); |
|
58 |
int npositions = 0; |
|
59 |
for (Match match : matches) npositions += match.size() + 1; |
|
60 |
if (debug) System.out.println("npositions="+npositions); |
|
61 |
|
|
62 |
int[] positions = new int[npositions+1]; |
|
63 |
int i = 0; |
|
64 |
for (Match match : matches) { |
|
65 |
for (int p : match.getRange()) |
|
66 |
positions[i++] = p; |
|
67 |
} |
|
68 |
|
|
69 |
String[] words = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(word.getQualifiedName(), positions); |
|
70 |
|
|
71 |
String[] idsList = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(idProperty.getQualifiedName(), positions); |
|
72 |
|
|
73 |
String[] lemmes = null; |
|
74 |
if (lemme != null) { |
|
75 |
lemmes = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(lemme.getQualifiedName(), positions); |
|
76 |
} |
|
77 |
|
|
78 |
String[] poss = null; |
|
79 |
if (pos != null) { |
|
80 |
poss = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(pos.getQualifiedName(), positions); |
|
81 |
} |
|
82 |
|
|
83 |
File outfile = new File(outdir, text+".conllu"); |
|
84 |
Writer writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); //$NON-NLS-1$; |
|
85 |
|
|
86 |
String lemmestr; |
|
87 |
String posstr; |
|
88 |
int id = 0; |
|
89 |
int iposition = 0; |
|
90 |
for (Match match : matches) { |
|
91 |
|
|
92 |
writer.write("\n"); |
|
93 |
writer.write("# newdoc id="+text+"\n"); |
|
94 |
writer.write("# sent_id="+match+"\n"); |
|
95 |
|
|
96 |
id = 1; // first position in sentence |
|
97 |
for (i = match.getStart() ; i <= match.getEnd() ; i++) { // loop n times, n = mathc length |
|
98 |
|
|
99 |
if (lemme != null) { |
|
100 |
lemmestr = lemmes[iposition]; |
|
101 |
} else { |
|
102 |
lemmestr = "_"; |
|
103 |
} |
|
104 |
|
|
105 |
if (pos != null) { |
|
106 |
posstr = poss[iposition]; |
|
107 |
} else { |
|
108 |
posstr = "_"; |
|
109 |
} |
|
110 |
|
|
111 |
//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n"); |
|
112 |
writer.write(""+id+"\t"+words[iposition]+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\tXmlID="+idsList[iposition]+"\n"); |
|
113 |
|
|
114 |
id++; // next word |
|
115 |
iposition++; // nextposition |
|
116 |
} |
|
117 |
} |
|
118 |
writer.flush(); |
|
119 |
writer.close(); |
|
120 |
} |
|
121 |
|
|
122 |
cpb.done(); |
|
123 |
return true; |
|
124 |
} |
|
125 |
|
|
126 |
public static void main(String[] args) throws CqiClientException, IOException, CqiServerError { |
|
127 |
|
|
128 |
CQPCorpus corpus = null; |
|
129 |
File outfile = new File("/home/mdecorde/Bureau/conn_export.tsv"); |
|
130 |
String encoding = "UTF-8"; |
|
131 |
|
|
132 |
String sentenceProperty = "s_id"; |
|
133 |
String posProperty = "frpos"; |
|
134 |
String lemmaProperty = "frlemma"; |
|
135 |
|
|
136 |
String[] split = sentenceProperty.split("_", 2); |
|
137 |
StructuralUnit s = null;//corpus.getStructuralUnit(split[0]).getProperty(split[1]) |
|
138 |
Property word = null;//corpus.getProperty("word") |
|
139 |
Property lemma = null;//corpus.getProperty(lemmaProperty) |
|
140 |
Property pos = null;//corpus.getProperty(posProperty) |
|
141 |
|
|
142 |
// if (s == null) { println "Error sentence property: $sentenceProperty"; return} |
|
143 |
// if (word == null) { println "Error no word property"; return} |
|
144 |
// if (pos == null) { println "Error pos property: $posProperty"; return} |
|
145 |
// if (lemma == null) { println "Error lemma property: $lemmaProperty"; return} |
|
146 |
|
|
147 |
ToCoNLL2009 processor = new ToCoNLL2009(); |
|
148 |
processor.process(outfile, corpus, s, word, lemma, pos, encoding); |
|
149 |
} |
|
150 |
} |
|
0 | 151 |
TXM/trunk/org.txm.conllu.rcp/plugin.xml (revision 3540) | ||
---|---|---|
21 | 21 |
id="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromFile" |
22 | 22 |
name="%command.name.1"> |
23 | 23 |
</command> |
24 |
<command |
|
25 |
categoryId="TreeSearch4TXM.commands.category" |
|
26 |
defaultHandler="org.txm.conllu.rcp.commands.ExportCorpusAsFullCoNLLU" |
|
27 |
id="org.txm.conllu.rcp.commands.ExportCorpusAsFullCoNLLU" |
|
28 |
name="Export as a full CoNLL-U corpus..."> |
|
29 |
</command> |
|
24 | 30 |
</extension> |
25 | 31 |
<extension |
26 | 32 |
point="org.eclipse.ui.menus"> |
... | ... | |
42 | 48 |
<command |
43 | 49 |
commandId="org.txm.conllu.rcp.commands.ExportCorpusAsCoNLLU" |
44 | 50 |
icon="icons/functions/UD.png" |
45 |
label="%command.label.0" |
|
46 | 51 |
style="push"> |
47 | 52 |
<visibleWhen |
48 | 53 |
checkEnabled="false"> |
... | ... | |
51 | 56 |
</reference> |
52 | 57 |
</visibleWhen> |
53 | 58 |
</command> |
59 |
<command |
|
60 |
commandId="org.txm.conllu.rcp.commands.ExportCorpusAsFullCoNLLU" |
|
61 |
icon="icons/functions/UD.png" |
|
62 |
style="push"> |
|
63 |
<visibleWhen |
|
64 |
checkEnabled="false"> |
|
65 |
<reference |
|
66 |
definitionId="OneMainCorpusSelected"> |
|
67 |
</reference> |
|
68 |
</visibleWhen> |
|
69 |
</command> |
|
54 | 70 |
</menuContribution> |
55 | 71 |
<menuContribution |
56 | 72 |
locationURI="menu:menu.file.import.annotations"> |
TXM/trunk/org.txm.core/src/java/org/txm/core/preferences/TBXPreferences.java (revision 3540) | ||
---|---|---|
75 | 75 |
|
76 | 76 |
public static final String EDITION_DEFINITION_IMAGES_DIRECTORY = "edition_definition_images_directory"; //$NON-NLS-1$ |
77 | 77 |
|
78 |
public static final String EDITION_DEFINITION_TOOLTIP_PROPERTIES = "edition_definition_tooltip_properties"; //$NON-NLS-1$ |
|
79 |
|
|
78 | 80 |
/** |
79 | 81 |
* To show or not all nodes in some views, eg. in CorporaView tree. |
80 | 82 |
*/ |
TXM/trunk/org.txm.core/src/java/org/txm/objects/EditionDefinition.java (revision 3540) | ||
---|---|---|
62 | 62 |
node.put(key, value); |
63 | 63 |
} |
64 | 64 |
|
65 |
public String get(String key) { |
|
66 |
return node.get(key, null);
|
|
65 |
public String get(String key, String defaultValue) {
|
|
66 |
return node.get(key, defaultValue);
|
|
67 | 67 |
} |
68 | 68 |
|
69 |
public final String get(String key) { |
|
70 |
return get(key, ""); |
|
71 |
} |
|
72 |
|
|
69 | 73 |
public void copyParametersTo(EditionDefinition anotherEditionDefinition) { |
70 | 74 |
anotherEditionDefinition.setPageBreakTag(this.getPageElement()); |
71 | 75 |
anotherEditionDefinition.setWordsPerPage(this.getWordsPerPage()); |
Formats disponibles : Unified diff