Révision 3731
| TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/commands/function/WordPropertiesFromTable.java (revision 3731) | ||
|---|---|---|
| 30 | 30 |
|
| 31 | 31 |
MainCorpus corpus = (MainCorpus)first; |
| 32 | 32 |
|
| 33 |
File script = new File(Toolbox.getTxmHomePath(), "/scripts/groovy/user/org/txm/macro/annotation/ImportWordPropertiesFromTableMacro.groovy"); //$NON-NLS-1$
|
|
| 33 |
File script = new File(Toolbox.getTxmHomePath(), "/scripts/groovy/user/org/txm/macro/annotation/ImportWordPropertiesFromTableMacro.groovy"); //$NON-NLS-1$ |
|
| 34 | 34 |
//File parametersFile = new File(Toolbox.getTxmHomePath(), "/scripts/groovy/user/org/txm/macro/annotation/ImportWordPropertiesFromTableMacro.properties"); |
| 35 | 35 |
|
| 36 | 36 |
HashMap<String, Object> defaultParameters = new HashMap<String, Object>(); |
| ... | ... | |
| 38 | 38 |
defaultParameters.put("csvFile", corpus.getName()+"_annotations.tsv"); //$NON-NLS-1$ //$NON-NLS-2$
|
| 39 | 39 |
|
| 40 | 40 |
ExecuteGroovyMacro.execute(script.getAbsolutePath(), part, selection, null, null, defaultParameters); //$NON-NLS-1$ |
| 41 |
|
|
| 42 |
|
|
| 41 | 43 |
return null; |
| 42 | 44 |
} |
| 43 | 45 |
|
| TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/commands/workspace/UpdateCorpus.java (revision 3731) | ||
|---|---|---|
| 2 | 2 |
|
| 3 | 3 |
import java.io.File; |
| 4 | 4 |
import java.io.FileFilter; |
| 5 |
import java.util.Date; |
|
| 5 | 6 |
|
| 6 | 7 |
import org.eclipse.core.commands.AbstractHandler; |
| 7 | 8 |
import org.eclipse.core.commands.ExecutionEvent; |
| ... | ... | |
| 15 | 16 |
import org.eclipse.osgi.util.NLS; |
| 16 | 17 |
import org.eclipse.swt.widgets.Display; |
| 17 | 18 |
import org.eclipse.ui.handlers.HandlerUtil; |
| 19 |
import org.txm.Toolbox; |
|
| 18 | 20 |
import org.txm.core.preferences.TBXPreferences; |
| 19 | 21 |
import org.txm.objects.Project; |
| 20 | 22 |
import org.txm.rcp.commands.CloseEditorsUsing; |
| ... | ... | |
| 144 | 146 |
try {
|
| 145 | 147 |
if (project.compute(monitor, true)) { // TODO children should be recomputed later only when the user needs it
|
| 146 | 148 |
|
| 149 |
project.appendToHistory("Updated");
|
|
| 150 |
|
|
| 147 | 151 |
this.syncExec(new Runnable() {
|
| 148 | 152 |
|
| 149 | 153 |
@Override |
| TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/swt/widget/QueryWidget.java (revision 3731) | ||
|---|---|---|
| 78 | 78 |
|
| 79 | 79 |
h = null; |
| 80 | 80 |
|
| 81 |
// if (this.project != null) {
|
|
| 82 |
// h = this.project.getFirstChild(QueryHistory.class); |
|
| 83 |
// |
|
| 84 |
// if (h == null) {
|
|
| 85 |
// h = new QueryHistory(project); |
|
| 86 |
// } |
|
| 87 |
// } |
|
| 88 |
// |
|
| 89 |
// try { // load history from queries.txt file
|
|
| 90 |
// h.compute(false); |
|
| 91 |
// } catch (InterruptedException e) {
|
|
| 92 |
// // TODO Auto-generated catch block |
|
| 93 |
// e.printStackTrace(); |
|
| 94 |
// } |
|
| 81 |
if (this.project != null) {
|
|
| 82 |
h = this.project.getFirstChild(QueryHistory.class); |
|
| 83 |
|
|
| 84 |
if (h == null) {
|
|
| 85 |
h = new QueryHistory(project); |
|
| 86 |
} |
|
| 87 |
} |
|
| 95 | 88 |
|
| 89 |
try { // load history from queries.txt file
|
|
| 90 |
h.compute(false); |
|
| 91 |
} catch (InterruptedException e) {
|
|
| 92 |
// TODO Auto-generated catch block |
|
| 93 |
e.printStackTrace(); |
|
| 94 |
} |
|
| 95 |
|
|
| 96 | 96 |
setHistoryItems(); |
| 97 | 97 |
} |
| 98 | 98 |
|
| TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/swt/widget/parameters/SeparatorField.java (revision 3731) | ||
|---|---|---|
| 1 | 1 |
package org.txm.rcp.swt.widget.parameters; |
| 2 | 2 |
|
| 3 | 3 |
import org.eclipse.swt.SWT; |
| 4 |
import org.eclipse.swt.graphics.Font; |
|
| 5 |
import org.eclipse.swt.graphics.FontData; |
|
| 6 | 4 |
import org.eclipse.swt.layout.GridData; |
| 7 | 5 |
import org.eclipse.swt.layout.GridLayout; |
| 8 | 6 |
import org.eclipse.swt.widgets.Composite; |
| 9 | 7 |
import org.eclipse.swt.widgets.Label; |
| 10 |
import org.eclipse.swt.widgets.Text; |
|
| 11 | 8 |
import org.kohsuke.args4j.NamedOptionDef; |
| 12 | 9 |
|
| 13 | 10 |
/** |
| ... | ... | |
| 37 | 34 |
l.setLayoutData(gd); |
| 38 | 35 |
l.setText(str); |
| 39 | 36 |
l.setToolTipText(getWidgetUsage()); |
| 40 |
Font f = parent.getFont(); |
|
| 41 |
l.setFont(new Font(f.getDevice(), new FontData(f.getFontData()[0].getName(), f.getFontData()[0].getHeight(), f.getFontData()[0].getStyle()|SWT.BOLD)));
|
|
| 37 |
// Font f = parent.getFont();
|
|
| 38 |
// l.setFont(new Font(Display.getCurrent(), new FontData(f.getFontData()[0].getName(), f.getFontData()[0].getHeight(), f.getFontData()[0].getStyle()|SWT.BOLD)));
|
|
| 42 | 39 |
Label dt = new Label(this, SWT.SEPARATOR | SWT.HORIZONTAL); |
| 43 | 40 |
dt.setLayoutData(new GridData(SWT.FILL, SWT.END, true, false)); |
| 44 | 41 |
dt.setToolTipText(getWidgetUsage()); |
| TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/ApplicationWorkbenchAdvisor.java (revision 3731) | ||
|---|---|---|
| 513 | 513 |
*/ |
| 514 | 514 |
@Override |
| 515 | 515 |
public void postShutdown() {
|
| 516 |
|
|
| 516 | 517 |
callPreStopScript(); |
| 517 | 518 |
|
| 518 | 519 |
Toolbox.shutdown(); |
| TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportTIGERAnnotations.java (revision 3731) | ||
|---|---|---|
| 232 | 232 |
Log.info("Finalizing TIGERSearch corpus");
|
| 233 | 233 |
if (numberOfWordsAnnotated > 0) { // copy the TIGERcorpus to import
|
| 234 | 234 |
FileCopy.copyFiles(tigerCorpusDirectory, tigerCorpusExistingDirectory); |
| 235 |
|
|
| 236 |
corpus.getProject().appendToHistory("TIGER Annotations imported from "+tigerDirectory);
|
|
| 237 |
|
|
| 235 | 238 |
Log.info("Done. " + numberOfWordsAnnotated + " words annotated.");
|
| 236 | 239 |
} |
| 237 | 240 |
else {
|
| TXM/trunk/bundles/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/ImportGlozzAnnotations.java (revision 3731) | ||
|---|---|---|
| 95 | 95 |
Log.info(Messages.ImportGlozzAnnotations_9); |
| 96 | 96 |
URSCorpora.saveCorpus(analecCorpus); |
| 97 | 97 |
|
| 98 |
mainCorpus.getProject().appendToHistory("URS Annotations imported from "+aafile+", "+aamfile+" and "+acfile);
|
|
| 99 |
|
|
| 98 | 100 |
Log.info(Messages.ImportGlozzAnnotations_10); |
| 99 | 101 |
return true; |
| 100 | 102 |
} |
| TXM/trunk/bundles/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/SaveCorpus.java (revision 3731) | ||
|---|---|---|
| 87 | 87 |
Log.warning(Messages.SaveCorpus_5); |
| 88 | 88 |
return false; |
| 89 | 89 |
} |
| 90 |
|
|
| 91 |
mainCorpus.getProject().appendToHistory("URS annotations saved");
|
|
| 90 | 92 |
|
| 91 | 93 |
mainCorpus.setIsModified(false); |
| 92 | 94 |
if (event != null) {
|
| TXM/trunk/bundles/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/ImportTEIAnnotations.java (revision 3731) | ||
|---|---|---|
| 76 | 76 |
Log.warning(Messages.ImportTEIAnnotations_2); |
| 77 | 77 |
return Status.CANCEL_STATUS; |
| 78 | 78 |
} else {
|
| 79 |
mainCorpus.getProject().appendToHistory("URS annotations imported from "+ directory +" : "+analecCorpus.getStructure().toString());
|
|
| 79 | 80 |
return Status.OK_STATUS; |
| 80 | 81 |
} |
| 81 | 82 |
} catch (Throwable e) {
|
| ... | ... | |
| 136 | 137 |
if (ret) {
|
| 137 | 138 |
Log.info(TXMCoreMessages.bind(Messages.ImportTEIAnnotations_5, analecCorpus.getToutesUnites().size(), analecCorpus.getToutesRelations().size(), analecCorpus.getTousSchemas().size())); |
| 138 | 139 |
mainCorpus.setIsModified(true); |
| 140 |
|
|
| 141 |
mainCorpus.getProject().appendToHistory("URS Annotations imported from TEI files of "+annotationDirectory);
|
|
| 142 |
|
|
| 139 | 143 |
CorporaView.refreshObject(mainCorpus); |
| 140 | 144 |
} |
| 141 | 145 |
return ret; |
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsFullCoNLLU.java (revision 3731) | ||
|---|---|---|
| 42 | 42 |
import org.eclipse.osgi.util.NLS; |
| 43 | 43 |
import org.eclipse.ui.handlers.HandlerUtil; |
| 44 | 44 |
import org.kohsuke.args4j.Option; |
| 45 |
import org.txm.conllu.core.function.ImportCoNLLUAnnotations; |
|
| 45 | 46 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
| 46 | 47 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
| 47 | 48 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
| ... | ... | |
| 63 | 64 |
* @author mdecorde. |
| 64 | 65 |
*/ |
| 65 | 66 |
public class ExportCorpusAsFullCoNLLU extends AbstractHandler {
|
| 66 |
|
|
| 67 |
|
|
| 67 | 68 |
public static final String ID = ExportCorpusAsFullCoNLLU.class.getName(); |
| 68 |
|
|
| 69 |
|
|
| 69 | 70 |
@Option(name = "conlluResultDirectory", usage = "conlluResultDirectory", widget = "Folder", required = true, def = "conllu-result-directory") |
| 70 | 71 |
File conlluResultDirectory; |
| 71 |
|
|
| 72 |
|
|
| 72 | 73 |
@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-") |
| 73 | 74 |
String propertiesPrefix; |
| 74 |
|
|
| 75 |
|
|
| 75 | 76 |
@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties") |
| 76 | 77 |
Boolean separator = false; |
| 77 |
|
|
| 78 |
|
|
| 78 | 79 |
@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the CoNLLU corpus", widget = "Boolean", required = true, def = "true") |
| 79 | 80 |
Boolean insertParagraphs = false; |
| 80 |
|
|
| 81 |
|
|
| 81 | 82 |
@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true") |
| 82 | 83 |
Boolean detectGap = false; |
| 83 |
|
|
| 84 |
|
|
| 84 | 85 |
@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options") |
| 85 | 86 |
Boolean separator3 = false; |
| 86 |
|
|
| 87 |
|
|
| 87 | 88 |
@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the CoNLLU corpus", widget = "Boolean", required = true, def = "true") |
| 88 | 89 |
Boolean insertNoSpaceAfter = true; |
| 89 |
|
|
| 90 |
|
|
| 90 | 91 |
@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false") |
| 91 | 92 |
Boolean insertTokenWithoutUdAnnotations; |
| 92 |
|
|
| 93 |
|
|
| 93 | 94 |
// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
| 94 | 95 |
@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties") |
| 95 | 96 |
Boolean separator_properties = false; |
| 96 |
|
|
| 97 |
|
|
| 97 | 98 |
@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "") |
| 98 | 99 |
String defaultFormPropertyName; |
| 99 |
|
|
| 100 |
|
|
| 100 | 101 |
@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "") |
| 101 | 102 |
String defaultLemmaPropertyName; |
| 102 |
|
|
| 103 |
|
|
| 103 | 104 |
@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "") |
| 104 | 105 |
String defaultUposPropertyName; |
| 105 |
|
|
| 106 |
|
|
| 106 | 107 |
@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "") |
| 107 | 108 |
String defaultXposPropertyName; |
| 108 |
|
|
| 109 |
|
|
| 109 | 110 |
@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "") |
| 110 | 111 |
String defaultFeatsPropertyName; |
| 111 |
|
|
| 112 |
|
|
| 112 | 113 |
@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "") |
| 113 | 114 |
String defaultHeadPropertyName; |
| 114 |
|
|
| 115 |
|
|
| 115 | 116 |
@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "") |
| 116 | 117 |
String defaultDeprelPropertyName; |
| 117 |
|
|
| 118 |
|
|
| 118 | 119 |
@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "") |
| 119 | 120 |
String defaultDepsPropertyName; |
| 120 |
|
|
| 121 |
|
|
| 121 | 122 |
@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "") |
| 122 | 123 |
String defaultMiscPropertyName; |
| 123 |
|
|
| 124 |
|
|
| 124 | 125 |
@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options") |
| 125 | 126 |
Boolean separator2 = false; |
| 126 |
|
|
| 127 |
|
|
| 127 | 128 |
@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]") |
| 128 | 129 |
String openingPunct; |
| 129 |
|
|
| 130 |
|
|
| 130 | 131 |
/** |
| 131 | 132 |
* The UD property suffixes; each suffix is appended to propertiesPrefix to build the name of the corresponding CQP property |
| 132 | 133 |
*/ |
| 133 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
| 134 |
|
|
| 134 |
public static String[] propNames = ImportCoNLLUAnnotations.UD_PROPERTY_NAMES;
|
|
| 135 |
|
|
| 135 | 136 |
/* |
| 136 | 137 |
* (non-Javadoc) |
| 137 | 138 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
| 138 | 139 |
*/ |
| 139 | 140 |
@Override |
| 140 | 141 |
public Object execute(final ExecutionEvent event) throws ExecutionException {
|
| 141 |
|
|
| 142 |
|
|
| 142 | 143 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
| 143 |
|
|
| 144 |
|
|
| 144 | 145 |
Object s = selection.getFirstElement(); |
| 145 | 146 |
if (!(s instanceof MainCorpus)) {
|
| 146 | 147 |
Log.warning("Selection is not a corpus. Aborting.");
|
| 147 | 148 |
return null; |
| 148 | 149 |
} |
| 149 |
|
|
| 150 |
|
|
| 150 | 151 |
if (!ParametersDialog.open(this)) {
|
| 151 | 152 |
return null; |
| 152 | 153 |
} |
| 153 |
|
|
| 154 |
|
|
| 154 | 155 |
conlluResultDirectory.mkdirs(); |
| 155 | 156 |
if (conlluResultDirectory == null || !conlluResultDirectory.exists() || !conlluResultDirectory.isDirectory()) {
|
| 156 | 157 |
Log.warning("Error: conllu result directory does not exists: " + conlluResultDirectory);
|
| 157 | 158 |
return null; |
| 158 | 159 |
} |
| 159 |
|
|
| 160 |
|
|
| 160 | 161 |
CQPCorpus corpus = (CQPCorpus) s; |
| 161 | 162 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
| 162 |
|
|
| 163 |
|
|
| 163 | 164 |
try {
|
| 164 | 165 |
return exportAnnotationsAsCorpus(mainCorpus, conlluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations, |
| 165 | 166 |
defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName, |
| ... | ... | |
| 171 | 172 |
Log.warning(e); |
| 172 | 173 |
Log.printStackTrace(e); |
| 173 | 174 |
} |
| 174 |
|
|
| 175 |
|
|
| 175 | 176 |
return null; |
| 176 | 177 |
} |
| 177 |
|
|
| 178 |
|
|
| 178 | 179 |
/** |
| 179 | 180 |
* Exports the corpus to a directory of CoNLL-U files (one file per text) |
| 180 | 181 |
* |
| ... | ... | |
| 202 | 203 |
String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName, |
| 203 | 204 |
String defaultMiscPropertyName, |
| 204 | 205 |
boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) |
| 205 |
throws UnexpectedAnswerException, |
|
| 206 |
IOException, |
|
| 207 |
CqiServerError, |
|
| 208 |
CqiClientException, InvalidCqpIdException {
|
|
| 209 |
|
|
| 206 |
throws UnexpectedAnswerException,
|
|
| 207 |
IOException,
|
|
| 208 |
CqiServerError,
|
|
| 209 |
CqiClientException, InvalidCqpIdException {
|
|
| 210 |
|
|
| 210 | 211 |
if (!conlluResultDirectory.exists()) {
|
| 211 | 212 |
conlluResultDirectory.mkdirs(); |
| 212 | 213 |
} |
| 213 | 214 |
int numberOfWordsWritten = 0; |
| 214 | 215 |
int numberOfSentencesWritten = 0; |
| 215 | 216 |
int numberOfTextsWritten = 0; |
| 216 |
|
|
| 217 |
|
|
| 217 | 218 |
String[] textIds = mainCorpus.getCorpusTextIdsList(); |
| 218 | 219 |
int[] start_limits = mainCorpus.getTextStartLimits(); |
| 219 | 220 |
int[] end_limits = mainCorpus.getTextEndLimits(); |
| 220 |
|
|
| 221 |
|
|
| 221 | 222 |
String lang = mainCorpus.getLang(); |
| 222 | 223 |
// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang())); |
| 223 | 224 |
// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang())); |
| 224 |
|
|
| 225 |
|
|
| 225 | 226 |
for (String p : propNames) {
|
| 226 | 227 |
WordProperty wp = mainCorpus.getProperty(prefix + p); |
| 227 | 228 |
if (wp == null) {
|
| 228 |
Log.warning("Error: cannot find the Conllu property: " + prefix + p);
|
|
| 229 |
return 0; |
|
| 229 |
Log.warning("Warning: cannot find the Conllu property: " + prefix + p);
|
|
| 230 |
//return 0;
|
|
| 230 | 231 |
} |
| 231 | 232 |
} |
| 232 |
|
|
| 233 |
|
|
| 233 | 234 |
if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) {
|
| 234 | 235 |
Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true");
|
| 235 | 236 |
return 0; |
| 236 | 237 |
} |
| 237 |
|
|
| 238 |
|
|
| 238 | 239 |
for (int iText = 0; iText < start_limits.length; iText++) {
|
| 239 |
|
|
| 240 |
|
|
| 240 | 241 |
// Build corpus positions |
| 241 | 242 |
int[] positions = new int[end_limits[iText] - start_limits[iText] + 1]; |
| 242 | 243 |
int tmp = 0; |
| ... | ... | |
| 244 | 245 |
positions[tmp++] = n; |
| 245 | 246 |
} |
| 246 | 247 |
numberOfWordsWritten += positions.length; |
| 247 |
|
|
| 248 |
|
|
| 248 | 249 |
// Get UD properties |
| 249 | 250 |
WordProperty wp; |
| 250 | 251 |
wp = mainCorpus.getProperty(prefix + "id"); |
| ... | ... | |
| 259 | 260 |
} |
| 260 | 261 |
} |
| 261 | 262 |
tmpValues = null; |
| 262 |
|
|
| 263 |
WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form"); |
|
| 264 |
String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions); |
|
| 265 |
fixUNDEFValues(formValues); |
|
| 266 |
|
|
| 263 |
|
|
| 264 |
String[] emptyvalues = new String[positions.length]; |
|
| 265 |
for (int i = 0 ; i < emptyvalues.length ; i++) {
|
|
| 266 |
emptyvalues[i] = "_"; |
|
| 267 |
} |
|
| 268 |
|
|
| 269 |
wp = mainCorpus.getProperty(prefix + "form"); |
|
| 270 |
String[] formValues = null; |
|
| 271 |
if (wp != null) {
|
|
| 272 |
formValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 273 |
fixUNDEFValues(formValues); |
|
| 274 |
} else {
|
|
| 275 |
formValues = emptyvalues; |
|
| 276 |
} |
|
| 277 |
|
|
| 278 |
|
|
| 267 | 279 |
wp = mainCorpus.getProperty(prefix + "lemma"); |
| 268 |
String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 269 |
fixUNDEFValues(lemmaValues); |
|
| 270 |
|
|
| 280 |
String[] lemmaValues = null; |
|
| 281 |
if (wp != null) {
|
|
| 282 |
lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 283 |
fixUNDEFValues(lemmaValues); |
|
| 284 |
} else {
|
|
| 285 |
lemmaValues = emptyvalues; |
|
| 286 |
} |
|
| 287 |
|
|
| 271 | 288 |
wp = mainCorpus.getProperty(prefix + "upos"); |
| 272 |
String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 273 |
fixUNDEFValues(uposValues); |
|
| 274 |
|
|
| 289 |
String[] uposValues = null; |
|
| 290 |
if (wp != null) {
|
|
| 291 |
uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 292 |
fixUNDEFValues(uposValues); |
|
| 293 |
} else {
|
|
| 294 |
uposValues = emptyvalues; |
|
| 295 |
} |
|
| 296 |
|
|
| 275 | 297 |
wp = mainCorpus.getProperty(prefix + "xpos"); |
| 276 |
String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 277 |
fixUNDEFValues(xposValues); |
|
| 278 |
|
|
| 298 |
String[] xposValues = null; |
|
| 299 |
if (wp != null) {
|
|
| 300 |
xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 301 |
fixUNDEFValues(xposValues); |
|
| 302 |
} else {
|
|
| 303 |
xposValues = emptyvalues; |
|
| 304 |
} |
|
| 305 |
|
|
| 279 | 306 |
wp = mainCorpus.getProperty(prefix + "feats"); |
| 280 |
String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 281 |
fixUNDEFValues(featsValues); |
|
| 282 |
|
|
| 307 |
String[] featsValues = null; |
|
| 308 |
if (wp != null) {
|
|
| 309 |
featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 310 |
fixUNDEFValues(featsValues); |
|
| 311 |
} else {
|
|
| 312 |
featsValues = emptyvalues; |
|
| 313 |
} |
|
| 314 |
|
|
| 283 | 315 |
wp = mainCorpus.getProperty(prefix + "head"); |
| 284 | 316 |
// String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 285 | 317 |
tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| ... | ... | |
| 293 | 325 |
} |
| 294 | 326 |
} |
| 295 | 327 |
tmpValues = null; |
| 296 |
|
|
| 328 |
|
|
| 297 | 329 |
wp = mainCorpus.getProperty(prefix + "deprel"); |
| 298 |
String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 299 |
fixUNDEFValues(deprelValues); |
|
| 300 |
|
|
| 330 |
String[] deprelValues = null; |
|
| 331 |
if (wp != null) {
|
|
| 332 |
deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 333 |
fixUNDEFValues(deprelValues); |
|
| 334 |
} else {
|
|
| 335 |
deprelValues = emptyvalues; |
|
| 336 |
} |
|
| 337 |
|
|
| 301 | 338 |
wp = mainCorpus.getProperty(prefix + "deps"); |
| 302 |
String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 303 |
fixUNDEFValues(depsValues); |
|
| 304 |
|
|
| 339 |
String[] depsValues = null; |
|
| 340 |
if (wp != null) {
|
|
| 341 |
depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 342 |
fixUNDEFValues(depsValues); |
|
| 343 |
} else {
|
|
| 344 |
depsValues = emptyvalues; |
|
| 345 |
} |
|
| 346 |
|
|
| 305 | 347 |
wp = mainCorpus.getProperty(prefix + "misc"); |
| 306 |
String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 307 |
fixUNDEFValues(miscValues); |
|
| 308 |
|
|
| 348 |
String[] miscValues = null; |
|
| 349 |
if (wp != null) {
|
|
| 350 |
miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
|
| 351 |
fixUNDEFValues(miscValues); |
|
| 352 |
} else {
|
|
| 353 |
miscValues = emptyvalues; |
|
| 354 |
} |
|
| 355 |
|
|
| 309 | 356 |
HashSet<Integer> paragraphsStartPositions = new HashSet<>(); |
| 310 | 357 |
if (insertParagraphs) {
|
| 311 | 358 |
StructuralUnit p_struct = mainCorpus.getStructuralUnit("p");
|
| ... | ... | |
| 319 | 366 |
} |
| 320 | 367 |
} |
| 321 | 368 |
} |
| 322 |
|
|
| 369 |
|
|
| 323 | 370 |
HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"sentid"); |
| 324 | 371 |
HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"newdocid"); |
| 325 |
|
|
| 372 |
|
|
| 326 | 373 |
// build sentence, first pass using UD word sentence positions |
| 327 | 374 |
ArrayList<ArrayList<Integer>> sentences = new ArrayList<>(); |
| 328 | 375 |
ArrayList<Integer> tmpSentence = new ArrayList<>(); |
| ... | ... | |
| 331 | 378 |
// + featsValues[p] + " head=" |
| 332 | 379 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
| 333 | 380 |
if (sentidStartPositions.containsKey(p)) { // new ud sentence
|
| 334 |
|
|
| 381 |
|
|
| 335 | 382 |
if (tmpSentence.size() > 0) {
|
| 336 | 383 |
sentences.add(new ArrayList<>(tmpSentence)); |
| 337 | 384 |
} |
| 338 |
|
|
| 385 |
|
|
| 339 | 386 |
// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + "
|
| 340 | 387 |
// feats=" |
| 341 | 388 |
// + featsValues[p] + " head=" |
| 342 | 389 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
| 343 | 390 |
tmpSentence.clear(); |
| 344 | 391 |
} |
| 345 |
|
|
| 392 |
|
|
| 346 | 393 |
if (insertTokenWithoutUdAnnotations) {
|
| 347 | 394 |
tmpSentence.add(p); // insert all tokens |
| 348 | 395 |
} |
| 349 | 396 |
else if (idValues[p] != 0) {
|
| 350 | 397 |
tmpSentence.add(p); // insert all tokens |
| 351 | 398 |
} |
| 352 |
|
|
| 399 |
|
|
| 353 | 400 |
} |
| 354 | 401 |
positions = null; // free memory |
| 355 |
|
|
| 402 |
|
|
| 356 | 403 |
// fixing sentences |
| 357 | 404 |
for (int s = 0; s < sentences.size(); s++) {
|
| 358 |
|
|
| 405 |
|
|
| 359 | 406 |
// fix only ud sentences limits |
| 360 | 407 |
ArrayList<Integer> sentence = sentences.get(s); |
| 361 |
|
|
| 408 |
|
|
| 362 | 409 |
if (sentidStartPositions.get(sentence.get(0)) == null) {
|
| 363 | 410 |
continue; // this is not a UD sentence |
| 364 | 411 |
} |
| 365 |
|
|
| 412 |
|
|
| 366 | 413 |
int max = -1; |
| 367 | 414 |
int imax = 0; |
| 368 | 415 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| ... | ... | |
| 372 | 419 |
imax = ip; |
| 373 | 420 |
} |
| 374 | 421 |
} |
| 375 |
|
|
| 422 |
|
|
| 376 | 423 |
ArrayList<Integer> newSentence = new ArrayList<>(); |
| 377 | 424 |
for (int ip = imax + 1; ip < sentence.size(); ip++) {
|
| 378 | 425 |
newSentence.add(sentence.get(ip)); |
| ... | ... | |
| 388 | 435 |
sentences.add(s + 1, newSentence); |
| 389 | 436 |
} |
| 390 | 437 |
} |
| 391 |
|
|
| 438 |
|
|
| 392 | 439 |
if (tmpSentence.size() > 0) { // add last sentence
|
| 393 | 440 |
sentences.add(new ArrayList<>(tmpSentence)); |
| 394 | 441 |
} |
| 395 |
|
|
| 442 |
|
|
| 396 | 443 |
// fixing sentence __NULL__ ud properties |
| 397 | 444 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
| 398 | 445 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
| 399 |
|
|
| 446 |
|
|
| 400 | 447 |
int[] sentencePositions = new int[sentence.size()]; |
| 401 | 448 |
for (int p = 0; p < sentence.size(); p++) {
|
| 402 | 449 |
sentencePositions[p] = sentence.get(p); |
| 403 | 450 |
} |
| 404 |
|
|
| 451 |
|
|
| 405 | 452 |
// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" |
| 406 | 453 |
String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
|
| 407 |
|
|
| 454 |
|
|
| 408 | 455 |
String[] words = null; |
| 409 | 456 |
if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) {
|
| 410 | 457 |
words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions); |
| ... | ... | |
| 421 | 468 |
if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
|
| 422 | 469 |
xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions); |
| 423 | 470 |
} |
| 424 |
|
|
| 471 |
|
|
| 425 | 472 |
String[] feats = null; |
| 426 | 473 |
if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) {
|
| 427 | 474 |
feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions); |
| ... | ... | |
| 442 | 489 |
if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) {
|
| 443 | 490 |
miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions); |
| 444 | 491 |
} |
| 445 |
|
|
| 492 |
|
|
| 446 | 493 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions); |
| 447 | 494 |
// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions); |
| 448 | 495 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
| 449 | 496 |
// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
| 450 |
|
|
| 497 |
|
|
| 451 | 498 |
// fix ud properties using CQP values |
| 452 | 499 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 453 |
|
|
| 500 |
|
|
| 454 | 501 |
int p = sentence.get(ip); |
| 455 |
|
|
| 502 |
|
|
| 456 | 503 |
// new word |
| 457 | 504 |
if (miscValues[p].equals("_")) {
|
| 458 | 505 |
miscValues[p] = "XmlId=" + ids[ip]; |
| 459 | 506 |
} |
| 460 |
|
|
| 507 |
|
|
| 461 | 508 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
| 462 | 509 |
if (words != null && formValues[p].equals("_")) {
|
| 463 | 510 |
formValues[p] = words[ip]; |
| ... | ... | |
| 487 | 534 |
miscValues[p] = miscs[ip]; |
| 488 | 535 |
} |
| 489 | 536 |
} |
| 490 |
|
|
| 537 |
|
|
| 491 | 538 |
if (insertNoSpaceAfter) {
|
| 492 | 539 |
for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed
|
| 493 | 540 |
int p = sentence.get(ip); |
| ... | ... | |
| 503 | 550 |
} |
| 504 | 551 |
} |
| 505 | 552 |
} |
| 506 |
|
|
| 553 |
|
|
| 507 | 554 |
// fixing sentence punct limits |
| 508 | 555 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
|
| 509 | 556 |
// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
|
| ... | ... | |
| 516 | 563 |
int p2 = sentence.remove(sentence.size() - 1); |
| 517 | 564 |
sentences.get(iSentence + 1).add(0, p2); |
| 518 | 565 |
} |
| 519 |
|
|
| 566 |
|
|
| 520 | 567 |
if (sentence.size() == 0) { // sentence was depleted after fixing it
|
| 521 | 568 |
sentences.remove(iSentence); |
| 522 | 569 |
iSentence--; |
| 523 | 570 |
continue; |
| 524 | 571 |
} |
| 525 | 572 |
} |
| 526 |
|
|
| 573 |
|
|
| 527 | 574 |
for (int s = 0; s < sentences.size(); s++) {
|
| 528 |
|
|
| 575 |
|
|
| 529 | 576 |
// fix only ud sentences limits |
| 530 | 577 |
ArrayList<Integer> sentence = sentences.get(s); |
| 531 | 578 |
HashMap<Integer, Integer> oldToNewIds = new HashMap<>(); |
| 532 | 579 |
for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids
|
| 533 | 580 |
int p = sentence.get(ip); |
| 534 |
|
|
| 581 |
|
|
| 535 | 582 |
if (idValues[p] != 0) { // store "old id -> new id"
|
| 536 | 583 |
oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N |
| 537 | 584 |
} |
| 538 | 585 |
} |
| 539 |
|
|
| 586 |
|
|
| 540 | 587 |
// fixing head and set missing head to 0 and root |
| 541 | 588 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 542 | 589 |
int p = sentence.get(ip); |
| 543 |
|
|
| 590 |
|
|
| 544 | 591 |
// fixing id value |
| 545 | 592 |
idValues[p] = (ip + 1); // from 1 to N |
| 546 |
|
|
| 593 |
|
|
| 547 | 594 |
// fixing head values |
| 548 | 595 |
if (oldToNewIds.containsKey(headValues[p])) {
|
| 549 | 596 |
headValues[p] = oldToNewIds.get(headValues[p]); |
| ... | ... | |
| 555 | 602 |
} |
| 556 | 603 |
} |
| 557 | 604 |
} |
| 558 |
|
|
| 605 |
|
|
| 559 | 606 |
// writing sentences |
| 560 | 607 |
File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu"); |
| 561 | 608 |
PrintWriter writer = IOUtils.getWriter(resultConlluFile); |
| 562 |
|
|
| 609 |
|
|
| 563 | 610 |
int iParagraph = 1; |
| 564 |
|
|
| 611 |
|
|
| 565 | 612 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
| 566 | 613 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
| 567 |
|
|
| 614 |
|
|
| 568 | 615 |
int[] sentencePositions = new int[sentence.size()]; |
| 569 | 616 |
for (int p = 0; p < sentence.size(); p++) {
|
| 570 | 617 |
sentencePositions[p] = sentence.get(p); |
| 571 | 618 |
} |
| 572 |
|
|
| 619 |
|
|
| 573 | 620 |
String[] gap = null; |
| 574 | 621 |
if (detectGap && mainCorpus.getProperty("gap") != null) {
|
| 575 | 622 |
gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
|
| 576 | 623 |
} |
| 577 |
|
|
| 624 |
|
|
| 578 | 625 |
String[] tokens = new String[sentence.size()]; |
| 579 | 626 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 580 | 627 |
tokens[ip] = formValues[sentence.get(ip)]; |
| 581 | 628 |
} |
| 582 |
|
|
| 629 |
|
|
| 583 | 630 |
if (insertNoSpaceAfter) {
|
| 584 | 631 |
writer.println("# text = " + LangFormater.format(StringUtils.join(tokens, " "), mainCorpus.getLang()));
|
| 585 | 632 |
} |
| 586 | 633 |
else {
|
| 587 | 634 |
writer.println("# text = " + StringUtils.join(tokens, " "));
|
| 588 | 635 |
} |
| 589 |
|
|
| 636 |
|
|
| 590 | 637 |
if (newdocidStartPositions.containsKey(sentence.get(0))) {
|
| 591 | 638 |
writer.println("# newdoc id = " + newdocidStartPositions.get(sentence.get(0)));
|
| 592 | 639 |
} |
| 593 | 640 |
else {
|
| 594 | 641 |
writer.println("# newdoc id = " + textIds[iText]);
|
| 595 | 642 |
} |
| 596 |
|
|
| 643 |
|
|
| 597 | 644 |
boolean foundSentId = false; |
| 598 | 645 |
for (int ip : sentence) {
|
| 599 | 646 |
if (!foundSentId && sentidStartPositions.containsKey(ip)) {
|
| ... | ... | |
| 604 | 651 |
if (!foundSentId) { // no sent_id found
|
| 605 | 652 |
writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
|
| 606 | 653 |
} |
| 607 |
|
|
| 654 |
|
|
| 608 | 655 |
if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set
|
| 609 | 656 |
writer.println("# newpar id = " + iParagraph);
|
| 610 | 657 |
iParagraph++; |
| 611 | 658 |
} |
| 612 |
|
|
| 659 |
|
|
| 613 | 660 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 614 | 661 |
int p = sentence.get(ip); |
| 615 |
|
|
| 662 |
|
|
| 616 | 663 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
| 617 | 664 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p] |
| 618 | 665 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p] |
| 619 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]); |
|
| 620 |
|
|
| 666 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]);
|
|
| 667 |
|
|
| 621 | 668 |
if (gap != null && gap[ip].equals("next")) {
|
| 622 | 669 |
writer.println("# gap");
|
| 623 | 670 |
} |
| ... | ... | |
| 626 | 673 |
numberOfSentencesWritten++; |
| 627 | 674 |
} |
| 628 | 675 |
writer.close(); |
| 629 |
|
|
| 676 |
|
|
| 630 | 677 |
System.out.println(" Text done: " + resultConlluFile);
|
| 631 | 678 |
numberOfTextsWritten++; |
| 632 | 679 |
} |
| 633 |
|
|
| 680 |
|
|
| 634 | 681 |
System.out.println("# words written: " + numberOfWordsWritten);
|
| 635 | 682 |
System.out.println("# sentences written: " + numberOfSentencesWritten);
|
| 636 | 683 |
System.out.println("# texts written: " + numberOfTextsWritten);
|
| 637 |
|
|
| 684 |
|
|
| 638 | 685 |
return numberOfWordsWritten; |
| 639 | 686 |
} |
| 640 |
|
|
| 687 |
|
|
| 641 | 688 |
private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
|
| 642 |
|
|
| 689 |
|
|
| 643 | 690 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions); |
| 644 | 691 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
|
| 645 | 692 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
|
| 646 | 693 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1); |
| 647 | 694 |
} |
| 648 | 695 |
} |
| 649 |
|
|
| 696 |
|
|
| 650 | 697 |
return values; |
| 651 | 698 |
} |
| 652 |
|
|
| 699 |
|
|
| 653 | 700 |
private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
|
| 654 |
|
|
| 701 |
|
|
| 702 |
|
|
| 703 |
|
|
| 655 | 704 |
HashMap<Integer, String> sentidStartPositions = new HashMap<>(); |
| 656 |
int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+"); |
|
| 657 |
String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids); |
|
| 658 |
for (int iId = 0; iId < ids.length; iId++) {
|
|
| 659 |
int id = ids[iId]; |
|
| 660 |
int[] pp = CQPSearchEngine.getCqiClient().id2Cpos(mainCorpus.getProperty(property).getQualifiedName(), id); |
|
| 661 |
for (int p : pp) {
|
|
| 662 |
sentidStartPositions.put(p, strs[iId]); |
|
| 705 |
if (mainCorpus.getProperty(property) != null) {
|
|
| 706 |
int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+"); |
|
| 707 |
String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids); |
|
| 708 |
for (int iId = 0; iId < ids.length; iId++) {
|
|
| 709 |
int id = ids[iId]; |
|
| 710 |
int[] pp = CQPSearchEngine.getCqiClient().id2Cpos(mainCorpus.getProperty(property).getQualifiedName(), id); |
|
| 711 |
for (int p : pp) {
|
|
| 712 |
sentidStartPositions.put(p, strs[iId]); |
|
| 713 |
} |
|
| 663 | 714 |
} |
| 664 | 715 |
} |
| 665 |
|
|
| 666 | 716 |
return sentidStartPositions; |
| 667 | 717 |
} |
| 668 |
|
|
| 718 |
|
|
| 669 | 719 |
private static void fixUNDEFValues(String[] values) {
|
| 670 |
|
|
| 720 |
|
|
| 671 | 721 |
for (int i = 0; i < values.length; i++) {
|
| 672 | 722 |
if (values[i].equals("__UNDEF__") || values[i].equals("") || values[i].equals("|_|")) {
|
| 673 | 723 |
values[i] = "_"; |
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ToCoNLL2009.java (revision 3731) | ||
|---|---|---|
| 12 | 12 |
import org.txm.searchengine.cqp.corpus.CorpusManager; |
| 13 | 13 |
import org.txm.searchengine.cqp.corpus.Property; |
| 14 | 14 |
import org.txm.searchengine.cqp.corpus.StructuralUnit; |
| 15 |
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty; |
|
| 16 | 15 |
import org.txm.searchengine.cqp.corpus.WordProperty; |
| 17 | 16 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery; |
| 18 | 17 |
import org.txm.searchengine.cqp.corpus.query.Match; |
| 19 | 18 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
| 20 | 19 |
import org.txm.utils.ConsoleProgressBar; |
| 20 |
import org.txm.utils.logger.Log; |
|
| 21 | 21 |
|
| 22 |
import jline.internal.Log; |
|
| 23 |
|
|
| 24 | 22 |
public class ToCoNLL2009 {
|
| 25 | 23 |
|
| 26 | 24 |
boolean debug = false; |
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsCoNLLU.java (revision 3731) | ||
|---|---|---|
| 42 | 42 |
import org.eclipse.osgi.util.NLS; |
| 43 | 43 |
import org.eclipse.ui.handlers.HandlerUtil; |
| 44 | 44 |
import org.kohsuke.args4j.Option; |
| 45 |
import org.txm.conllu.core.function.ImportCoNLLUAnnotations; |
|
| 45 | 46 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
| 46 | 47 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
| 47 | 48 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
| ... | ... | |
| 63 | 64 |
* @author mdecorde. |
| 64 | 65 |
*/ |
| 65 | 66 |
public class ExportCorpusAsCoNLLU extends AbstractHandler {
|
| 66 |
|
|
| 67 |
|
|
| 67 | 68 |
public static final String ID = ExportCorpusAsCoNLLU.class.getName(); |
| 68 |
|
|
| 69 |
|
|
| 69 | 70 |
@Option(name="outputDirectory", usage="an example file", widget="Folder", required=true, def="outputDirectory") |
| 70 | 71 |
File outputDirectory; |
| 71 |
|
|
| 72 |
|
|
| 72 | 73 |
@Option(name="encoding", usage="sentenceProperty", widget="String", required=true, def="UTF-8") |
| 73 | 74 |
String encoding = "UTF-8"; |
| 74 | 75 |
|
| 75 |
@Option(name="sentenceStructure", usage="sentenceProperty", widget="String", required=true, def="s")
|
|
| 76 |
@Option(name="sentenceStructure", usage="sentenceProperty", widget="String", required=true, def="s") |
|
| 76 | 77 |
String sentenceStructure; |
| 77 |
|
|
| 78 |
|
|
| 78 | 79 |
@Option(name="posProperty", usage="if set posProperty used to fill the UPOS ud property", widget="String", required=true, def="frpos") |
| 79 | 80 |
String posProperty; |
| 80 |
|
|
| 81 |
|
|
| 81 | 82 |
@Option(name="lemmaProperty", usage="if set lemmaProperty used to fill the LEMMA ud property", widget="String", required=true, def="frlemma") |
| 82 | 83 |
String lemmaProperty; |
| 83 |
|
|
| 84 |
|
|
| 84 | 85 |
/** |
| 85 | 86 |
* the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix |
| 86 | 87 |
*/ |
| 87 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
| 88 |
|
|
| 88 |
public static String[] propNames = ImportCoNLLUAnnotations.UD_PROPERTY_NAMES;
|
|
| 89 |
|
|
| 89 | 90 |
/* |
| 90 | 91 |
* (non-Javadoc) |
| 91 | 92 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
| 92 | 93 |
*/ |
| 93 | 94 |
@Override |
| 94 | 95 |
public Object execute(final ExecutionEvent event) throws ExecutionException {
|
| 95 |
|
|
| 96 |
|
|
| 96 | 97 |
IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event); |
| 97 |
|
|
| 98 |
|
|
| 98 | 99 |
Object s = selection.getFirstElement(); |
| 99 | 100 |
if (!(s instanceof MainCorpus)) {
|
| 100 | 101 |
Log.warning("Selection is not a corpus. Aborting.");
|
| 101 | 102 |
return null; |
| 102 | 103 |
} |
| 103 |
|
|
| 104 |
|
|
| 104 | 105 |
if (!ParametersDialog.open(this)) {
|
| 105 | 106 |
return null; |
| 106 | 107 |
} |
| 107 |
|
|
| 108 |
|
|
| 108 | 109 |
outputDirectory.mkdirs(); |
| 109 | 110 |
if (outputDirectory == null || !outputDirectory.exists() || !outputDirectory.isDirectory()) {
|
| 110 | 111 |
Log.warning("Error: conllu result directory does not exists: " + outputDirectory);
|
| 111 | 112 |
return null; |
| 112 | 113 |
} |
| 113 |
|
|
| 114 |
|
|
| 114 | 115 |
CQPCorpus corpus = (CQPCorpus) s; |
| 115 | 116 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
| 116 |
|
|
| 117 |
|
|
| 117 | 118 |
return exportCorpus(mainCorpus, outputDirectory, sentenceStructure, lemmaProperty, posProperty, encoding); |
| 118 |
|
|
| 119 |
|
|
| 119 | 120 |
} |
| 120 |
|
|
| 121 |
|
|
| 121 | 122 |
public static boolean exportCorpus(MainCorpus mainCorpus, File outputDirectory, String sentenceStructure, String lemmaProperty, String posProperty, String encoding) {
|
| 122 | 123 |
try {
|
| 123 | 124 |
return new ToCoNLL2009().process(outputDirectory, mainCorpus, mainCorpus.getStructuralUnit(sentenceStructure), mainCorpus.getProperty("word"), mainCorpus.getProperty(lemmaProperty), mainCorpus.getProperty(posProperty), encoding);
|
| 124 |
|
|
| 125 |
|
|
| 125 | 126 |
} catch (Exception e) {
|
| 126 | 127 |
Log.warning(e); |
| 127 | 128 |
Log.printStackTrace(e); |
| ... | ... | |
| 156 | 157 |
String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName, |
| 157 | 158 |
String defaultMiscPropertyName, |
| 158 | 159 |
boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) |
| 159 |
throws UnexpectedAnswerException, |
|
| 160 |
IOException, |
|
| 161 |
CqiServerError, |
|
| 162 |
CqiClientException, InvalidCqpIdException {
|
|
| 163 |
|
|
| 160 |
throws UnexpectedAnswerException,
|
|
| 161 |
IOException,
|
|
| 162 |
CqiServerError,
|
|
| 163 |
CqiClientException, InvalidCqpIdException {
|
|
| 164 |
|
|
| 164 | 165 |
if (!conlluResultDirectory.exists()) {
|
| 165 | 166 |
conlluResultDirectory.mkdirs(); |
| 166 | 167 |
} |
| 167 | 168 |
int numberOfWordsWritten = 0; |
| 168 | 169 |
int numberOfSentencesWritten = 0; |
| 169 | 170 |
int numberOfTextsWritten = 0; |
| 170 |
|
|
| 171 |
|
|
| 171 | 172 |
String[] textIds = mainCorpus.getCorpusTextIdsList(); |
| 172 | 173 |
int[] start_limits = mainCorpus.getTextStartLimits(); |
| 173 | 174 |
int[] end_limits = mainCorpus.getTextEndLimits(); |
| 174 |
|
|
| 175 |
|
|
| 175 | 176 |
String lang = mainCorpus.getLang(); |
| 176 | 177 |
// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang())); |
| 177 | 178 |
// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang())); |
| 178 |
|
|
| 179 |
|
|
| 179 | 180 |
for (String p : propNames) {
|
| 180 | 181 |
WordProperty wp = mainCorpus.getProperty(prefix + p); |
| 181 | 182 |
if (wp == null) {
|
| 182 |
Log.warning("Error: cannot find the Conllu property: " + prefix + p);
|
|
| 183 |
Log.warning("Error: cannot find the CoNLLU property: " + prefix + p);
|
|
| 183 | 184 |
return 0; |
| 184 | 185 |
} |
| 185 | 186 |
} |
| 186 |
|
|
| 187 |
|
|
| 187 | 188 |
if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) {
|
| 188 | 189 |
Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true");
|
| 189 | 190 |
return 0; |
| 190 | 191 |
} |
| 191 |
|
|
| 192 |
|
|
| 192 | 193 |
for (int iText = 0; iText < start_limits.length; iText++) {
|
| 193 |
|
|
| 194 |
|
|
| 194 | 195 |
// Build corpus positions |
| 195 | 196 |
int[] positions = new int[end_limits[iText] - start_limits[iText] + 1]; |
| 196 | 197 |
int tmp = 0; |
| ... | ... | |
| 198 | 199 |
positions[tmp++] = n; |
| 199 | 200 |
} |
| 200 | 201 |
numberOfWordsWritten += positions.length; |
| 201 |
|
|
| 202 |
|
|
| 202 | 203 |
// Get UD properties |
| 203 | 204 |
WordProperty wp; |
| 204 | 205 |
wp = mainCorpus.getProperty(prefix + "id"); |
| ... | ... | |
| 213 | 214 |
} |
| 214 | 215 |
} |
| 215 | 216 |
tmpValues = null; |
| 216 |
|
|
| 217 |
|
|
| 217 | 218 |
WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form"); |
| 218 | 219 |
String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions); |
| 219 | 220 |
fixUNDEFValues(formValues); |
| 220 |
|
|
| 221 |
|
|
| 221 | 222 |
wp = mainCorpus.getProperty(prefix + "lemma"); |
| 222 | 223 |
String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 223 | 224 |
fixUNDEFValues(lemmaValues); |
| 224 |
|
|
| 225 |
|
|
| 225 | 226 |
wp = mainCorpus.getProperty(prefix + "upos"); |
| 226 | 227 |
String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 227 | 228 |
fixUNDEFValues(uposValues); |
| 228 |
|
|
| 229 |
|
|
| 229 | 230 |
wp = mainCorpus.getProperty(prefix + "xpos"); |
| 230 | 231 |
String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 231 | 232 |
fixUNDEFValues(xposValues); |
| 232 |
|
|
| 233 |
|
|
| 233 | 234 |
wp = mainCorpus.getProperty(prefix + "feats"); |
| 234 | 235 |
String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 235 | 236 |
fixUNDEFValues(featsValues); |
| 236 |
|
|
| 237 |
|
|
| 237 | 238 |
wp = mainCorpus.getProperty(prefix + "head"); |
| 238 | 239 |
// String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 239 | 240 |
tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| ... | ... | |
| 247 | 248 |
} |
| 248 | 249 |
} |
| 249 | 250 |
tmpValues = null; |
| 250 |
|
|
| 251 |
|
|
| 251 | 252 |
wp = mainCorpus.getProperty(prefix + "deprel"); |
| 252 | 253 |
String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 253 | 254 |
fixUNDEFValues(deprelValues); |
| 254 |
|
|
| 255 |
|
|
| 255 | 256 |
wp = mainCorpus.getProperty(prefix + "deps"); |
| 256 | 257 |
String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 257 | 258 |
fixUNDEFValues(depsValues); |
| 258 |
|
|
| 259 |
|
|
| 259 | 260 |
wp = mainCorpus.getProperty(prefix + "misc"); |
| 260 | 261 |
String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions); |
| 261 | 262 |
fixUNDEFValues(miscValues); |
| 262 |
|
|
| 263 |
|
|
| 263 | 264 |
HashSet<Integer> paragraphsStartPositions = new HashSet<>(); |
| 264 | 265 |
if (insertParagraphs) {
|
| 265 | 266 |
StructuralUnit p_struct = mainCorpus.getStructuralUnit("p");
|
| ... | ... | |
| 273 | 274 |
} |
| 274 | 275 |
} |
| 275 | 276 |
} |
| 276 |
|
|
| 277 |
|
|
| 277 | 278 |
HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"sentid"); |
| 278 | 279 |
HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"newdocid"); |
| 279 |
|
|
| 280 |
|
|
| 280 | 281 |
// build sentence, first pass using UD word sentence positions |
| 281 | 282 |
ArrayList<ArrayList<Integer>> sentences = new ArrayList<>(); |
| 282 | 283 |
ArrayList<Integer> tmpSentence = new ArrayList<>(); |
| 283 | 284 |
for (int p = 0; p < positions.length; p++) {
|
| 284 |
// System.out.println("p=" + p + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " feats="
|
|
| 285 |
// + featsValues[p] + " head=" |
|
| 286 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
|
| 287 | 285 |
if (sentidStartPositions.containsKey(p)) { // new ud sentence
|
| 288 |
|
|
| 286 |
|
|
| 289 | 287 |
if (tmpSentence.size() > 0) {
|
| 290 | 288 |
sentences.add(new ArrayList<>(tmpSentence)); |
| 291 | 289 |
} |
| 292 |
|
|
| 293 |
// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + "
|
|
| 294 |
// feats=" |
|
| 295 |
// + featsValues[p] + " head=" |
|
| 296 |
// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]); |
|
| 290 |
|
|
| 297 | 291 |
tmpSentence.clear(); |
| 298 | 292 |
} |
| 299 |
|
|
| 293 |
|
|
| 300 | 294 |
if (insertTokenWithoutUdAnnotations) {
|
| 301 | 295 |
tmpSentence.add(p); // insert all tokens |
| 302 | 296 |
} |
| 303 | 297 |
else if (idValues[p] != 0) {
|
| 304 | 298 |
tmpSentence.add(p); // insert all tokens |
| 305 | 299 |
} |
| 306 |
|
|
| 300 |
|
|
| 307 | 301 |
} |
| 308 | 302 |
positions = null; // free memory |
| 309 |
|
|
| 303 |
|
|
| 310 | 304 |
// fixing sentences |
| 311 | 305 |
for (int s = 0; s < sentences.size(); s++) {
|
| 312 |
|
|
| 306 |
|
|
| 313 | 307 |
// fix only ud sentences limits |
| 314 | 308 |
ArrayList<Integer> sentence = sentences.get(s); |
| 315 |
|
|
| 309 |
|
|
| 316 | 310 |
if (sentidStartPositions.get(sentence.get(0)) == null) {
|
| 317 | 311 |
continue; // this is not a UD sentence |
| 318 | 312 |
} |
| 319 |
|
|
| 313 |
|
|
| 320 | 314 |
int max = -1; |
| 321 | 315 |
int imax = 0; |
| 322 | 316 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| ... | ... | |
| 326 | 320 |
imax = ip; |
| 327 | 321 |
} |
| 328 | 322 |
} |
| 329 |
|
|
| 323 |
|
|
| 330 | 324 |
ArrayList<Integer> newSentence = new ArrayList<>(); |
| 331 | 325 |
for (int ip = imax + 1; ip < sentence.size(); ip++) {
|
| 332 | 326 |
newSentence.add(sentence.get(ip)); |
| ... | ... | |
| 342 | 336 |
sentences.add(s + 1, newSentence); |
| 343 | 337 |
} |
| 344 | 338 |
} |
| 345 |
|
|
| 339 |
|
|
| 346 | 340 |
if (tmpSentence.size() > 0) { // add last sentence
|
| 347 | 341 |
sentences.add(new ArrayList<>(tmpSentence)); |
| 348 | 342 |
} |
| 349 |
|
|
| 343 |
|
|
| 350 | 344 |
// fixing sentence __NULL__ ud properties |
| 351 | 345 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
| 352 | 346 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
| 353 |
|
|
| 347 |
|
|
| 354 | 348 |
int[] sentencePositions = new int[sentence.size()]; |
| 355 | 349 |
for (int p = 0; p < sentence.size(); p++) {
|
| 356 | 350 |
sentencePositions[p] = sentence.get(p); |
| 357 | 351 |
} |
| 358 |
|
|
| 352 |
|
|
| 359 | 353 |
// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" |
| 360 | 354 |
String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
|
| 361 |
|
|
| 355 |
|
|
| 362 | 356 |
String[] words = null; |
| 363 | 357 |
if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) {
|
| 364 | 358 |
words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions); |
| ... | ... | |
| 375 | 369 |
if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
|
| 376 | 370 |
xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions); |
| 377 | 371 |
} |
| 378 |
|
|
| 372 |
|
|
| 379 | 373 |
String[] feats = null; |
| 380 | 374 |
if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) {
|
| 381 | 375 |
feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions); |
| ... | ... | |
| 396 | 390 |
if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) {
|
| 397 | 391 |
miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions); |
| 398 | 392 |
} |
| 399 |
|
|
| 393 |
|
|
| 400 | 394 |
// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions); |
| 401 | 395 |
// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions); |
| 402 | 396 |
// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
| 403 | 397 |
// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions); |
| 404 |
|
|
| 398 |
|
|
| 405 | 399 |
// fix ud properties using CQP values |
| 406 | 400 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 407 |
|
|
| 401 |
|
|
| 408 | 402 |
int p = sentence.get(ip); |
| 409 |
|
|
| 403 |
|
|
| 410 | 404 |
// new word |
| 411 | 405 |
if (miscValues[p].equals("_")) {
|
| 412 | 406 |
miscValues[p] = "XmlId=" + ids[ip]; |
| 413 | 407 |
} |
| 414 |
|
|
| 408 |
|
|
| 415 | 409 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
| 416 | 410 |
if (words != null && formValues[p].equals("_")) {
|
| 417 | 411 |
formValues[p] = words[ip]; |
| ... | ... | |
| 441 | 435 |
miscValues[p] = miscs[ip]; |
| 442 | 436 |
} |
| 443 | 437 |
} |
| 444 |
|
|
| 438 |
|
|
| 445 | 439 |
if (insertNoSpaceAfter) {
|
| 446 | 440 |
for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed
|
| 447 | 441 |
int p = sentence.get(ip); |
| ... | ... | |
| 457 | 451 |
} |
| 458 | 452 |
} |
| 459 | 453 |
} |
| 460 |
|
|
| 454 |
|
|
| 461 | 455 |
// fixing sentence punct limits |
| 462 | 456 |
while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
|
| 463 | 457 |
// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
|
| ... | ... | |
| 470 | 464 |
int p2 = sentence.remove(sentence.size() - 1); |
| 471 | 465 |
sentences.get(iSentence + 1).add(0, p2); |
| 472 | 466 |
} |
| 473 |
|
|
| 467 |
|
|
| 474 | 468 |
if (sentence.size() == 0) { // sentence was depleted after fixing it
|
| 475 | 469 |
sentences.remove(iSentence); |
| 476 | 470 |
iSentence--; |
| 477 | 471 |
continue; |
| 478 | 472 |
} |
| 479 | 473 |
} |
| 480 |
|
|
| 474 |
|
|
| 481 | 475 |
for (int s = 0; s < sentences.size(); s++) {
|
| 482 |
|
|
| 476 |
|
|
| 483 | 477 |
// fix only ud sentences limits |
| 484 | 478 |
ArrayList<Integer> sentence = sentences.get(s); |
| 485 | 479 |
HashMap<Integer, Integer> oldToNewIds = new HashMap<>(); |
| 486 | 480 |
for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids
|
| 487 | 481 |
int p = sentence.get(ip); |
| 488 |
|
|
| 482 |
|
|
| 489 | 483 |
if (idValues[p] != 0) { // store "old id -> new id"
|
| 490 | 484 |
oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N |
| 491 | 485 |
} |
| 492 | 486 |
} |
| 493 |
|
|
| 487 |
|
|
| 494 | 488 |
// fixing head and set missing head to 0 and root |
| 495 | 489 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 496 | 490 |
int p = sentence.get(ip); |
| 497 |
|
|
| 491 |
|
|
| 498 | 492 |
// fixing id value |
| 499 | 493 |
idValues[p] = (ip + 1); // from 1 to N |
| 500 |
|
|
| 494 |
|
|
| 501 | 495 |
// fixing head values |
| 502 | 496 |
if (oldToNewIds.containsKey(headValues[p])) {
|
| 503 | 497 |
headValues[p] = oldToNewIds.get(headValues[p]); |
| ... | ... | |
| 509 | 503 |
} |
| 510 | 504 |
} |
| 511 | 505 |
} |
| 512 |
|
|
| 506 |
|
|
| 513 | 507 |
// writing sentences |
| 514 | 508 |
File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu"); |
| 515 | 509 |
PrintWriter writer = IOUtils.getWriter(resultConlluFile); |
| 516 |
|
|
| 510 |
|
|
| 517 | 511 |
int iParagraph = 1; |
| 518 |
|
|
| 512 |
|
|
| 519 | 513 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
| 520 | 514 |
ArrayList<Integer> sentence = sentences.get(iSentence); |
| 521 |
|
|
| 515 |
|
|
| 522 | 516 |
int[] sentencePositions = new int[sentence.size()]; |
| 523 | 517 |
for (int p = 0; p < sentence.size(); p++) {
|
| 524 | 518 |
sentencePositions[p] = sentence.get(p); |
| 525 | 519 |
} |
| 526 |
|
|
| 520 |
|
|
| 527 | 521 |
String[] gap = null; |
| 528 | 522 |
if (detectGap && mainCorpus.getProperty("gap") != null) {
|
| 529 | 523 |
gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
|
| 530 | 524 |
} |
| 531 |
|
|
| 525 |
|
|
| 532 | 526 |
String[] tokens = new String[sentence.size()]; |
| 533 | 527 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 534 | 528 |
tokens[ip] = formValues[sentence.get(ip)]; |
| 535 | 529 |
} |
| 536 |
|
|
| 530 |
|
|
| 537 | 531 |
if (insertNoSpaceAfter) {
|
| 538 | 532 |
writer.println("# text = " + LangFormater.format(StringUtils.join(tokens, " "), mainCorpus.getLang()));
|
| 539 | 533 |
} |
| 540 | 534 |
else {
|
| 541 | 535 |
writer.println("# text = " + StringUtils.join(tokens, " "));
|
| 542 | 536 |
} |
| 543 |
|
|
| 537 |
|
|
| 544 | 538 |
if (newdocidStartPositions.containsKey(sentence.get(0))) {
|
| 545 | 539 |
writer.println("# newdoc id = " + newdocidStartPositions.get(sentence.get(0)));
|
| 546 | 540 |
} |
| 547 | 541 |
else {
|
| 548 | 542 |
writer.println("# newdoc id = " + textIds[iText]);
|
| 549 | 543 |
} |
| 550 |
|
|
| 544 |
|
|
| 551 | 545 |
boolean foundSentId = false; |
| 552 | 546 |
for (int ip : sentence) {
|
| 553 | 547 |
if (!foundSentId && sentidStartPositions.containsKey(ip)) {
|
| ... | ... | |
| 558 | 552 |
if (!foundSentId) { // no sent_id found
|
| 559 | 553 |
writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
|
| 560 | 554 |
} |
| 561 |
|
|
| 555 |
|
|
| 562 | 556 |
if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set
|
| 563 | 557 |
writer.println("# newpar id = " + iParagraph);
|
| 564 | 558 |
iParagraph++; |
| 565 | 559 |
} |
| 566 |
|
|
| 560 |
|
|
| 567 | 561 |
for (int ip = 0; ip < sentence.size(); ip++) {
|
| 568 | 562 |
int p = sentence.get(ip); |
| 569 |
|
|
| 563 |
|
|
| 570 | 564 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
| 571 | 565 |
writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p] |
| 572 | 566 |
+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p] |
| 573 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]); |
|
| 574 |
|
|
| 567 |
+ "\t" + depsValues[p] + "\t" + miscValues[p]);
|
|
| 568 |
|
|
| 575 | 569 |
if (gap != null && gap[ip].equals("next")) {
|
| 576 | 570 |
writer.println("# gap");
|
| 577 | 571 |
} |
| ... | ... | |
| 580 | 574 |
numberOfSentencesWritten++; |
| 581 | 575 |
} |
| 582 | 576 |
writer.close(); |
| 583 |
|
|
| 577 |
|
|
| 584 | 578 |
System.out.println(" Text done: " + resultConlluFile);
|
| 585 | 579 |
numberOfTextsWritten++; |
| 586 | 580 |
} |
| 587 |
|
|
| 581 |
|
|
| 588 | 582 |
System.out.println("# words written: " + numberOfWordsWritten);
|
| 589 | 583 |
System.out.println("# sentences written: " + numberOfSentencesWritten);
|
| 590 | 584 |
System.out.println("# texts written: " + numberOfTextsWritten);
|
| 591 |
|
|
| 585 |
|
|
| 592 | 586 |
return numberOfWordsWritten; |
| 593 | 587 |
} |
| 594 |
|
|
| 588 |
|
|
| 595 | 589 |
private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
|
| 596 |
|
|
| 597 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
|
|
| 598 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
|
|
| 599 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
|
|
| 600 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1);
|
|
| 590 |
|
|
| 591 |
if (mainCorpus.getProperty(property) != null) {
|
|
| 592 |
String[] emptyvalues = new String[positions.length];
|
|
| 593 |
for (int i = 0 ; i < emptyvalues.length ; i++) {
|
|
| 594 |
emptyvalues[i] = "_";
|
|
| 601 | 595 |
} |
| 596 |
return emptyvalues; |
|
| 597 |
} else {
|
|
| 598 |
String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions); |
|
| 599 |
for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
|
|
| 600 |
if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
|
|
| 601 |
values[iupos] = values[iupos].substring(1, values[iupos].length() - 1); |
|
| 602 |
} |
|
| 603 |
} |
|
| 604 |
|
|
| 605 |
return values; |
|
| 602 | 606 |
} |
| 603 |
|
|
| 604 |
return values; |
|
| 605 | 607 |
} |
| 606 |
|
|
| 608 |
|
|
| 607 | 609 |
private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
|
| 608 |
|
|
| 610 |
|
|
| 609 | 611 |
HashMap<Integer, String> sentidStartPositions = new HashMap<>(); |
| 610 | 612 |
int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+"); |
| 611 | 613 |
String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids); |
| ... | ... | |
| 616 | 618 |
sentidStartPositions.put(p, strs[iId]); |
| 617 | 619 |
} |
| 618 | 620 |
} |
| 619 |
|
|
| 621 |
|
|
| 620 | 622 |
return sentidStartPositions; |
| 621 | 623 |
} |
| 622 |
|
|
| 624 |
|
|
| 623 | 625 |
private static void fixUNDEFValues(String[] values) {
|
| 624 |
|
|
| 626 |
|
|
| 625 | 627 |
for (int i = 0; i < values.length; i++) {
|
| 626 | 628 |
if (values[i].equals("__UNDEF__") || values[i].equals("") || values[i].equals("|_|")) {
|
| 627 | 629 |
values[i] = "_"; |
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ImportCoNLLUAnnotationsFromDirectory.java (revision 3731) | ||
|---|---|---|
| 189 | 189 |
return 0; |
| 190 | 190 |
} |
| 191 | 191 |
|
| 192 |
mainCorpus.getProject().appendToHistory("CoNLL-U annotations imported from "+conlluDirectory+" : "+nTextProcessed+" texts and "+nWordsInserted+" words processed.");
|
|
| 193 |
|
|
| 192 | 194 |
Log.info("XML-TXM source files updated. Updating indexes...");
|
| 193 | 195 |
|
| 194 | 196 |
UDPreferences.getInstance().setProjectPreferenceValue(mainCorpus.getProject(), UDPreferences.UDPREFIX, propertiesPrefix); |
| ... | ... | |
| 226 | 228 |
return 0; |
| 227 | 229 |
} |
| 228 | 230 |
|
| 231 |
mainCorpus.getProject().appendToHistory("CoNLL-U annotations imported from "+conlluFile+" texts and "+nWordsInserted+" words processed.");
|
|
| 232 |
|
|
| 229 | 233 |
Log.info("XML-TXM source files updated. Updating indexes...");
|
| 230 | 234 |
|
| 231 | 235 |
UDPreferences.getInstance().setProjectPreferenceValue(mainCorpus.getProject(), UDPreferences.UDPREFIX, propertiesPrefix); |
| TXM/trunk/bundles/org.txm.annotation.kr.rcp/src/org/txm/annotation/kr/rcp/commands/ImportTable.java (revision 3731) | ||
|---|---|---|
| 130 | 130 |
} |
| 131 | 131 |
monitor.worked(30); |
| 132 | 132 |
|
| 133 |
corpus.getProject().appendToHistory("CQP Annotations imported from "+annotationsFile);
|
|
| 133 | 134 |
Log.info("Done.");
|
Formats disponibles : Unified diff