Révision 4001
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 4001) | ||
|---|---|---|
| 3 | 3 |
import org.eclipse.swt.SWT; |
| 4 | 4 |
import org.eclipse.swt.widgets.Button; |
| 5 | 5 |
import org.eclipse.swt.widgets.Composite; |
| 6 |
import org.eclipse.swt.widgets.Group; |
|
| 6 | 7 |
import org.eclipse.swt.widgets.Label; |
| 7 | 8 |
import org.eclipse.swt.widgets.Text; |
| 8 | 9 |
import org.eclipse.ui.forms.events.ExpansionAdapter; |
| ... | ... | |
| 16 | 17 |
import org.txm.objects.Project; |
| 17 | 18 |
import org.txm.rcp.editors.imports.ImportFormEditor; |
| 18 | 19 |
import org.txm.rcp.editors.imports.sections.ImportEditorSection; |
| 20 |
import org.txm.rcp.swt.widget.RadioGroup; |
|
| 19 | 21 |
|
| 20 | 22 |
public class CoNLLUSection extends ImportEditorSection {
|
| 21 | 23 |
|
| ... | ... | |
| 25 | 27 |
|
| 26 | 28 |
Button buildTIGERIndexesButton; |
| 27 | 29 |
Button useNewDocIdButton; |
| 28 |
Button keepWordContractionsButton;
|
|
| 30 |
RadioGroup keepWordContractionsButton;
|
|
| 29 | 31 |
Text udPropertiesPrefixButton; |
| 30 | 32 |
private Text headPropertiesText; |
| 31 | 33 |
private Text depsPropertiesText; |
| ... | ... | |
| 71 | 73 |
gdata2.colspan = 4; // one line |
| 72 | 74 |
useNewDocIdButton.setLayoutData(gdata2); |
| 73 | 75 |
|
| 74 |
keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK); |
|
| 76 |
keepWordContractionsButton = new RadioGroup(sectionClient, SWT.NONE, "Multiwords management", new String[][] {{UDPreferences.MULTIWORDS, "Multiwords"}, {UDPreferences.TOKENS, "Tokens"}, {UDPreferences.ALL, "All"}});//toolkit.create toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK);
|
|
| 77 |
keepWordContractionsButton.setToolTipText("Multiwords: only the multiwords are indexed, Tokens: Only the multiwords tokens are indexed, All: both are indexed");
|
|
| 75 | 78 |
gdata2 = getButtonLayoutData(); |
| 76 | 79 |
gdata2.colspan = 4; // one line |
| 77 | 80 |
keepWordContractionsButton.setLayoutData(gdata2); |
| ... | ... | |
| 114 | 117 |
|
| 115 | 118 |
buildTIGERIndexesButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES))); //$NON-NLS-1$ |
| 116 | 119 |
useNewDocIdButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID))); //$NON-NLS-1$ |
| 117 |
keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS))); //$NON-NLS-1$
|
|
| 120 |
keepWordContractionsButton.setSelection(customNode.get(UDPreferences.MULTIWORDS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.MULTIWORDS_MANAGEMENT))); //$NON-NLS-1$
|
|
| 118 | 121 |
udPropertiesPrefixButton.setText(customNode.get(UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX))); //$NON-NLS-1$ |
| 119 | 122 |
headPropertiesText.setText(customNode.get(UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT))); //$NON-NLS-1$ |
| 120 | 123 |
depsPropertiesText.setText(customNode.get(UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT))); //$NON-NLS-1$ |
| ... | ... | |
| 127 | 130 |
Preferences customNode = project.getPreferencesScope().getNode(UDPreferences.getInstance().getPreferencesNodeQualifier()); |
| 128 | 131 |
customNode.putBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, buildTIGERIndexesButton.getSelection()); |
| 129 | 132 |
customNode.putBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, useNewDocIdButton.getSelection()); |
| 130 |
customNode.putBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, keepWordContractionsButton.getSelection());
|
|
| 133 |
customNode.put(UDPreferences.MULTIWORDS_MANAGEMENT, keepWordContractionsButton.getSelection());
|
|
| 131 | 134 |
customNode.put(UDPreferences.UDPREFIX, udPropertiesPrefixButton.getText()); |
| 132 | 135 |
customNode.put(UDPreferences.IMPORT_HEAD_TO_PROJECT, headPropertiesText.getText()); |
| 133 | 136 |
customNode.put(UDPreferences.IMPORT_DEPS_TO_PROJECT, depsPropertiesText.getText()); |
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/preferences/CoNLLUPreferencePage.java (revision 4001) | ||
|---|---|---|
| 26 | 26 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_USE_NEW_DOC_ID, "Use new odc id when importing CoNLL-U files", this.getFieldEditorParent())); |
| 27 | 27 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, "Build TIGERSearch indexes as well", this.getFieldEditorParent())); |
| 28 | 28 |
|
| 29 |
this.addField(new BooleanFieldEditor(UDPreferences.KEEP_MULTIWORD_TOKENS, "Keep multiword tokens when importing CoNLL-U files", this.getFieldEditorParent()));
|
|
| 29 |
// MULTIWORDS_MANAGEMENT holds one of the string values UDPreferences.MULTIWORDS / TOKENS / ALL,
// so it needs a choice editor: a boolean editor could only ever store "true" or "false".
// Entries are {label, stored value}. The class is fully qualified to avoid touching the import block.
this.addField(new org.eclipse.jface.preference.ComboFieldEditor(UDPreferences.MULTIWORDS_MANAGEMENT, "Multiwords management", new String[][] {{"Multiwords", UDPreferences.MULTIWORDS}, {"Tokens", UDPreferences.TOKENS}, {"All", UDPreferences.ALL}}, this.getFieldEditorParent()));
|
|
| 30 | 30 |
udPrefixField = new StringFieldEditor(UDPreferences.UDPREFIX, "UD properties prefix", this.getFieldEditorParent()); |
| 31 | 31 |
this.addField(udPrefixField); |
| 32 | 32 |
this.addField(new StringFieldEditor(UDPreferences.IMPORT_HEAD_TO_PROJECT, "UD head properties to project (comma separated list)", this.getFieldEditorParent())); |
| TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 4001) | ||
|---|---|---|
| 10 | 10 |
import java.nio.channels.FileChannel; |
| 11 | 11 |
import java.util.ArrayList; |
| 12 | 12 |
import java.util.HashMap; |
| 13 |
import java.util.LinkedHashMap; |
|
| 13 | 14 |
import java.util.LinkedHashSet; |
| 14 | 15 |
import java.util.List; |
| 15 | 16 |
|
| ... | ... | |
| 61 | 62 |
HashMap<CorpusBuild, TSCorpus> corpora = null; |
| 62 | 63 |
|
| 63 | 64 |
public TSCorpus getTSCorpus(CorpusBuild corpus) {
|
| 64 |
|
|
| 65 |
|
|
| 65 | 66 |
CorpusBuild root = corpus.getRootCorpusBuild(); |
| 66 | 67 |
TSCorpus tscorpus = corpora.get(root); |
| 67 | 68 |
if (tscorpus != null) {
|
| ... | ... | |
| 184 | 185 |
|
| 185 | 186 |
TSCorpus tcorpus = this.getTSCorpus(corpus); |
| 186 | 187 |
TSResult result = queryTIGER(corpus, query); |
| 187 |
|
|
| 188 |
|
|
| 188 | 189 |
if (result == null) { // no result
|
| 189 | 190 |
return new EmptySelection(query); |
| 190 | 191 |
} |
| 191 | 192 |
|
| 192 | 193 |
return fromTIGERMatchToSelection(tcorpus, result, query, corpus); |
| 193 | 194 |
} |
| 194 |
|
|
| 195 |
|
|
| 195 | 196 |
public TSResult queryTIGER(CorpusBuild corpus, IQuery query) throws Exception {
|
| 196 | 197 |
|
| 197 | 198 |
TSCorpus tcorpus = this.getTSCorpus(corpus); |
| ... | ... | |
| 207 | 208 |
return result; |
| 208 | 209 |
} |
| 209 | 210 |
int[] cpos = { matches.get(0).getStart(), matches.get(matches.size() - 1).getEnd() };
|
| 210 |
|
|
| 211 |
|
|
| 211 | 212 |
//apply shifts on start and end positions |
| 212 | 213 |
MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped(); |
| 213 | 214 |
if (offsetsMapped != null) {
|
| 214 | 215 |
cpos[0] -= offsetsMapped.getInt(cpos[0] * Integer.BYTES); |
| 215 | 216 |
cpos[1] -= offsetsMapped.getInt(cpos[1] * Integer.BYTES); |
| 216 | 217 |
} |
| 217 |
|
|
| 218 |
|
|
| 218 | 219 |
int[] starts = tcorpus.getSentenceStartPositions(); |
| 219 | 220 |
int sent_max = starts.length; |
| 220 | 221 |
int sent_min = 0; |
| ... | ... | |
| 226 | 227 |
break; |
| 227 | 228 |
} |
| 228 | 229 |
} |
| 229 |
|
|
| 230 |
|
|
| 230 | 231 |
Log.finest("QUERYING sentences: " + sent_min + " -> " + sent_max); //$NON-NLS-1$ //$NON-NLS-2$
|
| 231 | 232 |
result = tcorpus.query(query.getQueryString().replace("\n", " "), sent_min, sent_max, -1); //$NON-NLS-1$ //$NON-NLS-2$
|
| 232 | 233 |
} |
| ... | ... | |
| 308 | 309 |
|
| 309 | 310 |
return new TIGERSelection(query, result2); |
| 310 | 311 |
} |
| 311 |
|
|
| 312 |
|
|
| 312 | 313 |
@Override |
| 313 | 314 |
public Query newQuery() {
|
| 314 | 315 |
return new TIGERQuery(); |
| ... | ... | |
| 373 | 374 |
|
| 374 | 375 |
PrintWriter writer = IOUtils.getWriter(logprop, "UTF-8"); //$NON-NLS-1$ |
| 375 | 376 |
writer.println("# Default log configuration of the TIGERSearch suite"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
| 376 |
"log4j.rootLogger=SEVERE,Logfile"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
| 377 |
"log4j.rootLogger=INFO,Logfile"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
| 377 | 378 |
"log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=SEVERE"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
| 378 | 379 |
"log4j.appender.Logfile=org.apache.log4j.RollingFileAppender"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
| 379 | 380 |
"log4j.appender.Logfile.File="+logprop.getAbsolutePath()+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
| ... | ... | |
| 381 | 382 |
"log4j.appender.Logfile.MaxBackupIndex=1"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
| 382 | 383 |
"log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
| 383 | 384 |
"log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"); //$NON-NLS-1$
|
| 384 |
|
|
| 385 |
writer.close(); |
|
| 385 | 386 |
BasicConfigurator.configure(); |
| 386 | 387 |
|
| 387 | 388 |
String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME); |
| 388 | 389 |
|
| 389 | 390 |
File master = new File(sourceDirectory, driverFilename); |
| 390 | 391 |
if (!master.exists()) {
|
| 391 |
// File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() {
|
|
| 392 |
// public boolean accept(File file) {
|
|
| 393 |
// if (file.isDirectory()) return false; |
|
| 394 |
// if (file.isHidden()) return false; |
|
| 395 |
// String filename = file.getName(); |
|
| 396 |
// if (filename.equals("import.xml")) return false;
|
|
| 397 |
// if (!filename.endsWith(".xml")) return false;
|
|
| 398 |
// |
|
| 399 |
// return true; |
|
| 400 |
// } |
|
| 401 |
// }); |
|
| 402 |
// |
|
| 403 |
// if (xmlFiles == null) {
|
|
| 404 |
System.out.println("No master file found in: "+sourceDirectory);
|
|
| 405 |
return false;
|
|
| 406 |
// } |
|
| 407 |
// master = xmlFiles[0]; |
|
| 392 |
// File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() {
|
|
| 393 |
// public boolean accept(File file) {
|
|
| 394 |
// if (file.isDirectory()) return false;
|
|
| 395 |
// if (file.isHidden()) return false;
|
|
| 396 |
// String filename = file.getName();
|
|
| 397 |
// if (filename.equals("import.xml")) return false;
|
|
| 398 |
// if (!filename.endsWith(".xml")) return false;
|
|
| 399 |
//
|
|
| 400 |
// return true;
|
|
| 401 |
// }
|
|
| 402 |
// });
|
|
| 403 |
//
|
|
| 404 |
// if (xmlFiles == null) {
|
|
| 405 |
System.out.println("No master file found in: "+sourceDirectory);
|
|
| 406 |
return false; |
|
| 407 |
// }
|
|
| 408 |
// master = xmlFiles[0];
|
|
| 408 | 409 |
} |
| 409 | 410 |
String uri = master.getAbsolutePath(); // TIGER corpus source root file |
| 410 | 411 |
File tigerBinDir = new File(tigerDir, corpusName); |
| 411 | 412 |
tigerBinDir.mkdir(); |
| 412 | 413 |
try {
|
| 414 |
LinkedHashMap<String, ArrayList<String>> errorsPerTIGERSubcorpus = new LinkedHashMap<>(); |
|
| 413 | 415 |
IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath()) {
|
| 414 |
public void setMessage(String message) { }
|
|
| 416 |
String currentSubcorpus = ""; |
|
| 417 |
public void setMessage(String message) {
|
|
| 418 |
if (message.startsWith("Reading subcorpus ")) {
|
|
| 419 |
currentSubcorpus = message.substring("Reading subcorpus ".length());
|
|
| 420 |
} |
|
| 421 |
} |
|
| 422 |
public void newSentenceError(String sid, String message) {
|
|
| 423 |
if (!errorsPerTIGERSubcorpus.containsKey(currentSubcorpus)) errorsPerTIGERSubcorpus.put(currentSubcorpus, new ArrayList<String>()); |
|
| 424 |
errorsPerTIGERSubcorpus.get(currentSubcorpus).add("Error at "+sid+": "+message);
|
|
| 425 |
} |
|
| 426 |
public void newSentenceWarning(String sid, String message) {
|
|
| 427 |
if (!errorsPerTIGERSubcorpus.containsKey(currentSubcorpus)) errorsPerTIGERSubcorpus.put(currentSubcorpus, new ArrayList<String>()); |
|
| 428 |
errorsPerTIGERSubcorpus.get(currentSubcorpus).add("Warning at "+sid+": "+message);
|
|
| 429 |
} |
|
| 415 | 430 |
public void setNumberOfSentences(int number) { }
|
| 416 | 431 |
public void setProgressBar(int value) { }
|
| 417 | 432 |
}; |
| 433 |
|
|
| 418 | 434 |
XMLIndexing indexing = new XMLIndexing(corpusName, uri, tigerBinDir.getAbsolutePath(), handler, false); |
| 419 | 435 |
indexing.startIndexing(); |
| 420 |
File logs = new File(tigerBinDir, "indexing.log"); //$NON-NLS-1$ |
|
| 421 |
String txt = IOUtils.getText(logs); |
|
| 422 |
if (txt.contains("Error in corpus graph ")) {
|
|
| 423 |
Log.warning("Error while importing TIGER corpus: "+txt);
|
|
| 424 |
return false; |
|
| 436 |
if (errorsPerTIGERSubcorpus.size() > 0) {
|
|
| 437 |
for (String s : errorsPerTIGERSubcorpus.keySet()) {
|
|
| 438 |
Log.info("In "+s);
|
|
| 439 |
for (String s2 : errorsPerTIGERSubcorpus.get(s)) {
|
|
| 440 |
Log.info(" "+s2);
|
|
| 441 |
|
|
| 442 |
} |
|
| 443 |
} |
|
| 425 | 444 |
} |
| 426 | 445 |
} |
| 427 | 446 |
catch (Exception e) {
|
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4001) | ||
|---|---|---|
| 30 | 30 |
super(module) |
| 31 | 31 |
} |
| 32 | 32 |
|
| 33 |
public final String merge (String s1, String s2) {
|
|
| 34 |
if (s1 == s2) return s1 |
|
| 35 |
if (s1 == "") return s2 |
|
| 36 |
if (s2 == "") return s1 |
|
| 37 |
if (s1 == "_") return s2 |
|
| 38 |
if (s2 == "_") return s1 |
|
| 39 |
if (s1 == "" && s2 == "") return "_" |
|
| 40 |
if (s1 == "_" && s2 == "_") return "_" |
|
| 41 |
return s1+"|"+s2 |
|
| 33 |
/**
 * Computes the value of one column of a multiword token from the values found
 * in the same column of the words it covers.
 *
 * @param orig the value currently held by the multiword token line; returned unchanged
 *             when no word brings a value
 * @param sss the values of the covered words, in order
 * @return orig when every word value is "" or "_"; the shared value when all the
 *         word values are identical; otherwise all the word values joined with " + "
 */
public final String merge(String orig, def sss) {

	// "" and "_" both mean "no value": if nothing else is present, keep the original
	boolean hasValue = sss?.any { it != "" && it != "_" }
	if (!hasValue) {
		return orig
	}

	// identical values collapse to a single occurrence
	def distinct = new HashSet(sss)
	return (distinct.size() == 1) ? distinct.join(" + ") : sss.join(" + ")
}
| 43 | 48 |
|
| 44 | 49 |
@Override |
| ... | ... | |
| 53 | 58 |
conlluSrcDirectory.deleteDir() |
| 54 | 59 |
conlluSrcDirectory.mkdirs() |
| 55 | 60 |
|
| 56 |
println "Split CoNLL-U files per text..." |
|
| 57 | 61 |
if (!splitCoNLLUFiles(inputDirectory, conlluSrcDirectory, project)) {
|
| 58 | 62 |
return |
| 59 | 63 |
} |
| ... | ... | |
| 63 | 67 |
files.sort() |
| 64 | 68 |
|
| 65 | 69 |
// Keep or not contractions |
| 66 |
boolean keepContractions = "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getString(UDPreferences.KEEP_MULTIWORD_TOKENS)));
|
|
| 70 |
String keepContractions = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.MULTIWORDS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.MULTIWORDS_MANAGEMENT));
|
|
| 67 | 71 |
|
| 68 |
println "Remove multiword tokens (keepContractions=$keepContractions) & add XmlId if necessary"
|
|
| 72 |
println "Multiwords managment ($keepContractions) & add XmlId if necessary & remove empty nodes"
|
|
| 69 | 73 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
| 70 | 74 |
for (File conlluFile : files) {
|
| 71 | 75 |
cpb_texts.tick() |
| ... | ... | |
| 85 | 89 |
} else {
|
| 86 | 90 |
split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++) |
| 87 | 91 |
} |
| 88 |
|
|
| 89 | 92 |
} |
| 90 | 93 |
|
| 91 |
if (!keepContractions && split[0] != null && split[0].contains("-")) {
|
|
| 92 |
println "REMOVE" |
|
| 93 |
println "\t"+lines[i] |
|
| 94 |
println "\t"+lines[i+1] |
|
| 95 |
println "\t"+lines[i+2] |
|
| 96 |
|
|
| 97 |
def split1 = lines[i+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 98 |
def split2 = lines[i+2].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 99 |
|
|
| 100 |
if (split[2] == "_") split[2] = merge(split1[2],split2[2]) |
|
| 101 |
if (split[3] == "_") split[3] = merge(split1[3], split2[3]) |
|
| 102 |
if (split[4] == "_") split[4] = merge(split1[4], split2[4]) |
|
| 103 |
if (split[5] == "_") split[5] = merge(split1[5], split2[5]) |
|
| 104 |
if (split[6] == "_") split[6] = split1[6] |
|
| 105 |
if (split[7] == "_") split[7] = merge(split1[7], split2[7]) |
|
| 106 |
if (split[8] == "_") split[8] = merge(split1[8], split2[8]) |
|
| 107 |
|
|
| 108 |
lines.remove(i+1) |
|
| 109 |
lines.remove(i+1) |
|
| 94 |
if (split[0].contains(".")) {
|
|
| 95 |
//println "REMOVE EMPTY NODE: $split : "+ |
|
| 96 |
lines.remove(i) |
|
| 97 |
i-- |
|
| 98 |
continue; // next ! |
|
| 110 | 99 |
} |
| 111 | 100 |
|
| 112 |
lines[i] = split.join("\t")
|
|
| 101 |
if (keepContractions == UDPreferences.ALL) {
|
|
| 102 |
// ok on fait rien |
|
| 103 |
} else if (keepContractions == UDPreferences.TOKENS) {
|
|
| 104 |
if (split[0].contains("-")) {
|
|
| 105 |
//println "REMOVE - $split" |
|
| 106 |
lines.remove(i) |
|
| 107 |
i-- |
|
| 108 |
continue; /// next ! |
|
| 109 |
} |
|
| 110 |
} else if (keepContractions == UDPreferences.MULTIWORDS) {
|
|
| 111 |
if (split[0].contains("-")) {
	// The ID of a multiword token line is a range "first-last" (e.g. "3-4").
	// Both bounds MUST be parsed before split[0] is overwritten: once the ID is
	// reduced to "first" there is no "-" left and the upper bound would be read as "first".
	String range = split[0]
	int dash = range.indexOf("-")
	int n1 = Integer.parseInt(range.substring(0, dash))
	int n2 = Integer.parseInt(range.substring(dash + 1))
	int n = 1 + n2 - n1 // number of words covered by the multiword token
	split[0] = ""+n1

	// Gather the columns of the n covered words. Empty nodes (ID containing ".")
	// met on the way are removed from the file and do not count as a word.
	def splits = []
	for (int j = 1 ; j <= n && i + j < lines.size() ; j++) {
		def tmp = lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
		if (tmp[0].contains(".")) {
			lines.remove(i+j)
			j-- // stay on the same offset: the next line has shifted into it
			continue // next token
		}
		splits << tmp
	}

	// columns 2 to 7 of the multiword line are rebuilt from the covered words
	for (int j = 2 ; j < 8 ; j++) {
		split[j] = merge(split[j], splits.collect(){it[j]})
	}

	// keep track of the original word forms in the last column
	if (split[9].length() > 0) split[9] += "|"
	split[9] += "expand="+splits.collect(){it[1]}.join("_")

	// The covered words now sit right after the multiword line (empty nodes are
	// already gone): drop exactly the ones that were merged.
	for (int j = 0 ; j < splits.size() ; j++) {
		lines.remove(i+1)
	}
}
|
| 144 |
} |
|
| 145 |
|
|
| 146 |
lines[i] = split.join("\t") // rebuild the line
|
|
| 113 | 147 |
} |
| 114 |
IOUtils.write(conlluFile, lines.join("\n") + "\n")
|
|
| 148 |
IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
|
|
| 115 | 149 |
} |
| 116 | 150 |
} |
| 117 | 151 |
cpb_texts.done() |
| ... | ... | |
| 161 | 195 |
} |
| 162 | 196 |
|
| 163 | 197 |
public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) {
|
| 164 |
def files = inputDirectory.listFiles() |
|
| 198 |
def files = inputDirectory.listFiles(new FilenameFilter() {
|
|
| 199 |
boolean accept(File dir, String name) {
|
|
| 200 |
return name.toLowerCase().endsWith(".conllu")
|
|
| 201 |
} |
|
| 202 |
}); |
|
| 165 | 203 |
|
| 166 | 204 |
if (files == null) {
|
| 167 | 205 |
println "Aborting. No CONLL file found in $inputDirectory." |
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl (revision 4001) | ||
|---|---|---|
| 268 | 268 |
$commentlines++; |
| 269 | 269 |
next; |
| 270 | 270 |
} |
| 271 |
# Added by AL for contractions |
|
| 272 |
elsif ( $words[$w] =~ /^\d+-\d+/ ) {
|
|
| 273 |
|
|
| 274 |
# print LOG "Contraction line loop 1: $words[$w]\n"; |
|
| 275 |
$commentlines++; |
|
| 276 |
|
|
| 277 |
# $contractions++; |
|
| 278 |
next; |
|
| 279 |
} |
|
| 271 |
# # Added by AL for contractions
|
|
| 272 |
# elsif ( $words[$w] =~ /^\d+-\d+/ ) {
|
|
| 273 |
# |
|
| 274 |
# # print LOG "Contraction line loop 1: $words[$w]\n";
|
|
| 275 |
# $commentlines++;
|
|
| 276 |
# |
|
| 277 |
# # $contractions++;
|
|
| 278 |
# next;
|
|
| 279 |
# }
|
|
| 280 | 280 |
else {
|
| 281 | 281 |
if ( defined($opt_c) ) {
|
| 282 | 282 |
$words[$w] =~ s/coord(\d+)-//g; |
| ... | ... | |
| 437 | 437 |
next; |
| 438 | 438 |
} |
| 439 | 439 |
|
| 440 |
#Added AL for contractions |
|
| 441 |
if ( $words[$i] =~ /^\d+-\d+/ ) {
|
|
| 440 |
# #Added AL for contractions |
|
| 441 |
# if ( $words[$i] =~ /^\d+-\d+/ ) {
|
|
| 442 |
# |
|
| 443 |
# # print LOG "Contraction loop 2 : $words[$i]\n"; |
|
| 444 |
# next; |
|
| 445 |
# } |
|
| 442 | 446 |
|
| 443 |
# print LOG "Contraction loop 2 : $words[$i]\n"; |
|
| 444 |
next; |
|
| 445 |
} |
|
| 446 |
|
|
| 447 | 447 |
else {
|
| 448 | 448 |
|
| 449 | 449 |
@cols = split( /\t/, $words[$i] ); |
| ... | ... | |
| 731 | 731 |
<value name="case:pred">case:pred</value> <!-- Welsh --> |
| 732 | 732 |
<value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic --> |
| 733 | 733 |
<value name="case">case marking</value> |
| 734 |
<value name="case2">case2 marking</value> |
|
| 734 | 735 |
<value name="cc:nc">cc:nc</value> <!-- Old French --> |
| 735 | 736 |
<value name="cc:nc">Coordinated conjunct : non coordonant</value> |
| 736 | 737 |
<value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish --> |
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/CallUD2TigerPerlScript.java (revision 4001) | ||
|---|---|---|
| 41 | 41 |
|
| 42 | 42 |
File mainFile = new File(tigerXMLDirectory, "main.xml"); |
| 43 | 43 |
File perlScript = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "conll2tiger-ud.pl");
|
| 44 |
|
|
| 44 |
System.out.println("PERL: "+perlScript);
|
|
| 45 | 45 |
DeleteDir.deleteDirectory(tigerXMLDirectory); |
| 46 | 46 |
//println "clean&mkdir $tigerXMLDirectory" |
| 47 | 47 |
tigerXMLDirectory.mkdir(); |
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/preferences/UDPreferences.java (revision 4001) | ||
|---|---|---|
| 17 | 17 |
|
| 18 | 18 |
public static String UDPREFIX = "ud_prefix"; |
| 19 | 19 |
|
| 20 |
public static String KEEP_MULTIWORD_TOKENS = "import_KEEP_MULTIWORD_TOKENS";
|
|
| 20 |
public static String MULTIWORDS_MANAGEMENT = "import_MULTIWORDS_MANAGEMENT";
|
|
| 21 | 21 |
|
| 22 |
public static String MULTIWORDS = "multiwords"; |
|
| 23 |
|
|
| 24 |
public static String TOKENS = "tokens"; |
|
| 25 |
|
|
| 26 |
public static String ALL = "all"; |
|
| 27 |
|
|
| 22 | 28 |
public static String IMPORT_USE_NEW_DOC_ID = "import_use_new_doc_id"; |
| 23 | 29 |
|
| 24 | 30 |
public static String IMPORT_HEAD_TO_PROJECT = "import_head_to_project"; |
| ... | ... | |
| 48 | 54 |
preferences.put(UDPREFIX, "ud-"); |
| 49 | 55 |
preferences.put(IMPORT_HEAD_TO_PROJECT, "upos,deprel"); |
| 50 | 56 |
preferences.put(IMPORT_DEPS_TO_PROJECT, "upos,deprel"); |
| 51 |
preferences.putBoolean(KEEP_MULTIWORD_TOKENS, false);
|
|
| 57 |
preferences.put(MULTIWORDS_MANAGEMENT, TOKENS); // boths, multiwords, tokens
|
|
| 52 | 58 |
preferences.putBoolean(IMPORT_USE_NEW_DOC_ID, true); |
| 53 | 59 |
preferences.putBoolean(IMPORT_BUILD_TIGERSEARCH_INDEXES, true); |
| 54 | 60 |
} |
| TXM/trunk/bundles/org.txm.specificities.rcp/src/org/txm/specificities/rcp/editors/SpecificitiesEditor.java (revision 4001) | ||
|---|---|---|
| 160 | 160 |
|
| 161 | 161 |
// change statistical computing |
| 162 | 162 |
if (TBXPreferences.getInstance().getBoolean(TBXPreferences.EXPERT_USER)) {
|
| 163 |
// modeText = new Text(extendedParametersArea, SWT.BORDER); // not yet ready
|
|
| 163 |
modeText = new Text(extendedParametersArea, SWT.BORDER); // not yet ready |
|
| 164 | 164 |
} |
| 165 | 165 |
|
| 166 | 166 |
// Result area |
Formats disponibles : Unified diff