Révision 4001
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 4001) | ||
---|---|---|
3 | 3 |
import org.eclipse.swt.SWT; |
4 | 4 |
import org.eclipse.swt.widgets.Button; |
5 | 5 |
import org.eclipse.swt.widgets.Composite; |
6 |
import org.eclipse.swt.widgets.Group; |
|
6 | 7 |
import org.eclipse.swt.widgets.Label; |
7 | 8 |
import org.eclipse.swt.widgets.Text; |
8 | 9 |
import org.eclipse.ui.forms.events.ExpansionAdapter; |
... | ... | |
16 | 17 |
import org.txm.objects.Project; |
17 | 18 |
import org.txm.rcp.editors.imports.ImportFormEditor; |
18 | 19 |
import org.txm.rcp.editors.imports.sections.ImportEditorSection; |
20 |
import org.txm.rcp.swt.widget.RadioGroup; |
|
19 | 21 |
|
20 | 22 |
public class CoNLLUSection extends ImportEditorSection { |
21 | 23 |
|
... | ... | |
25 | 27 |
|
26 | 28 |
Button buildTIGERIndexesButton; |
27 | 29 |
Button useNewDocIdButton; |
28 |
Button keepWordContractionsButton;
|
|
30 |
RadioGroup keepWordContractionsButton;
|
|
29 | 31 |
Text udPropertiesPrefixButton; |
30 | 32 |
private Text headPropertiesText; |
31 | 33 |
private Text depsPropertiesText; |
... | ... | |
71 | 73 |
gdata2.colspan = 4; // one line |
72 | 74 |
useNewDocIdButton.setLayoutData(gdata2); |
73 | 75 |
|
74 |
keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK); |
|
76 |
keepWordContractionsButton = new RadioGroup(sectionClient, SWT.NONE, "Multiwords management", new String[][] {{UDPreferences.MULTIWORDS, "Multiwords"}, {UDPreferences.TOKENS, "Tokens"}, {UDPreferences.ALL, "All"}});//toolkit.create toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK); |
|
77 |
keepWordContractionsButton.setToolTipText("Multiwords: only the multiwords are indexed, Tokens: Only the multiwords tokens are indexed, All: both are indexed"); |
|
75 | 78 |
gdata2 = getButtonLayoutData(); |
76 | 79 |
gdata2.colspan = 4; // one line |
77 | 80 |
keepWordContractionsButton.setLayoutData(gdata2); |
... | ... | |
114 | 117 |
|
115 | 118 |
buildTIGERIndexesButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES))); //$NON-NLS-1$ |
116 | 119 |
useNewDocIdButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID))); //$NON-NLS-1$ |
117 |
keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS))); //$NON-NLS-1$
|
|
120 |
keepWordContractionsButton.setSelection(customNode.get(UDPreferences.MULTIWORDS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.MULTIWORDS_MANAGEMENT))); //$NON-NLS-1$
|
|
118 | 121 |
udPropertiesPrefixButton.setText(customNode.get(UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX))); //$NON-NLS-1$ |
119 | 122 |
headPropertiesText.setText(customNode.get(UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT))); //$NON-NLS-1$ |
120 | 123 |
depsPropertiesText.setText(customNode.get(UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT))); //$NON-NLS-1$ |
... | ... | |
127 | 130 |
Preferences customNode = project.getPreferencesScope().getNode(UDPreferences.getInstance().getPreferencesNodeQualifier()); |
128 | 131 |
customNode.putBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, buildTIGERIndexesButton.getSelection()); |
129 | 132 |
customNode.putBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, useNewDocIdButton.getSelection()); |
130 |
customNode.putBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, keepWordContractionsButton.getSelection());
|
|
133 |
customNode.put(UDPreferences.MULTIWORDS_MANAGEMENT, keepWordContractionsButton.getSelection());
|
|
131 | 134 |
customNode.put(UDPreferences.UDPREFIX, udPropertiesPrefixButton.getText()); |
132 | 135 |
customNode.put(UDPreferences.IMPORT_HEAD_TO_PROJECT, headPropertiesText.getText()); |
133 | 136 |
customNode.put(UDPreferences.IMPORT_DEPS_TO_PROJECT, depsPropertiesText.getText()); |
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/preferences/CoNLLUPreferencePage.java (revision 4001) | ||
---|---|---|
26 | 26 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_USE_NEW_DOC_ID, "Use new odc id when importing CoNLL-U files", this.getFieldEditorParent())); |
27 | 27 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, "Build TIGERSearch indexes as well", this.getFieldEditorParent())); |
28 | 28 |
|
29 |
this.addField(new BooleanFieldEditor(UDPreferences.KEEP_MULTIWORD_TOKENS, "Keep multiword tokens when importing CoNLL-U files", this.getFieldEditorParent()));
|
|
29 |
this.addField(new BooleanFieldEditor(UDPreferences.MULTIWORDS_MANAGEMENT, "Keep multiword tokens when importing CoNLL-U files", this.getFieldEditorParent()));
|
|
30 | 30 |
udPrefixField = new StringFieldEditor(UDPreferences.UDPREFIX, "UD properties prefix", this.getFieldEditorParent()); |
31 | 31 |
this.addField(udPrefixField); |
32 | 32 |
this.addField(new StringFieldEditor(UDPreferences.IMPORT_HEAD_TO_PROJECT, "UD head properties to project (comma separated list)", this.getFieldEditorParent())); |
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 4001) | ||
---|---|---|
10 | 10 |
import java.nio.channels.FileChannel; |
11 | 11 |
import java.util.ArrayList; |
12 | 12 |
import java.util.HashMap; |
13 |
import java.util.LinkedHashMap; |
|
13 | 14 |
import java.util.LinkedHashSet; |
14 | 15 |
import java.util.List; |
15 | 16 |
|
... | ... | |
61 | 62 |
HashMap<CorpusBuild, TSCorpus> corpora = null; |
62 | 63 |
|
63 | 64 |
public TSCorpus getTSCorpus(CorpusBuild corpus) { |
64 |
|
|
65 |
|
|
65 | 66 |
CorpusBuild root = corpus.getRootCorpusBuild(); |
66 | 67 |
TSCorpus tscorpus = corpora.get(root); |
67 | 68 |
if (tscorpus != null) { |
... | ... | |
184 | 185 |
|
185 | 186 |
TSCorpus tcorpus = this.getTSCorpus(corpus); |
186 | 187 |
TSResult result = queryTIGER(corpus, query); |
187 |
|
|
188 |
|
|
188 | 189 |
if (result == null) { // no result |
189 | 190 |
return new EmptySelection(query); |
190 | 191 |
} |
191 | 192 |
|
192 | 193 |
return fromTIGERMatchToSelection(tcorpus, result, query, corpus); |
193 | 194 |
} |
194 |
|
|
195 |
|
|
195 | 196 |
public TSResult queryTIGER(CorpusBuild corpus, IQuery query) throws Exception { |
196 | 197 |
|
197 | 198 |
TSCorpus tcorpus = this.getTSCorpus(corpus); |
... | ... | |
207 | 208 |
return result; |
208 | 209 |
} |
209 | 210 |
int[] cpos = { matches.get(0).getStart(), matches.get(matches.size() - 1).getEnd() }; |
210 |
|
|
211 |
|
|
211 | 212 |
//apply shifts on start and end positions |
212 | 213 |
MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped(); |
213 | 214 |
if (offsetsMapped != null) { |
214 | 215 |
cpos[0] -= offsetsMapped.getInt(cpos[0] * Integer.BYTES); |
215 | 216 |
cpos[1] -= offsetsMapped.getInt(cpos[1] * Integer.BYTES); |
216 | 217 |
} |
217 |
|
|
218 |
|
|
218 | 219 |
int[] starts = tcorpus.getSentenceStartPositions(); |
219 | 220 |
int sent_max = starts.length; |
220 | 221 |
int sent_min = 0; |
... | ... | |
226 | 227 |
break; |
227 | 228 |
} |
228 | 229 |
} |
229 |
|
|
230 |
|
|
230 | 231 |
Log.finest("QUERYING sentences: " + sent_min + " -> " + sent_max); //$NON-NLS-1$ //$NON-NLS-2$ |
231 | 232 |
result = tcorpus.query(query.getQueryString().replace("\n", " "), sent_min, sent_max, -1); //$NON-NLS-1$ //$NON-NLS-2$ |
232 | 233 |
} |
... | ... | |
308 | 309 |
|
309 | 310 |
return new TIGERSelection(query, result2); |
310 | 311 |
} |
311 |
|
|
312 |
|
|
312 | 313 |
@Override |
313 | 314 |
public Query newQuery() { |
314 | 315 |
return new TIGERQuery(); |
... | ... | |
373 | 374 |
|
374 | 375 |
PrintWriter writer = IOUtils.getWriter(logprop, "UTF-8"); //$NON-NLS-1$ |
375 | 376 |
writer.println("# Default log configuration of the TIGERSearch suite"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
376 |
"log4j.rootLogger=SEVERE,Logfile"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
377 |
"log4j.rootLogger=INFO,Logfile"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
377 | 378 |
"log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=SEVERE"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
378 | 379 |
"log4j.appender.Logfile=org.apache.log4j.RollingFileAppender"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
379 | 380 |
"log4j.appender.Logfile.File="+logprop.getAbsolutePath()+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
... | ... | |
381 | 382 |
"log4j.appender.Logfile.MaxBackupIndex=1"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
382 | 383 |
"log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$ |
383 | 384 |
"log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"); //$NON-NLS-1$ |
384 |
|
|
385 |
writer.close(); |
|
385 | 386 |
BasicConfigurator.configure(); |
386 | 387 |
|
387 | 388 |
String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME); |
388 | 389 |
|
389 | 390 |
File master = new File(sourceDirectory, driverFilename); |
390 | 391 |
if (!master.exists()) { |
391 |
// File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() { |
|
392 |
// public boolean accept(File file) { |
|
393 |
// if (file.isDirectory()) return false; |
|
394 |
// if (file.isHidden()) return false; |
|
395 |
// String filename = file.getName(); |
|
396 |
// if (filename.equals("import.xml")) return false; |
|
397 |
// if (!filename.endsWith(".xml")) return false; |
|
398 |
// |
|
399 |
// return true; |
|
400 |
// } |
|
401 |
// }); |
|
402 |
// |
|
403 |
// if (xmlFiles == null) { |
|
404 |
System.out.println("No master file found in: "+sourceDirectory);
|
|
405 |
return false;
|
|
406 |
// } |
|
407 |
// master = xmlFiles[0]; |
|
392 |
// File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() {
|
|
393 |
// public boolean accept(File file) {
|
|
394 |
// if (file.isDirectory()) return false;
|
|
395 |
// if (file.isHidden()) return false;
|
|
396 |
// String filename = file.getName();
|
|
397 |
// if (filename.equals("import.xml")) return false;
|
|
398 |
// if (!filename.endsWith(".xml")) return false;
|
|
399 |
//
|
|
400 |
// return true;
|
|
401 |
// }
|
|
402 |
// });
|
|
403 |
//
|
|
404 |
// if (xmlFiles == null) {
|
|
405 |
System.out.println("No master file found in: "+sourceDirectory); |
|
406 |
return false; |
|
407 |
// }
|
|
408 |
// master = xmlFiles[0];
|
|
408 | 409 |
} |
409 | 410 |
String uri = master.getAbsolutePath(); // TIGER corpus source root file |
410 | 411 |
File tigerBinDir = new File(tigerDir, corpusName); |
411 | 412 |
tigerBinDir.mkdir(); |
412 | 413 |
try { |
414 |
LinkedHashMap<String, ArrayList<String>> errorsPerTIGERSubcorpus = new LinkedHashMap<>(); |
|
413 | 415 |
IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath()) { |
414 |
public void setMessage(String message) { } |
|
416 |
String currentSubcorpus = ""; |
|
417 |
public void setMessage(String message) { |
|
418 |
if (message.startsWith("Reading subcorpus ")) { |
|
419 |
currentSubcorpus = message.substring("Reading subcorpus ".length()); |
|
420 |
} |
|
421 |
} |
|
422 |
public void newSentenceError(String sid, String message) { |
|
423 |
if (!errorsPerTIGERSubcorpus.containsKey(currentSubcorpus)) errorsPerTIGERSubcorpus.put(currentSubcorpus, new ArrayList<String>()); |
|
424 |
errorsPerTIGERSubcorpus.get(currentSubcorpus).add("Error at "+sid+": "+message); |
|
425 |
} |
|
426 |
public void newSentenceWarning(String sid, String message) { |
|
427 |
if (!errorsPerTIGERSubcorpus.containsKey(currentSubcorpus)) errorsPerTIGERSubcorpus.put(currentSubcorpus, new ArrayList<String>()); |
|
428 |
errorsPerTIGERSubcorpus.get(currentSubcorpus).add("Warning at "+sid+": "+message); |
|
429 |
} |
|
415 | 430 |
public void setNumberOfSentences(int number) { } |
416 | 431 |
public void setProgressBar(int value) { } |
417 | 432 |
}; |
433 |
|
|
418 | 434 |
XMLIndexing indexing = new XMLIndexing(corpusName, uri, tigerBinDir.getAbsolutePath(), handler, false); |
419 | 435 |
indexing.startIndexing(); |
420 |
File logs = new File(tigerBinDir, "indexing.log"); //$NON-NLS-1$ |
|
421 |
String txt = IOUtils.getText(logs); |
|
422 |
if (txt.contains("Error in corpus graph ")) { |
|
423 |
Log.warning("Error while importing TIGER corpus: "+txt); |
|
424 |
return false; |
|
436 |
if (errorsPerTIGERSubcorpus.size() > 0) { |
|
437 |
for (String s : errorsPerTIGERSubcorpus.keySet()) { |
|
438 |
Log.info("In "+s); |
|
439 |
for (String s2 : errorsPerTIGERSubcorpus.get(s)) { |
|
440 |
Log.info(" "+s2); |
|
441 |
|
|
442 |
} |
|
443 |
} |
|
425 | 444 |
} |
426 | 445 |
} |
427 | 446 |
catch (Exception e) { |
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4001) | ||
---|---|---|
30 | 30 |
super(module) |
31 | 31 |
} |
32 | 32 |
|
33 |
public final String merge (String s1, String s2) { |
|
34 |
if (s1 == s2) return s1 |
|
35 |
if (s1 == "") return s2 |
|
36 |
if (s2 == "") return s1 |
|
37 |
if (s1 == "_") return s2 |
|
38 |
if (s2 == "_") return s1 |
|
39 |
if (s1 == "" && s2 == "") return "_" |
|
40 |
if (s1 == "_" && s2 == "_") return "_" |
|
41 |
return s1+"|"+s2 |
|
33 |
public final String merge(String orig, def sss) { |
|
34 |
|
|
35 |
int n = 0; |
|
36 |
for (String s : sss) { |
|
37 |
if (s != "" && s != "_") { |
|
38 |
|
|
39 |
def ssset = new HashSet(sss); |
|
40 |
if (ssset.size() == 1) return ssset.join(" + ") |
|
41 |
|
|
42 |
return sss.join(" + ") |
|
43 |
} |
|
44 |
} |
|
45 |
|
|
46 |
return orig; |
|
42 | 47 |
} |
43 | 48 |
|
44 | 49 |
@Override |
... | ... | |
53 | 58 |
conlluSrcDirectory.deleteDir() |
54 | 59 |
conlluSrcDirectory.mkdirs() |
55 | 60 |
|
56 |
println "Split CoNLL-U files per text..." |
|
57 | 61 |
if (!splitCoNLLUFiles(inputDirectory, conlluSrcDirectory, project)) { |
58 | 62 |
return |
59 | 63 |
} |
... | ... | |
63 | 67 |
files.sort() |
64 | 68 |
|
65 | 69 |
// Keep or not contractions |
66 |
boolean keepContractions = "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getString(UDPreferences.KEEP_MULTIWORD_TOKENS)));
|
|
70 |
String keepContractions = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.MULTIWORDS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.MULTIWORDS_MANAGEMENT));
|
|
67 | 71 |
|
68 |
println "Remove multiword tokens (keepContractions=$keepContractions) & add XmlId if necessary"
|
|
72 |
println "Multiwords managment ($keepContractions) & add XmlId if necessary & remove empty nodes"
|
|
69 | 73 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
70 | 74 |
for (File conlluFile : files) { |
71 | 75 |
cpb_texts.tick() |
... | ... | |
85 | 89 |
} else { |
86 | 90 |
split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++) |
87 | 91 |
} |
88 |
|
|
89 | 92 |
} |
90 | 93 |
|
91 |
if (!keepContractions && split[0] != null && split[0].contains("-")) { |
|
92 |
println "REMOVE" |
|
93 |
println "\t"+lines[i] |
|
94 |
println "\t"+lines[i+1] |
|
95 |
println "\t"+lines[i+2] |
|
96 |
|
|
97 |
def split1 = lines[i+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
98 |
def split2 = lines[i+2].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
99 |
|
|
100 |
if (split[2] == "_") split[2] = merge(split1[2],split2[2]) |
|
101 |
if (split[3] == "_") split[3] = merge(split1[3], split2[3]) |
|
102 |
if (split[4] == "_") split[4] = merge(split1[4], split2[4]) |
|
103 |
if (split[5] == "_") split[5] = merge(split1[5], split2[5]) |
|
104 |
if (split[6] == "_") split[6] = split1[6] |
|
105 |
if (split[7] == "_") split[7] = merge(split1[7], split2[7]) |
|
106 |
if (split[8] == "_") split[8] = merge(split1[8], split2[8]) |
|
107 |
|
|
108 |
lines.remove(i+1) |
|
109 |
lines.remove(i+1) |
|
94 |
if (split[0].contains(".")) { |
|
95 |
//println "REMOVE EMPTY NODE: $split : "+ |
|
96 |
lines.remove(i) |
|
97 |
i-- |
|
98 |
continue; // next ! |
|
110 | 99 |
} |
111 | 100 |
|
112 |
lines[i] = split.join("\t") |
|
101 |
if (keepContractions == UDPreferences.ALL) { |
|
102 |
// ok on fait rien |
|
103 |
} else if (keepContractions == UDPreferences.TOKENS) { |
|
104 |
if (split[0].contains("-")) { |
|
105 |
//println "REMOVE - $split" |
|
106 |
lines.remove(i) |
|
107 |
i-- |
|
108 |
continue; /// next ! |
|
109 |
} |
|
110 |
} else if (keepContractions == UDPreferences.MULTIWORDS) { |
|
111 |
if (split[0].contains("-")) { |
|
112 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-"))); |
|
113 |
split[0] = ""+n1 |
|
114 |
|
|
115 |
|
|
116 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-"))); |
|
117 |
int n = 1 + n2 - n1 |
|
118 |
|
|
119 |
def splits = [] |
|
120 |
for (int j = 1 ; j <= n ;j++) { |
|
121 |
def tmp = lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
122 |
if (tmp[0].contains(".")) { |
|
123 |
//println "PRE-REMOVE EMPTY NODE: $split : "+ |
|
124 |
lines.remove(i+j) |
|
125 |
j-- |
|
126 |
continue // next token |
|
127 |
} |
|
128 |
splits << tmp |
|
129 |
} |
|
130 |
|
|
131 |
for (int j = 2 ; j < 8 ; j++) { |
|
132 |
split[j] = merge(split[j], splits.collect(){it[j]}) |
|
133 |
} |
|
134 |
|
|
135 |
if (split[9].length() > 0) split[9] += "|" |
|
136 |
split[9] += "expand="+splits.collect(){it[1]}.join("_") |
|
137 |
|
|
138 |
//println "REMOVE non- $split" |
|
139 |
for (int j = 1 ; j <= n ;j++) { |
|
140 |
splits << lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
141 |
lines.remove(i+1) |
|
142 |
} |
|
143 |
} |
|
144 |
} |
|
145 |
|
|
146 |
lines[i] = split.join("\t") // rebuild the line |
|
113 | 147 |
} |
114 |
IOUtils.write(conlluFile, lines.join("\n") + "\n") |
|
148 |
IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
|
|
115 | 149 |
} |
116 | 150 |
} |
117 | 151 |
cpb_texts.done() |
... | ... | |
161 | 195 |
} |
162 | 196 |
|
163 | 197 |
public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) { |
164 |
def files = inputDirectory.listFiles() |
|
198 |
def files = inputDirectory.listFiles(new FilenameFilter() { |
|
199 |
boolean accept(File dir, String name) { |
|
200 |
return name.toLowerCase().endsWith(".conllu") |
|
201 |
} |
|
202 |
}); |
|
165 | 203 |
|
166 | 204 |
if (files == null) { |
167 | 205 |
println "Aborting. No CONLL file found in $inputDirectory." |
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl (revision 4001) | ||
---|---|---|
268 | 268 |
$commentlines++; |
269 | 269 |
next; |
270 | 270 |
} |
271 |
# Added by AL for contractions |
|
272 |
elsif ( $words[$w] =~ /^\d+-\d+/ ) { |
|
273 |
|
|
274 |
# print LOG "Contraction line loop 1: $words[$w]\n"; |
|
275 |
$commentlines++; |
|
276 |
|
|
277 |
# $contractions++; |
|
278 |
next; |
|
279 |
} |
|
271 |
# # Added by AL for contractions
|
|
272 |
# elsif ( $words[$w] =~ /^\d+-\d+/ ) {
|
|
273 |
# |
|
274 |
# # print LOG "Contraction line loop 1: $words[$w]\n";
|
|
275 |
# $commentlines++;
|
|
276 |
# |
|
277 |
# # $contractions++;
|
|
278 |
# next;
|
|
279 |
# }
|
|
280 | 280 |
else { |
281 | 281 |
if ( defined($opt_c) ) { |
282 | 282 |
$words[$w] =~ s/coord(\d+)-//g; |
... | ... | |
437 | 437 |
next; |
438 | 438 |
} |
439 | 439 |
|
440 |
#Added AL for contractions |
|
441 |
if ( $words[$i] =~ /^\d+-\d+/ ) { |
|
440 |
# #Added AL for contractions |
|
441 |
# if ( $words[$i] =~ /^\d+-\d+/ ) { |
|
442 |
# |
|
443 |
# # print LOG "Contraction loop 2 : $words[$i]\n"; |
|
444 |
# next; |
|
445 |
# } |
|
442 | 446 |
|
443 |
# print LOG "Contraction loop 2 : $words[$i]\n"; |
|
444 |
next; |
|
445 |
} |
|
446 |
|
|
447 | 447 |
else { |
448 | 448 |
|
449 | 449 |
@cols = split( /\t/, $words[$i] ); |
... | ... | |
731 | 731 |
<value name="case:pred">case:pred</value> <!-- Welsh --> |
732 | 732 |
<value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic --> |
733 | 733 |
<value name="case">case marking</value> |
734 |
<value name="case2">case2 marking</value> |
|
734 | 735 |
<value name="cc:nc">cc:nc</value> <!-- Old French --> |
735 | 736 |
<value name="cc:nc">Coordinated conjunct : non coordonant</value> |
736 | 737 |
<value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish --> |
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/CallUD2TigerPerlScript.java (revision 4001) | ||
---|---|---|
41 | 41 |
|
42 | 42 |
File mainFile = new File(tigerXMLDirectory, "main.xml"); |
43 | 43 |
File perlScript = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "conll2tiger-ud.pl"); |
44 |
|
|
44 |
System.out.println("PERL: "+perlScript); |
|
45 | 45 |
DeleteDir.deleteDirectory(tigerXMLDirectory); |
46 | 46 |
//println "clean&mkdir $tigerXMLDirectory" |
47 | 47 |
tigerXMLDirectory.mkdir(); |
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/preferences/UDPreferences.java (revision 4001) | ||
---|---|---|
17 | 17 |
|
18 | 18 |
public static String UDPREFIX = "ud_prefix"; |
19 | 19 |
|
20 |
public static String KEEP_MULTIWORD_TOKENS = "import_KEEP_MULTIWORD_TOKENS";
|
|
20 |
public static String MULTIWORDS_MANAGEMENT = "import_MULTIWORDS_MANAGEMENT";
|
|
21 | 21 |
|
22 |
public static String MULTIWORDS = "multiwords"; |
|
23 |
|
|
24 |
public static String TOKENS = "tokens"; |
|
25 |
|
|
26 |
public static String ALL = "all"; |
|
27 |
|
|
22 | 28 |
public static String IMPORT_USE_NEW_DOC_ID = "import_use_new_doc_id"; |
23 | 29 |
|
24 | 30 |
public static String IMPORT_HEAD_TO_PROJECT = "import_head_to_project"; |
... | ... | |
48 | 54 |
preferences.put(UDPREFIX, "ud-"); |
49 | 55 |
preferences.put(IMPORT_HEAD_TO_PROJECT, "upos,deprel"); |
50 | 56 |
preferences.put(IMPORT_DEPS_TO_PROJECT, "upos,deprel"); |
51 |
preferences.putBoolean(KEEP_MULTIWORD_TOKENS, false);
|
|
57 |
preferences.put(MULTIWORDS_MANAGEMENT, TOKENS); // boths, multiwords, tokens
|
|
52 | 58 |
preferences.putBoolean(IMPORT_USE_NEW_DOC_ID, true); |
53 | 59 |
preferences.putBoolean(IMPORT_BUILD_TIGERSEARCH_INDEXES, true); |
54 | 60 |
} |
TXM/trunk/bundles/org.txm.specificities.rcp/src/org/txm/specificities/rcp/editors/SpecificitiesEditor.java (revision 4001) | ||
---|---|---|
160 | 160 |
|
161 | 161 |
// change statistical computing |
162 | 162 |
if (TBXPreferences.getInstance().getBoolean(TBXPreferences.EXPERT_USER)) { |
163 |
// modeText = new Text(extendedParametersArea, SWT.BORDER); // not yet ready
|
|
163 |
modeText = new Text(extendedParametersArea, SWT.BORDER); // not yet ready |
|
164 | 164 |
} |
165 | 165 |
|
166 | 166 |
// Result area |
Formats disponibles : Unified diff