Révision 3994
TXM/trunk/bundles/org.txm.libs.deptreeviz/src/org/txm/libs/deptreeviz/UDDepTreeVizPrintTree.java (revision 3994) | ||
---|---|---|
18 | 18 |
|
19 | 19 |
try { |
20 | 20 |
PrintWriter pw = new PrintWriter(file); |
21 |
|
|
21 | 22 |
|
22 |
SimpleParse p = SimpleParse.fromConll(conll); |
|
23 |
|
|
24 |
// Work around multiword-token lines whose column at index 6 (HEAD) equals "_": such lines are skipped below |
|
25 |
ArrayList<String> conll2 = new ArrayList<>(); |
|
26 |
for (int i = 0 ; i < conll.size() ; i++) { |
|
27 |
String l = conll.get(i); |
|
28 |
String split[] = l.split("\t"); |
|
29 |
// if (split[6].equals("_") && split[0].contains("-")) { |
|
30 |
// if (conll.size() > i + 1) { |
|
31 |
// String splitNext[] = conll.get(i+1).split("\t"); |
|
32 |
// if (split[0].startsWith(splitNext[0]+ "-")) { |
|
33 |
// split[6] = splitNext[6]; |
|
34 |
// } |
|
35 |
// } |
|
36 |
// } |
|
37 |
// |
|
38 |
if (split[6].equals("_")) { // 6th col is broken and we don't have the tokens of the multiwords |
|
39 |
//split[6] = "0"; |
|
40 |
continue; |
|
41 |
} |
|
42 |
conll2.add(l); |
|
43 |
// l = ""; |
|
44 |
// for (String s : split) l += s+"\t"; |
|
45 |
// |
|
46 |
// conll2.add(l.substring(0, l.length() - 1)); |
|
47 |
|
|
48 |
} |
|
49 |
for (String l : conll2) System.out.println(l); |
|
50 |
|
|
51 |
SimpleParse p = SimpleParse.fromConll(conll2); |
|
23 | 52 |
List<SimpleWord> words = p.getWords(); |
24 | 53 |
List<String> labels = p.getVerticesLabels().get("SYN"); |
25 | 54 |
|
... | ... | |
64 | 93 |
|
65 | 94 |
public static void main(String[] args) throws IOException { |
66 | 95 |
|
67 |
File file = File.createTempFile("txm", ".svg", new File("/home/mdecorde"));
|
|
96 |
File file = new File("/home/mdecorde/deptreeviz.svg");
|
|
68 | 97 |
|
69 |
List<String> conll = Arrays.asList( |
|
70 |
"1 Ce ce PRON _ Number=Sing|Person=3|PronType=Dem 2 nsubj _ _", |
|
71 |
"2 doit devoir VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _", |
|
72 |
"3 être être AUX _ VerbForm=Inf 5 cop _ _", |
|
73 |
"4 un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 5 det _ _", |
|
74 |
"5 général général NOUN _ Gender=Masc|Number=Sing 2 xcomp _ _", |
|
75 |
"6 étranger étranger ADJ _ Gender=Masc|Number=Sing 5 amod _ _", |
|
76 |
"7 . . PUNCT _ _ 2 punct _ _"); |
|
98 |
String all = """ |
|
99 |
1 [Pour] pour ADP _ _ 3 case _ start_char=4|end_char=8|ner=O|XmlId=w_1959_1 |
|
100 |
2 la le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 3 det _ start_char=9|end_char=11|ner=O|XmlId=w_1959_3 |
|
101 |
3 métropole métropole NOUN _ Gender=Fem|Number=Sing 15 obl:mod _ start_char=12|end_char=21|ner=O|XmlId=w_1959_4 |
|
102 |
4 française français ADJ _ Gender=Fem|Number=Sing 3 amod _ start_char=22|end_char=31|ner=O|XmlId=w_1959_5 |
|
103 |
5 , , PUNCT _ _ 3 punct _ start_char=31|end_char=32|ner=O|XmlId=w_1959_6 |
|
104 |
6 pour pour ADP _ _ 8 case _ start_char=33|end_char=37|ner=O|XmlId=w_1959_7 |
|
105 |
7 l' le DET _ Definite=Def|Number=Sing|PronType=Art 8 det _ start_char=38|end_char=40|ner=O|XmlId=w_1959_8 |
|
106 |
8 Algérie Algérie PROPN _ Number=Sing 15 obl:mod _ start_char=40|end_char=47|ner=S-LOC|XmlId=w_1959_9 |
|
107 |
9 , , PUNCT _ _ 8 punct _ start_char=47|end_char=48|ner=O|XmlId=w_1959_10 |
|
108 |
10 pour pour ADP _ _ 12 case _ start_char=49|end_char=53|ner=O|XmlId=w_1959_11 |
|
109 |
11 la le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 12 det _ start_char=54|end_char=56|ner=O|XmlId=w_1959_12 |
|
110 |
12 communauté communauté NOUN _ Gender=Fem|Number=Sing 15 obl:mod _ start_char=57|end_char=67|ner=O|XmlId=w_1959_13 |
|
111 |
13 , , PUNCT _ _ 12 punct _ start_char=67|end_char=68|ner=O|XmlId=w_1959_14 |
|
112 |
14 je il PRON _ Number=Sing|Person=1|PronType=Prs 15 nsubj _ start_char=69|end_char=71|ner=O|XmlId=w_1959_15 |
|
113 |
15 forme former VERB _ Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin 0 root _ start_char=72|end_char=77|ner=O|XmlId=w_1959_16 |
|
114 |
16 des un DET _ Definite=Ind|Number=Plur|PronType=Art 17 det _ start_char=78|end_char=81|ner=O|XmlId=w_1959_17 |
|
115 |
17 voeux vœu NOUN _ Gender=Masc|Number=Plur 15 obj _ start_char=83|end_char=88|ner=O|XmlId=w_1959_18 |
|
116 |
18 ardents ardent ADJ _ Gender=Masc|Number=Plur 17 amod _ start_char=90|end_char=97|ner=O|XmlId=w_1959_19 |
|
117 |
19 et et CCONJ _ _ 20 cc _ start_char=98|end_char=100|ner=O|XmlId=w_1959_20 |
|
118 |
20 confiants confiant ADJ _ Gender=Masc|Number=Plur 18 conj _ start_char=101|end_char=110|ner=O|XmlId=w_1959_21 |
|
119 |
21-22 au _ _ _ _ _ _ _ start_char=111|end_char=113|ner=O|XmlId=w_1959_22 |
|
120 |
21 à à ADP _ _ 24 case _ XmlId=w_1959_23 |
|
121 |
22 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 24 det _ XmlId=w_1959_24 |
|
122 |
23 premier premier ADJ _ Gender=Masc|Number=Sing 24 amod _ start_char=114|end_char=121|ner=O|XmlId=w_1959_25 |
|
123 |
24 jour jour NOUN _ Gender=Masc|Number=Sing 15 obl:mod _ start_char=122|end_char=126|ner=O|XmlId=w_1959_26 |
|
124 |
25 de de ADP _ _ 26 case _ start_char=127|end_char=129|ner=O|XmlId=w_1959_27 |
|
125 |
26 1960 1960 NUM _ Number=Plur 24 nmod _ start_char=130|end_char=134|ner=O|XmlId=w_1959_28 |
|
126 |
27 . . PUNCT _ _ 15 punct _ start_char=134|end_char=135|ner=O|XmlId=w_1959_29 |
|
127 |
"""; |
|
77 | 128 |
|
78 | 129 |
|
130 |
List<String> conll = Arrays.asList(all.split("\n")); |
|
131 |
|
|
132 |
|
|
79 | 133 |
System.out.println("FILE: " + print(file, conll, null, null)); |
80 | 134 |
} |
81 | 135 |
|
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 3994) | ||
---|---|---|
30 | 30 |
super(module) |
31 | 31 |
} |
32 | 32 |
|
33 |
public final String merge (String s1, String s2) { |
|
34 |
if (s1 == s2) return s1 |
|
35 |
if (s1 == "") return s2 |
|
36 |
if (s2 == "") return s1 |
|
37 |
if (s1 == "_") return s2 |
|
38 |
if (s2 == "_") return s1 |
|
39 |
if (s1 == "" && s2 == "") return "_" |
|
40 |
if (s1 == "_" && s2 == "_") return "_" |
|
41 |
return s1+"|"+s2 |
|
42 |
} |
|
43 |
|
|
33 | 44 |
@Override |
34 | 45 |
public void process() { |
35 | 46 |
|
... | ... | |
47 | 58 |
return |
48 | 59 |
} |
49 | 60 |
} |
61 |
|
|
62 |
def files = conlluSrcDirectory.listFiles() |
|
63 |
files.sort() |
|
64 |
|
|
65 |
// Whether to keep contractions (multiword tokens) or to merge them with their parts |
|
66 |
boolean keepContractions = "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getString(UDPreferences.KEEP_MULTIWORD_TOKENS))); |
|
67 |
|
|
68 |
println "Remove multiword tokens (keepContractions=$keepContractions) & add XmlId if necessary" |
|
69 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
|
70 |
for (File conlluFile : files) { |
|
71 |
cpb_texts.tick() |
|
72 |
if (conlluFile.getName().endsWith(".conllu")) { |
|
73 |
String textid = FileUtils.stripExtension(conlluFile) |
|
74 |
int wcounter = 1; |
|
75 |
ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
|
76 |
for (int i = 0 ; i < lines.size() ; i++) { |
|
77 |
String line = lines[i] |
|
78 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue; |
|
79 |
|
|
80 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
81 |
|
|
82 |
if (split[-1] != null && !split[-1].contains("XmlId=")) { |
|
83 |
if (split[-1] == "_") { |
|
84 |
split[-1] = "XmlId=w_"+textid+"_"+(wcounter++) |
|
85 |
} else { |
|
86 |
split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++) |
|
87 |
} |
|
88 |
|
|
89 |
} |
|
90 |
|
|
91 |
if (!keepContractions && split[0] != null && split[0].contains("-")) { |
|
92 |
println "REMOVE" |
|
93 |
println "\t"+lines[i] |
|
94 |
println "\t"+lines[i+1] |
|
95 |
println "\t"+lines[i+2] |
|
96 |
|
|
97 |
def split1 = lines[i+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
98 |
def split2 = lines[i+2].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
99 |
|
|
100 |
if (split[2] == "_") split[2] = merge(split1[2],split2[2]) |
|
101 |
if (split[3] == "_") split[3] = merge(split1[3], split2[3]) |
|
102 |
if (split[4] == "_") split[4] = merge(split1[4], split2[4]) |
|
103 |
if (split[5] == "_") split[5] = merge(split1[5], split2[5]) |
|
104 |
if (split[6] == "_") split[6] = split1[6] |
|
105 |
if (split[7] == "_") split[7] = merge(split1[7], split2[7]) |
|
106 |
if (split[8] == "_") split[8] = merge(split1[8], split2[8]) |
|
107 |
|
|
108 |
lines.remove(i+1) |
|
109 |
lines.remove(i+1) |
|
110 |
} |
|
111 |
|
|
112 |
lines[i] = split.join("\t") |
|
113 |
} |
|
114 |
IOUtils.write(conlluFile, lines.join("\n") + "\n") |
|
115 |
} |
|
116 |
} |
|
117 |
cpb_texts.done() |
|
118 |
|
|
119 |
|
|
120 |
// // Fix missing XmlId in conllu files |
|
121 |
// println "Setting word XmlID if necessary" |
|
122 |
// ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
|
123 |
// for (File conlluFile : files) { |
|
124 |
// cpb_texts.tick() |
|
125 |
// if (conlluFile.getName().endsWith(".conllu")) { |
|
126 |
// String textid = FileUtils.stripExtension(conlluFile) |
|
127 |
// int wcounter = 1; |
|
128 |
// ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
|
129 |
// for (int i = 0 ; i < lines.size() ; i++) { |
|
130 |
// String line = lines[i] |
|
131 |
// if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue; |
|
132 |
// |
|
133 |
// def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
134 |
// if (split[-1] != null && !split[-1].contains("XmlId=")) { |
|
135 |
// split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++) |
|
136 |
// lines[i] = split.join("\t") |
|
137 |
// } |
|
138 |
// |
|
139 |
// } |
|
140 |
// IOUtils.write(conlluFile, lines.join("\n") + "\n") |
|
141 |
// } |
|
142 |
// } |
|
143 |
// cpb_texts.done() |
|
144 |
|
|
50 | 145 |
File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory) |
51 | 146 |
File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei") |
52 | 147 |
srcDirectory.deleteDir() |
... | ... | |
72 | 167 |
println "Aborting. No CONLL file found in $inputDirectory." |
73 | 168 |
return false |
74 | 169 |
} |
75 |
|
|
170 |
files.sort() |
|
76 | 171 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
77 | 172 |
|
78 | 173 |
println "Splitting CoNLL-U files..." |
... | ... | |
117 | 212 |
println "Aborting. No CONLL file found in $inputDirectory." |
118 | 213 |
return false |
119 | 214 |
} |
120 |
|
|
215 |
files.sort() |
|
121 | 216 |
def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES) |
122 | 217 |
|
123 | 218 |
String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX)); |
124 | 219 |
|
125 | 220 |
UDPreferences.getInstance().setProjectPreferenceValue(project, UDPreferences.UDPREFIX, prefix); // copy the current preference into the corpus preference |
126 | 221 |
|
127 |
boolean keepContractions = "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_CONTRACTIONS, UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS))); |
|
128 |
|
|
129 | 222 |
def headPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT)).split(",") as Set |
130 | 223 |
|
131 | 224 |
def depsPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT)).split(",") as Set |
... | ... | |
172 | 265 |
|
173 | 266 |
} else { |
174 | 267 |
|
175 |
HashMap<String, String> wProperties = new HashMap<String, String>()
|
|
268 |
LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>()
|
|
176 | 269 |
|
177 |
def split = line.split("\t") |
|
270 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
|
|
178 | 271 |
if (split.size() == properties.size()) { |
179 | 272 |
String id = split[0] |
180 | 273 |
for (int i = 0 ; i < split.size() ; i++) { |
181 | 274 |
wProperties[properties[i]] = split[i] |
182 | 275 |
} |
183 | 276 |
|
184 |
if (wProperties.get("id").equals("1")) { // it's a new sentence, store the current if any and starts a new sentence |
|
185 |
if (words.size() > 0) { |
|
186 |
def sentence = [par_id, sent_id, words, comments] |
|
187 |
content.add(sentence) |
|
188 |
|
|
189 |
sent_id = "" |
|
190 |
par_id = "1" |
|
191 |
comments = [] |
|
192 |
words = [] |
|
193 |
} |
|
277 |
// if (wProperties.get("id").equals("1") || wProperties.get("id").startsWith("1-")) { // it's a new sentence, store the current if any and starts a new sentence |
|
278 |
// if (words.size() > 0) { |
|
279 |
// def sentence = [par_id, sent_id, words, comments] |
|
280 |
// content.add(sentence) |
|
281 |
// |
|
282 |
// sent_id = "" |
|
283 |
// par_id = "1" |
|
284 |
// comments = [] |
|
285 |
// words = [] |
|
286 |
// } |
|
287 |
// } |
|
288 |
if (wProperties[properties[0]].contains(".")) { // id |
|
289 |
// empty node |
|
290 |
} else { |
|
291 |
words << wProperties |
|
194 | 292 |
} |
195 |
|
|
196 |
words << wProperties |
|
197 | 293 |
} else { |
198 | 294 |
//println "Warning: not a line: "+line |
199 | 295 |
} |
200 | 296 |
} |
201 | 297 |
} |
202 | 298 |
|
299 |
if (words.size() > 0) { // last sentence ? |
|
300 |
def sentence = [par_id, sent_id, words, comments] |
|
301 |
content.add(sentence) |
|
302 |
} |
|
303 |
|
|
203 | 304 |
if (content.size() == 0) { |
204 | 305 |
continue; |
205 | 306 |
} |
206 | 307 |
|
308 |
//println "${content.size()} sentences found." |
|
309 |
|
|
207 | 310 |
File xmlFile = new File(srcDirectory, text_id+".xml") |
208 | 311 |
// println "xmlFile=$xmlFile" |
209 | 312 |
BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile)) |
... | ... | |
251 | 354 |
writer.writeCharacters("\n") |
252 | 355 |
} |
253 | 356 |
|
254 |
if (!keepContractions) { // merge properties in the "-" word and remove the parts |
|
255 |
for (int i = 0 ; i < words.size() ; i++) { |
|
256 |
def word = words[i] |
|
257 |
String id = word[0] |
|
258 |
|
|
259 |
if (id.contains("-")) { // multi-word line |
|
260 |
int index = id.indexOf("-") |
|
261 |
String id1 = id.substring(0, index) |
|
262 |
String id2 = id.substring(index+1) |
|
263 |
def token1 = sentence[id1] |
|
264 |
def token2 = sentence[id2] |
|
265 |
|
|
266 |
if (token1 == null || token2 == null) { |
|
267 |
println "Error: text $text_id paragraph $par_id sent $sent_id word $id has wrong token ids $id1 and $id2 -> $token1 and $token2" |
|
268 |
continue |
|
269 |
} |
|
270 |
|
|
271 |
for (String p : properties) { |
|
272 |
if (p == "id") continue // don't merge the id property |
|
273 |
if (p == "form") continue // don't merge the form property |
|
274 |
word[p] = token1[p] + "+" + token2[p] |
|
275 |
} |
|
276 |
words.remove(i+1) // remove the token |
|
277 |
words.remove(i+1) // remove the token |
|
278 |
} |
|
279 |
} |
|
280 |
} |
|
281 |
|
|
282 | 357 |
if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) { |
283 | 358 |
LinkedHashMap sentencehash = new LinkedHashMap() |
284 | 359 |
//println "WORDS="+words |
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3994) | ||
---|---|---|
123 | 123 |
File tigerCorpusExistingDirectory = new File(this.binaryDirectory, "tiger/"+corpusName); |
124 | 124 |
MainCorpus corpus = this.project.getFirstChild(MainCorpus.class); |
125 | 125 |
corpus.compute(false); // load the corpus in CQP |
126 |
TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory) |
|
126 |
TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory, kee)
|
|
127 | 127 |
} |
128 | 128 |
} else { |
129 | 129 |
println "Can not do the TIGER indexes step." |
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/preferences/UDPreferences.java (revision 3994) | ||
---|---|---|
17 | 17 |
|
18 | 18 |
public static String UDPREFIX = "ud_prefix"; |
19 | 19 |
|
20 |
public static String KEEP_CONTRACTIONS = "import_keep_contractions";
|
|
20 |
public static String KEEP_MULTIWORD_TOKENS = "import_KEEP_MULTIWORD_TOKENS";
|
|
21 | 21 |
|
22 | 22 |
public static String IMPORT_USE_NEW_DOC_ID = "import_use_new_doc_id"; |
23 | 23 |
|
... | ... | |
48 | 48 |
preferences.put(UDPREFIX, "ud-"); |
49 | 49 |
preferences.put(IMPORT_HEAD_TO_PROJECT, "upos,deprel"); |
50 | 50 |
preferences.put(IMPORT_DEPS_TO_PROJECT, "upos,deprel"); |
51 |
preferences.putBoolean(KEEP_CONTRACTIONS, true);
|
|
51 |
preferences.putBoolean(KEEP_MULTIWORD_TOKENS, false);
|
|
52 | 52 |
preferences.putBoolean(IMPORT_USE_NEW_DOC_ID, true); |
53 | 53 |
preferences.putBoolean(IMPORT_BUILD_TIGERSEARCH_INDEXES, true); |
54 | 54 |
} |
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/UDTreeSearch.java (revision 3994) | ||
---|---|---|
148 | 148 |
|
149 | 149 |
if (buffer.length() > 0) buffer.append("\t"); |
150 | 150 |
if (prop.equals("form") && startOfSubMatch) buffer.append("["); |
151 |
buffer.append(values.get(prefix+prop)[p]); |
|
151 |
String v = values.get(prefix+prop)[p]; |
|
152 |
if (v.startsWith("|") && v.endsWith("|")) v = v.substring(1, v.length() - 1); // remove | for CQP multi value |
|
153 |
buffer.append(v); |
|
152 | 154 |
if (prop.equals("form") && endOfSubMatch) buffer.append("]"); |
153 | 155 |
} |
154 | 156 |
|
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/search/UDSearchEngine.java (revision 3994) | ||
---|---|---|
8 | 8 |
import org.txm.conllu.core.function.UDTreeSearch; |
9 | 9 |
import org.txm.conllu.core.preferences.UDPreferences; |
10 | 10 |
import org.txm.core.engines.EngineType; |
11 |
import org.txm.core.preferences.TBXPreferences; |
|
11 | 12 |
import org.txm.core.preferences.TXMPreferences; |
12 | 13 |
import org.txm.core.results.TXMResult; |
13 | 14 |
import org.txm.objects.CorpusBuild; |
... | ... | |
18 | 19 |
import org.txm.searchengine.core.SearchEngine; |
19 | 20 |
import org.txm.searchengine.core.SearchEngineProperty; |
20 | 21 |
import org.txm.searchengine.core.Selection; |
22 |
import org.txm.searchengine.cqp.CQPPreferences; |
|
21 | 23 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
22 | 24 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
23 | 25 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
... | ... | |
153 | 155 |
* @return A CQL query to build a Subcorpus of UD sentences |
154 | 156 |
*/ |
155 | 157 |
public static String getUDSCQPQueryString(String prefix) { |
156 |
return "["+prefix+"id=\"1\"] ["+prefix+"id !=\"1\"]+ [:"+prefix+"id=\"1|__UNDEF__\":]"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ |
|
158 |
String emptyvalue = TBXPreferences.getInstance().getString(TBXPreferences.EMPTY_PROPERTY_VALUE_CODE); |
|
159 |
return "["+prefix+"id=\"1(-.+)?\"] []* [:"+prefix+"id=\"1(-.+)?|"+emptyvalue+"\":]"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ |
|
157 | 160 |
} |
158 | 161 |
|
159 | 162 |
public String hasAdditionalDetailsForResult(TXMResult result) { |
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 3994) | ||
---|---|---|
71 | 71 |
gdata2.colspan = 4; // one line |
72 | 72 |
useNewDocIdButton.setLayoutData(gdata2); |
73 | 73 |
|
74 |
keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep word contractions when importing CoNLL-U files", SWT.CHECK);
|
|
74 |
keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK);
|
|
75 | 75 |
gdata2 = getButtonLayoutData(); |
76 | 76 |
gdata2.colspan = 4; // one line |
77 | 77 |
keepWordContractionsButton.setLayoutData(gdata2); |
... | ... | |
114 | 114 |
|
115 | 115 |
buildTIGERIndexesButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES))); //$NON-NLS-1$ |
116 | 116 |
useNewDocIdButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID))); //$NON-NLS-1$ |
117 |
keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_CONTRACTIONS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_CONTRACTIONS))); //$NON-NLS-1$
|
|
117 |
keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS))); //$NON-NLS-1$
|
|
118 | 118 |
udPropertiesPrefixButton.setText(customNode.get(UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX))); //$NON-NLS-1$ |
119 | 119 |
headPropertiesText.setText(customNode.get(UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT))); //$NON-NLS-1$ |
120 | 120 |
depsPropertiesText.setText(customNode.get(UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT))); //$NON-NLS-1$ |
... | ... | |
127 | 127 |
Preferences customNode = project.getPreferencesScope().getNode(UDPreferences.getInstance().getPreferencesNodeQualifier()); |
128 | 128 |
customNode.putBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, buildTIGERIndexesButton.getSelection()); |
129 | 129 |
customNode.putBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, useNewDocIdButton.getSelection()); |
130 |
customNode.putBoolean(UDPreferences.KEEP_CONTRACTIONS, keepWordContractionsButton.getSelection());
|
|
130 |
customNode.putBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, keepWordContractionsButton.getSelection());
|
|
131 | 131 |
customNode.put(UDPreferences.UDPREFIX, udPropertiesPrefixButton.getText()); |
132 | 132 |
customNode.put(UDPreferences.IMPORT_HEAD_TO_PROJECT, headPropertiesText.getText()); |
133 | 133 |
customNode.put(UDPreferences.IMPORT_DEPS_TO_PROJECT, depsPropertiesText.getText()); |
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/preferences/CoNLLUPreferencePage.java (revision 3994) | ||
---|---|---|
26 | 26 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_USE_NEW_DOC_ID, "Use new odc id when importing CoNLL-U files", this.getFieldEditorParent())); |
27 | 27 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, "Build TIGERSearch indexes as well", this.getFieldEditorParent())); |
28 | 28 |
|
29 |
this.addField(new BooleanFieldEditor(UDPreferences.KEEP_CONTRACTIONS, "Keep word contractions when importing CoNLL-U files", this.getFieldEditorParent()));
|
|
29 |
this.addField(new BooleanFieldEditor(UDPreferences.KEEP_MULTIWORD_TOKENS, "Keep multiword tokens when importing CoNLL-U files", this.getFieldEditorParent()));
|
|
30 | 30 |
udPrefixField = new StringFieldEditor(UDPreferences.UDPREFIX, "UD properties prefix", this.getFieldEditorParent()); |
31 | 31 |
this.addField(udPrefixField); |
32 | 32 |
this.addField(new StringFieldEditor(UDPreferences.IMPORT_HEAD_TO_PROJECT, "UD head properties to project (comma separated list)", this.getFieldEditorParent())); |
Formats disponibles : Unified diff