Révision 3994
| TXM/trunk/bundles/org.txm.libs.deptreeviz/src/org/txm/libs/deptreeviz/UDDepTreeVizPrintTree.java (revision 3994) | ||
|---|---|---|
| 18 | 18 |
|
| 19 | 19 |
try {
|
| 20 | 20 |
PrintWriter pw = new PrintWriter(file); |
| 21 |
|
|
| 21 | 22 |
|
| 22 |
SimpleParse p = SimpleParse.fromConll(conll); |
|
| 23 |
|
|
| 24 |
// Fixing multiwords 6th position if equals to "_" |
|
| 25 |
ArrayList<String> conll2 = new ArrayList<>(); |
|
| 26 |
for (int i = 0 ; i < conll.size() ; i++) {
|
|
| 27 |
String l = conll.get(i); |
|
| 28 |
String split[] = l.split("\t");
|
|
| 29 |
// if (split[6].equals("_") && split[0].contains("-")) {
|
|
| 30 |
// if (conll.size() > i + 1) {
|
|
| 31 |
// String splitNext[] = conll.get(i+1).split("\t");
|
|
| 32 |
// if (split[0].startsWith(splitNext[0]+ "-")) {
|
|
| 33 |
// split[6] = splitNext[6]; |
|
| 34 |
// } |
|
| 35 |
// } |
|
| 36 |
// } |
|
| 37 |
// |
|
| 38 |
if (split[6].equals("_")) { // 6th col is broken and we don't have the tokens of the multiwords
|
|
| 39 |
//split[6] = "0"; |
|
| 40 |
continue; |
|
| 41 |
} |
|
| 42 |
conll2.add(l); |
|
| 43 |
// l = ""; |
|
| 44 |
// for (String s : split) l += s+"\t"; |
|
| 45 |
// |
|
| 46 |
// conll2.add(l.substring(0, l.length() - 1)); |
|
| 47 |
|
|
| 48 |
} |
|
| 49 |
for (String l : conll2) System.out.println(l); |
|
| 50 |
|
|
| 51 |
SimpleParse p = SimpleParse.fromConll(conll2); |
|
| 23 | 52 |
List<SimpleWord> words = p.getWords(); |
| 24 | 53 |
List<String> labels = p.getVerticesLabels().get("SYN");
|
| 25 | 54 |
|
| ... | ... | |
| 64 | 93 |
|
| 65 | 94 |
public static void main(String[] args) throws IOException {
|
| 66 | 95 |
|
| 67 |
File file = File.createTempFile("txm", ".svg", new File("/home/mdecorde"));
|
|
| 96 |
File file = new File("/home/mdecorde/deptreeviz.svg");
|
|
| 68 | 97 |
|
| 69 |
List<String> conll = Arrays.asList( |
|
| 70 |
"1 Ce ce PRON _ Number=Sing|Person=3|PronType=Dem 2 nsubj _ _", |
|
| 71 |
"2 doit devoir VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _", |
|
| 72 |
"3 être être AUX _ VerbForm=Inf 5 cop _ _", |
|
| 73 |
"4 un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 5 det _ _", |
|
| 74 |
"5 général général NOUN _ Gender=Masc|Number=Sing 2 xcomp _ _", |
|
| 75 |
"6 étranger étranger ADJ _ Gender=Masc|Number=Sing 5 amod _ _", |
|
| 76 |
"7 . . PUNCT _ _ 2 punct _ _"); |
|
| 98 |
String all = """ |
|
| 99 |
1 [Pour] pour ADP _ _ 3 case _ start_char=4|end_char=8|ner=O|XmlId=w_1959_1 |
|
| 100 |
2 la le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 3 det _ start_char=9|end_char=11|ner=O|XmlId=w_1959_3 |
|
| 101 |
3 métropole métropole NOUN _ Gender=Fem|Number=Sing 15 obl:mod _ start_char=12|end_char=21|ner=O|XmlId=w_1959_4 |
|
| 102 |
4 française français ADJ _ Gender=Fem|Number=Sing 3 amod _ start_char=22|end_char=31|ner=O|XmlId=w_1959_5 |
|
| 103 |
5 , , PUNCT _ _ 3 punct _ start_char=31|end_char=32|ner=O|XmlId=w_1959_6 |
|
| 104 |
6 pour pour ADP _ _ 8 case _ start_char=33|end_char=37|ner=O|XmlId=w_1959_7 |
|
| 105 |
7 l' le DET _ Definite=Def|Number=Sing|PronType=Art 8 det _ start_char=38|end_char=40|ner=O|XmlId=w_1959_8 |
|
| 106 |
8 Algérie Algérie PROPN _ Number=Sing 15 obl:mod _ start_char=40|end_char=47|ner=S-LOC|XmlId=w_1959_9 |
|
| 107 |
9 , , PUNCT _ _ 8 punct _ start_char=47|end_char=48|ner=O|XmlId=w_1959_10 |
|
| 108 |
10 pour pour ADP _ _ 12 case _ start_char=49|end_char=53|ner=O|XmlId=w_1959_11 |
|
| 109 |
11 la le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 12 det _ start_char=54|end_char=56|ner=O|XmlId=w_1959_12 |
|
| 110 |
12 communauté communauté NOUN _ Gender=Fem|Number=Sing 15 obl:mod _ start_char=57|end_char=67|ner=O|XmlId=w_1959_13 |
|
| 111 |
13 , , PUNCT _ _ 12 punct _ start_char=67|end_char=68|ner=O|XmlId=w_1959_14 |
|
| 112 |
14 je il PRON _ Number=Sing|Person=1|PronType=Prs 15 nsubj _ start_char=69|end_char=71|ner=O|XmlId=w_1959_15 |
|
| 113 |
15 forme former VERB _ Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin 0 root _ start_char=72|end_char=77|ner=O|XmlId=w_1959_16 |
|
| 114 |
16 des un DET _ Definite=Ind|Number=Plur|PronType=Art 17 det _ start_char=78|end_char=81|ner=O|XmlId=w_1959_17 |
|
| 115 |
17 voeux vœu NOUN _ Gender=Masc|Number=Plur 15 obj _ start_char=83|end_char=88|ner=O|XmlId=w_1959_18 |
|
| 116 |
18 ardents ardent ADJ _ Gender=Masc|Number=Plur 17 amod _ start_char=90|end_char=97|ner=O|XmlId=w_1959_19 |
|
| 117 |
19 et et CCONJ _ _ 20 cc _ start_char=98|end_char=100|ner=O|XmlId=w_1959_20 |
|
| 118 |
20 confiants confiant ADJ _ Gender=Masc|Number=Plur 18 conj _ start_char=101|end_char=110|ner=O|XmlId=w_1959_21 |
|
| 119 |
21-22 au _ _ _ _ _ _ _ start_char=111|end_char=113|ner=O|XmlId=w_1959_22 |
|
| 120 |
21 à à ADP _ _ 24 case _ XmlId=w_1959_23 |
|
| 121 |
22 le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 24 det _ XmlId=w_1959_24 |
|
| 122 |
23 premier premier ADJ _ Gender=Masc|Number=Sing 24 amod _ start_char=114|end_char=121|ner=O|XmlId=w_1959_25 |
|
| 123 |
24 jour jour NOUN _ Gender=Masc|Number=Sing 15 obl:mod _ start_char=122|end_char=126|ner=O|XmlId=w_1959_26 |
|
| 124 |
25 de de ADP _ _ 26 case _ start_char=127|end_char=129|ner=O|XmlId=w_1959_27 |
|
| 125 |
26 1960 1960 NUM _ Number=Plur 24 nmod _ start_char=130|end_char=134|ner=O|XmlId=w_1959_28 |
|
| 126 |
27 . . PUNCT _ _ 15 punct _ start_char=134|end_char=135|ner=O|XmlId=w_1959_29 |
|
| 127 |
"""; |
|
| 77 | 128 |
|
| 78 | 129 |
|
| 130 |
List<String> conll = Arrays.asList(all.split("\n"));
|
|
| 131 |
|
|
| 132 |
|
|
| 79 | 133 |
System.out.println("FILE: " + print(file, conll, null, null));
|
| 80 | 134 |
} |
| 81 | 135 |
|
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 3994) | ||
|---|---|---|
| 30 | 30 |
super(module) |
| 31 | 31 |
} |
| 32 | 32 |
|
| 33 |
/**
 * Merges the values found in the same CoNLL-U column for two tokens.
 *
 * The empty string and "_" are both treated as "no value":
 * - when neither side carries a value, "_" is returned;
 * - when only one side carries a value, that value is returned;
 * - when both sides carry the same value, it is returned once;
 * - otherwise the two values are joined with "|".
 *
 * Comparisons use Groovy's == operator, i.e. value equality on Strings.
 *
 * @param s1 the column value of the first token
 * @param s2 the column value of the second token
 * @return the merged column value
 */
public final String merge (String s1, String s2) {
	boolean unset1 = (s1 == "" || s1 == "_")
	boolean unset2 = (s2 == "" || s2 == "_")

	// This test must come before the equality shortcut below, otherwise
	// merge("", "") answers "" instead of the "_" placeholder.
	if (unset1 && unset2) return "_"

	if (s1 == s2) return s1 // identical values are not repeated
	if (unset1) return s2
	if (unset2) return s1

	return s1+"|"+s2
}
|
| 43 |
|
|
| 33 | 44 |
@Override |
| 34 | 45 |
public void process() {
|
| 35 | 46 |
|
| ... | ... | |
| 47 | 58 |
return |
| 48 | 59 |
} |
| 49 | 60 |
} |
| 61 |
|
|
| 62 |
def files = conlluSrcDirectory.listFiles() |
|
| 63 |
files.sort() |
|
| 64 |
|
|
| 65 |
// Keep or not contractions |
|
| 66 |
boolean keepContractions = "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getString(UDPreferences.KEEP_MULTIWORD_TOKENS))); |
|
| 67 |
|
|
| 68 |
println "Remove multiword tokens (keepContractions=$keepContractions) & add XmlId if necessary" |
|
| 69 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
|
| 70 |
for (File conlluFile : files) {
|
|
| 71 |
cpb_texts.tick() |
|
| 72 |
if (conlluFile.getName().endsWith(".conllu")) {
|
|
| 73 |
String textid = FileUtils.stripExtension(conlluFile) |
|
| 74 |
int wcounter = 1; |
|
| 75 |
ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
|
| 76 |
for (int i = 0 ; i < lines.size() ; i++) {
|
|
| 77 |
String line = lines[i] |
|
| 78 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
|
| 79 |
|
|
| 80 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 81 |
|
|
| 82 |
if (split[-1] != null && !split[-1].contains("XmlId=")) {
|
|
| 83 |
if (split[-1] == "_") {
|
|
| 84 |
split[-1] = "XmlId=w_"+textid+"_"+(wcounter++) |
|
| 85 |
} else {
|
|
| 86 |
split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++) |
|
| 87 |
} |
|
| 88 |
|
|
| 89 |
} |
|
| 90 |
|
|
| 91 |
if (!keepContractions && split[0] != null && split[0].contains("-")) {
|
|
| 92 |
println "REMOVE" |
|
| 93 |
println "\t"+lines[i] |
|
| 94 |
println "\t"+lines[i+1] |
|
| 95 |
println "\t"+lines[i+2] |
|
| 96 |
|
|
| 97 |
def split1 = lines[i+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 98 |
def split2 = lines[i+2].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 99 |
|
|
| 100 |
if (split[2] == "_") split[2] = merge(split1[2],split2[2]) |
|
| 101 |
if (split[3] == "_") split[3] = merge(split1[3], split2[3]) |
|
| 102 |
if (split[4] == "_") split[4] = merge(split1[4], split2[4]) |
|
| 103 |
if (split[5] == "_") split[5] = merge(split1[5], split2[5]) |
|
| 104 |
if (split[6] == "_") split[6] = split1[6] |
|
| 105 |
if (split[7] == "_") split[7] = merge(split1[7], split2[7]) |
|
| 106 |
if (split[8] == "_") split[8] = merge(split1[8], split2[8]) |
|
| 107 |
|
|
| 108 |
lines.remove(i+1) |
|
| 109 |
lines.remove(i+1) |
|
| 110 |
} |
|
| 111 |
|
|
| 112 |
lines[i] = split.join("\t")
|
|
| 113 |
} |
|
| 114 |
IOUtils.write(conlluFile, lines.join("\n") + "\n")
|
|
| 115 |
} |
|
| 116 |
} |
|
| 117 |
cpb_texts.done() |
|
| 118 |
|
|
| 119 |
|
|
| 120 |
// // Fix missing XmlId in conllu files |
|
| 121 |
// println "Setting word XmlID if necessary" |
|
| 122 |
// ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
|
| 123 |
// for (File conlluFile : files) {
|
|
| 124 |
// cpb_texts.tick() |
|
| 125 |
// if (conlluFile.getName().endsWith(".conllu")) {
|
|
| 126 |
// String textid = FileUtils.stripExtension(conlluFile) |
|
| 127 |
// int wcounter = 1; |
|
| 128 |
// ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
|
| 129 |
// for (int i = 0 ; i < lines.size() ; i++) {
|
|
| 130 |
// String line = lines[i] |
|
| 131 |
// if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
|
| 132 |
// |
|
| 133 |
// def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 134 |
// if (split[-1] != null && !split[-1].contains("XmlId=")) {
|
|
| 135 |
// split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++) |
|
| 136 |
// lines[i] = split.join("\t")
|
|
| 137 |
// } |
|
| 138 |
// |
|
| 139 |
// } |
|
| 140 |
// IOUtils.write(conlluFile, lines.join("\n") + "\n")
|
|
| 141 |
// } |
|
| 142 |
// } |
|
| 143 |
// cpb_texts.done() |
|
| 144 |
|
|
| 50 | 145 |
File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory) |
| 51 | 146 |
File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei") |
| 52 | 147 |
srcDirectory.deleteDir() |
| ... | ... | |
| 72 | 167 |
println "Aborting. No CONLL file found in $inputDirectory." |
| 73 | 168 |
return false |
| 74 | 169 |
} |
| 75 |
|
|
| 170 |
files.sort() |
|
| 76 | 171 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
| 77 | 172 |
|
| 78 | 173 |
println "Splitting CoNLL-U files..." |
| ... | ... | |
| 117 | 212 |
println "Aborting. No CONLL file found in $inputDirectory." |
| 118 | 213 |
return false |
| 119 | 214 |
} |
| 120 |
|
|
| 215 |
files.sort() |
|
| 121 | 216 |
def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES) |
| 122 | 217 |
|
| 123 | 218 |
String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX)); |
| 124 | 219 |
|
| 125 | 220 |
UDPreferences.getInstance().setProjectPreferenceValue(project, UDPreferences.UDPREFIX, prefix); // copy the current preference into the corpus preference |
| 126 | 221 |
|
| 127 |
boolean keepContractions = "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_CONTRACTIONS, UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS))); |
|
| 128 |
|
|
| 129 | 222 |
def headPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT)).split(",") as Set
|
| 130 | 223 |
|
| 131 | 224 |
def depsPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT)).split(",") as Set
|
| ... | ... | |
| 172 | 265 |
|
| 173 | 266 |
} else {
|
| 174 | 267 |
|
| 175 |
HashMap<String, String> wProperties = new HashMap<String, String>()
|
|
| 268 |
LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>()
|
|
| 176 | 269 |
|
| 177 |
def split = line.split("\t")
|
|
| 270 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
|
|
| 178 | 271 |
if (split.size() == properties.size()) {
|
| 179 | 272 |
String id = split[0] |
| 180 | 273 |
for (int i = 0 ; i < split.size() ; i++) {
|
| 181 | 274 |
wProperties[properties[i]] = split[i] |
| 182 | 275 |
} |
| 183 | 276 |
|
| 184 |
if (wProperties.get("id").equals("1")) { // it's a new sentence, store the current if any and starts a new sentence
|
|
| 185 |
if (words.size() > 0) {
|
|
| 186 |
def sentence = [par_id, sent_id, words, comments] |
|
| 187 |
content.add(sentence) |
|
| 188 |
|
|
| 189 |
sent_id = "" |
|
| 190 |
par_id = "1" |
|
| 191 |
comments = [] |
|
| 192 |
words = [] |
|
| 193 |
} |
|
| 277 |
// if (wProperties.get("id").equals("1") || wProperties.get("id").startsWith("1-")) { // it's a new sentence, store the current if any and starts a new sentence
|
|
| 278 |
// if (words.size() > 0) {
|
|
| 279 |
// def sentence = [par_id, sent_id, words, comments] |
|
| 280 |
// content.add(sentence) |
|
| 281 |
// |
|
| 282 |
// sent_id = "" |
|
| 283 |
// par_id = "1" |
|
| 284 |
// comments = [] |
|
| 285 |
// words = [] |
|
| 286 |
// } |
|
| 287 |
// } |
|
| 288 |
if (wProperties[properties[0]].contains(".")) { // id
|
|
| 289 |
// empty node |
|
| 290 |
} else {
|
|
| 291 |
words << wProperties |
|
| 194 | 292 |
} |
| 195 |
|
|
| 196 |
words << wProperties |
|
| 197 | 293 |
} else {
|
| 198 | 294 |
//println "Warning: not a line: "+line |
| 199 | 295 |
} |
| 200 | 296 |
} |
| 201 | 297 |
} |
| 202 | 298 |
|
| 299 |
if (words.size() > 0) { // last sentence ?
|
|
| 300 |
def sentence = [par_id, sent_id, words, comments] |
|
| 301 |
content.add(sentence) |
|
| 302 |
} |
|
| 303 |
|
|
| 203 | 304 |
if (content.size() == 0) {
|
| 204 | 305 |
continue; |
| 205 | 306 |
} |
| 206 | 307 |
|
| 308 |
//println "${content.size()} sentences found."
|
|
| 309 |
|
|
| 207 | 310 |
File xmlFile = new File(srcDirectory, text_id+".xml") |
| 208 | 311 |
// println "xmlFile=$xmlFile" |
| 209 | 312 |
BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile)) |
| ... | ... | |
| 251 | 354 |
writer.writeCharacters("\n")
|
| 252 | 355 |
} |
| 253 | 356 |
|
| 254 |
if (!keepContractions) { // merge properties in the "-" word and remove the parts
|
|
| 255 |
for (int i = 0 ; i < words.size() ; i++) {
|
|
| 256 |
def word = words[i] |
|
| 257 |
String id = word[0] |
|
| 258 |
|
|
| 259 |
if (id.contains("-")) { // multi-word line
|
|
| 260 |
int index = id.indexOf("-")
|
|
| 261 |
String id1 = id.substring(0, index) |
|
| 262 |
String id2 = id.substring(index+1) |
|
| 263 |
def token1 = sentence[id1] |
|
| 264 |
def token2 = sentence[id2] |
|
| 265 |
|
|
| 266 |
if (token1 == null || token2 == null) {
|
|
| 267 |
println "Error: text $text_id paragraph $par_id sent $sent_id word $id has wrong token ids $id1 and $id2 -> $token1 and $token2" |
|
| 268 |
continue |
|
| 269 |
} |
|
| 270 |
|
|
| 271 |
for (String p : properties) {
|
|
| 272 |
if (p == "id") continue // don't merge the id property |
|
| 273 |
if (p == "form") continue // don't merge the form property |
|
| 274 |
word[p] = token1[p] + "+" + token2[p] |
|
| 275 |
} |
|
| 276 |
words.remove(i+1) // remove the token |
|
| 277 |
words.remove(i+1) // remove the token |
|
| 278 |
} |
|
| 279 |
} |
|
| 280 |
} |
|
| 281 |
|
|
| 282 | 357 |
if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
|
| 283 | 358 |
LinkedHashMap sentencehash = new LinkedHashMap() |
| 284 | 359 |
//println "WORDS="+words |
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3994) | ||
|---|---|---|
| 123 | 123 |
File tigerCorpusExistingDirectory = new File(this.binaryDirectory, "tiger/"+corpusName); |
| 124 | 124 |
MainCorpus corpus = this.project.getFirstChild(MainCorpus.class); |
| 125 | 125 |
corpus.compute(false); // load the corpus in CQP |
| 126 |
TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory) |
|
| 126 |
TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory, kee)
|
|
| 127 | 127 |
} |
| 128 | 128 |
} else {
|
| 129 | 129 |
println "Can not do the TIGER indexes step." |
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/preferences/UDPreferences.java (revision 3994) | ||
|---|---|---|
| 17 | 17 |
|
| 18 | 18 |
public static String UDPREFIX = "ud_prefix"; |
| 19 | 19 |
|
| 20 |
public static String KEEP_CONTRACTIONS = "import_keep_contractions";
|
|
| 20 |
public static String KEEP_MULTIWORD_TOKENS = "import_KEEP_MULTIWORD_TOKENS";
|
|
| 21 | 21 |
|
| 22 | 22 |
public static String IMPORT_USE_NEW_DOC_ID = "import_use_new_doc_id"; |
| 23 | 23 |
|
| ... | ... | |
| 48 | 48 |
preferences.put(UDPREFIX, "ud-"); |
| 49 | 49 |
preferences.put(IMPORT_HEAD_TO_PROJECT, "upos,deprel"); |
| 50 | 50 |
preferences.put(IMPORT_DEPS_TO_PROJECT, "upos,deprel"); |
| 51 |
preferences.putBoolean(KEEP_CONTRACTIONS, true);
|
|
| 51 |
preferences.putBoolean(KEEP_MULTIWORD_TOKENS, false);
|
|
| 52 | 52 |
preferences.putBoolean(IMPORT_USE_NEW_DOC_ID, true); |
| 53 | 53 |
preferences.putBoolean(IMPORT_BUILD_TIGERSEARCH_INDEXES, true); |
| 54 | 54 |
} |
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/UDTreeSearch.java (revision 3994) | ||
|---|---|---|
| 148 | 148 |
|
| 149 | 149 |
if (buffer.length() > 0) buffer.append("\t");
|
| 150 | 150 |
if (prop.equals("form") && startOfSubMatch) buffer.append("[");
|
| 151 |
buffer.append(values.get(prefix+prop)[p]); |
|
| 151 |
String v = values.get(prefix+prop)[p]; |
|
| 152 |
if (v.startsWith("|") && v.endsWith("|")) v = v.substring(1, v.length() - 1); // remove | for CQP multi value
|
|
| 153 |
buffer.append(v); |
|
| 152 | 154 |
if (prop.equals("form") && endOfSubMatch) buffer.append("]");
|
| 153 | 155 |
} |
| 154 | 156 |
|
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/search/UDSearchEngine.java (revision 3994) | ||
|---|---|---|
| 8 | 8 |
import org.txm.conllu.core.function.UDTreeSearch; |
| 9 | 9 |
import org.txm.conllu.core.preferences.UDPreferences; |
| 10 | 10 |
import org.txm.core.engines.EngineType; |
| 11 |
import org.txm.core.preferences.TBXPreferences; |
|
| 11 | 12 |
import org.txm.core.preferences.TXMPreferences; |
| 12 | 13 |
import org.txm.core.results.TXMResult; |
| 13 | 14 |
import org.txm.objects.CorpusBuild; |
| ... | ... | |
| 18 | 19 |
import org.txm.searchengine.core.SearchEngine; |
| 19 | 20 |
import org.txm.searchengine.core.SearchEngineProperty; |
| 20 | 21 |
import org.txm.searchengine.core.Selection; |
| 22 |
import org.txm.searchengine.cqp.CQPPreferences; |
|
| 21 | 23 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
| 22 | 24 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
| 23 | 25 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
| ... | ... | |
| 153 | 155 |
* @return A CQL query to build a Subcorpus of UD sentences |
| 154 | 156 |
*/ |
| 155 | 157 |
public static String getUDSCQPQueryString(String prefix) {
|
| 156 |
return "["+prefix+"id=\"1\"] ["+prefix+"id !=\"1\"]+ [:"+prefix+"id=\"1|__UNDEF__\":]"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ |
|
| 158 |
String emptyvalue = TBXPreferences.getInstance().getString(TBXPreferences.EMPTY_PROPERTY_VALUE_CODE); |
|
| 159 |
return "["+prefix+"id=\"1(-.+)?\"] []* [:"+prefix+"id=\"1(-.+)?|"+emptyvalue+"\":]"; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ |
|
| 157 | 160 |
} |
| 158 | 161 |
|
| 159 | 162 |
public String hasAdditionalDetailsForResult(TXMResult result) {
|
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 3994) | ||
|---|---|---|
| 71 | 71 |
gdata2.colspan = 4; // one line |
| 72 | 72 |
useNewDocIdButton.setLayoutData(gdata2); |
| 73 | 73 |
|
| 74 |
keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep word contractions when importing CoNLL-U files", SWT.CHECK);
|
|
| 74 |
keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK);
|
|
| 75 | 75 |
gdata2 = getButtonLayoutData(); |
| 76 | 76 |
gdata2.colspan = 4; // one line |
| 77 | 77 |
keepWordContractionsButton.setLayoutData(gdata2); |
| ... | ... | |
| 114 | 114 |
|
| 115 | 115 |
buildTIGERIndexesButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES))); //$NON-NLS-1$ |
| 116 | 116 |
useNewDocIdButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID))); //$NON-NLS-1$ |
| 117 |
keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_CONTRACTIONS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_CONTRACTIONS))); //$NON-NLS-1$
|
|
| 117 |
keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS))); //$NON-NLS-1$
|
|
| 118 | 118 |
udPropertiesPrefixButton.setText(customNode.get(UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX))); //$NON-NLS-1$ |
| 119 | 119 |
headPropertiesText.setText(customNode.get(UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT))); //$NON-NLS-1$ |
| 120 | 120 |
depsPropertiesText.setText(customNode.get(UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT))); //$NON-NLS-1$ |
| ... | ... | |
| 127 | 127 |
Preferences customNode = project.getPreferencesScope().getNode(UDPreferences.getInstance().getPreferencesNodeQualifier()); |
| 128 | 128 |
customNode.putBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, buildTIGERIndexesButton.getSelection()); |
| 129 | 129 |
customNode.putBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, useNewDocIdButton.getSelection()); |
| 130 |
customNode.putBoolean(UDPreferences.KEEP_CONTRACTIONS, keepWordContractionsButton.getSelection());
|
|
| 130 |
customNode.putBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, keepWordContractionsButton.getSelection());
|
|
| 131 | 131 |
customNode.put(UDPreferences.UDPREFIX, udPropertiesPrefixButton.getText()); |
| 132 | 132 |
customNode.put(UDPreferences.IMPORT_HEAD_TO_PROJECT, headPropertiesText.getText()); |
| 133 | 133 |
customNode.put(UDPreferences.IMPORT_DEPS_TO_PROJECT, depsPropertiesText.getText()); |
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/preferences/CoNLLUPreferencePage.java (revision 3994) | ||
|---|---|---|
| 26 | 26 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_USE_NEW_DOC_ID, "Use new odc id when importing CoNLL-U files", this.getFieldEditorParent())); |
| 27 | 27 |
this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, "Build TIGERSearch indexes as well", this.getFieldEditorParent())); |
| 28 | 28 |
|
| 29 |
this.addField(new BooleanFieldEditor(UDPreferences.KEEP_CONTRACTIONS, "Keep word contractions when importing CoNLL-U files", this.getFieldEditorParent()));
|
|
| 29 |
this.addField(new BooleanFieldEditor(UDPreferences.KEEP_MULTIWORD_TOKENS, "Keep multiword tokens when importing CoNLL-U files", this.getFieldEditorParent()));
|
|
| 30 | 30 |
udPrefixField = new StringFieldEditor(UDPreferences.UDPREFIX, "UD properties prefix", this.getFieldEditorParent()); |
| 31 | 31 |
this.addField(udPrefixField); |
| 32 | 32 |
this.addField(new StringFieldEditor(UDPreferences.IMPORT_HEAD_TO_PROJECT, "UD head properties to project (comma separated list)", this.getFieldEditorParent())); |
Formats disponibles : Unified diff