Révision 4012
TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 4012) | ||
---|---|---|
108 | 108 |
gdata2.colspan = 3; // one line |
109 | 109 |
depsPropertiesText.setLayoutData(gdata2); |
110 | 110 |
|
111 |
printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Print a newline after a sentence in editions", SWT.CHECK);
|
|
111 |
printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Format sentences in edition/Formatage des phrases dans l'édition", SWT.CHECK);
|
|
112 | 112 |
gdata2 = getButtonLayoutData(); |
113 | 113 |
gdata2.colspan = 4; // one line |
114 | 114 |
printNewLinesInEditionsButton.setLayoutData(gdata2); |
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/FixDriverFile.java (revision 4012) | ||
---|---|---|
1 |
package org.txm.conllu.core; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.io.IOException; |
|
5 |
import java.util.Arrays; |
|
6 |
import java.util.HashMap; |
|
7 |
import java.util.HashSet; |
|
8 |
import java.util.List; |
|
9 |
|
|
10 |
import javax.xml.parsers.ParserConfigurationException; |
|
11 |
|
|
12 |
import org.txm.utils.xml.DomUtils; |
|
13 |
import org.w3c.dom.Document; |
|
14 |
import org.w3c.dom.Element; |
|
15 |
import org.w3c.dom.NodeList; |
|
16 |
import org.xml.sax.SAXException; |
|
17 |
|
|
18 |
/** |
|
19 |
* |
|
20 |
* @author mdecorde |
|
21 |
* |
|
22 |
*/ |
|
23 |
public class FixDriverFile { |
|
24 |
|
|
25 |
public static boolean fixFeatureValues(File driverFile, List<File> xmlFiles) throws ParserConfigurationException, SAXException, IOException { |
|
26 |
|
|
27 |
HashMap<String, HashSet<String>> declaredFeatures = new HashMap<String, HashSet<String>>(); |
|
28 |
HashMap<String, HashSet<String>> missingFeatures = new HashMap<String, HashSet<String>>(); |
|
29 |
HashMap<String, Element> featuresElements = new HashMap<String, Element>(); |
|
30 |
|
|
31 |
Document doc = DomUtils.load(driverFile); |
|
32 |
NodeList featuresList = doc.getElementsByTagName("feature"); |
|
33 |
for (int i = 0 ; i < featuresList.getLength() ; i++) { |
|
34 |
Element f = (Element) featuresList.item(i); |
|
35 |
|
|
36 |
featuresElements.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), f); |
|
37 |
HashSet<String> values = new HashSet<String>(); |
|
38 |
declaredFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), values); |
|
39 |
missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new HashSet<String>()); |
|
40 |
|
|
41 |
NodeList featureValuesList = f.getElementsByTagName("value"); |
|
42 |
for (int j = 0 ; j < featureValuesList.getLength() ; j++) { |
|
43 |
Element v = (Element) featureValuesList.item(j); |
|
44 |
values.add(v.getAttribute("name")); |
|
45 |
} |
|
46 |
} |
|
47 |
//System.out.println("Declared: "+declaredFeatures.keySet()); |
|
48 |
|
|
49 |
for (File xmlFile : xmlFiles) { |
|
50 |
Document doc2 = DomUtils.load(xmlFile); |
|
51 |
NodeList tList = doc2.getElementsByTagName("t"); |
|
52 |
for (int i = 0 ; i < tList.getLength() ; i++) { |
|
53 |
Element e = (Element) tList.item(i); |
|
54 |
//System.out.println("T="+e.getAttributes()); |
|
55 |
for (int j = 0 ; j < e.getAttributes().getLength() ; j++) { |
|
56 |
|
|
57 |
String name = e.getAttributes().item(j).getLocalName()+ "\tT"; |
|
58 |
String value = e.getAttributes().item(j).getNodeValue(); |
|
59 |
|
|
60 |
if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) { |
|
61 |
HashSet<String> existingValues = declaredFeatures.get(name); |
|
62 |
if (existingValues.contains(value)) { |
|
63 |
// ok |
|
64 |
} else { |
|
65 |
missingFeatures.get(name).add(value); |
|
66 |
} |
|
67 |
} |
|
68 |
} |
|
69 |
|
|
70 |
} |
|
71 |
tList = doc2.getElementsByTagName("nt"); |
|
72 |
for (int i = 0 ; i < tList.getLength() ; i++) { |
|
73 |
Element e = (Element) tList.item(i); |
|
74 |
//System.out.println("NT="+e.getAttributes()); |
|
75 |
for (int j = 0 ; j < e.getAttributes().getLength() ; j++) { |
|
76 |
|
|
77 |
String name = e.getAttributes().item(j).getLocalName()+ "\tNT"; |
|
78 |
String value = e.getAttributes().item(j).getNodeValue(); |
|
79 |
|
|
80 |
if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) { |
|
81 |
HashSet<String> existingValues = declaredFeatures.get(name); |
|
82 |
if (existingValues.contains(value)) { |
|
83 |
// ok |
|
84 |
} else { |
|
85 |
missingFeatures.get(name).add(value); |
|
86 |
} |
|
87 |
} |
|
88 |
} |
|
89 |
} |
|
90 |
} |
|
91 |
|
|
92 |
//System.out.println("Missing values:"); |
|
93 |
for (String missingFeatureNamedomain : missingFeatures.keySet()) { |
|
94 |
if (missingFeatures.get(missingFeatureNamedomain).size() == 0) continue; |
|
95 |
//System.out.println("\t"+missingFeatureNamedomain); |
|
96 |
|
|
97 |
Element f = featuresElements.get(missingFeatureNamedomain); |
|
98 |
|
|
99 |
for (String v : missingFeatures.get(missingFeatureNamedomain)) { |
|
100 |
//System.out.println("\t\t"+v); |
|
101 |
Element missingFeatureElement = f.getOwnerDocument().createElement("value"); |
|
102 |
missingFeatureElement.setAttribute("name", v); |
|
103 |
missingFeatureElement.setTextContent(v); |
|
104 |
f.appendChild(missingFeatureElement); |
|
105 |
} |
|
106 |
} |
|
107 |
|
|
108 |
return DomUtils.save(doc, driverFile); |
|
109 |
} |
|
110 |
|
|
111 |
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { |
|
112 |
File[] files = {new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1959.xml"), |
|
113 |
new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1960.xml"), |
|
114 |
new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1961.xml")}; |
|
115 |
|
|
116 |
FixDriverFile.fixFeatureValues(new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/corpus.xml"), Arrays.asList(files)); |
|
117 |
} |
|
118 |
} |
|
0 | 119 |
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/CoNLLU2TIGER.java (revision 4012) | ||
---|---|---|
67 | 67 |
|
68 | 68 |
for (File conlluFile : conlluFiles) { |
69 | 69 |
|
70 |
|
|
71 | 70 |
String filename = FileUtils.stripExtension(conlluFile); |
72 | 71 |
File tigerXMLFile = new File(tigerDirectory, filename+".xml"); |
73 | 72 |
tigerOutput = new BufferedOutputStream(new FileOutputStream(tigerXMLFile), 16 * 1024); |
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4012) | ||
---|---|---|
37 | 37 |
if (s != "" && s != "_") { |
38 | 38 |
|
39 | 39 |
def ssset = new HashSet(sss); |
40 |
if (ssset.size() == 1) return ssset.join(" + ")
|
|
40 |
if (ssset.size() == 1) return ssset.join(".")
|
|
41 | 41 |
|
42 |
return sss.join(" + ")
|
|
42 |
return sss.join(".")
|
|
43 | 43 |
} |
44 | 44 |
} |
45 | 45 |
|
... | ... | |
67 | 67 |
files.sort() |
68 | 68 |
|
69 | 69 |
// Keep or not contractions |
70 |
String keepContractions = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
|
|
70 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
|
|
71 | 71 |
|
72 |
println "Contractions managment ($keepContractions) & add XmlId if necessary & remove empty nodes"
|
|
72 |
println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
|
|
73 | 73 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
74 | 74 |
for (File conlluFile : files) { |
75 | 75 |
cpb_texts.tick() |
76 | 76 |
if (conlluFile.getName().endsWith(".conllu")) { |
77 | 77 |
String textid = FileUtils.stripExtension(conlluFile) |
78 | 78 |
int wcounter = 1; |
79 |
|
|
79 | 80 |
ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
81 |
|
|
82 |
def temp_multiwords = [:] |
|
83 |
|
|
80 | 84 |
for (int i = 0 ; i < lines.size() ; i++) { |
81 | 85 |
String line = lines[i] |
82 | 86 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue; |
... | ... | |
98 | 102 |
continue; // next ! |
99 | 103 |
} |
100 | 104 |
|
101 |
if (keepContractions == UDPreferences.ALL) {
|
|
105 |
if (contractionsManagement == UDPreferences.ALL) {
|
|
102 | 106 |
// ok on fait rien |
103 |
} else if (keepContractions == UDPreferences.SYNTAX) {
|
|
107 |
} else if (contractionsManagement == UDPreferences.SYNTAX) {
|
|
104 | 108 |
if (split[0].contains("-")) { |
109 |
|
|
110 |
// stores the syntactic word id and the orthographic word properties |
|
111 |
temp_multiwords = [:] |
|
112 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-"))); |
|
113 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-"))); |
|
114 |
for (int ii = n1 ; ii <= n2 ; ii++) { |
|
115 |
temp_multiwords[""+ii] = split; |
|
116 |
} |
|
117 |
|
|
105 | 118 |
//println "REMOVE - $split" |
106 | 119 |
lines.remove(i) |
107 | 120 |
i-- |
108 | 121 |
continue; /// next ! |
122 |
} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word -> add the orthographic form in the misc field |
|
123 |
def split_ortho = temp_multiwords.remove(split[0]) |
|
124 |
|
|
125 |
if (split[9].length() > 0) split[9] += "|" |
|
126 |
split[9] += "multiword="+split_ortho[1] // the orthographic form |
|
127 |
|
|
109 | 128 |
} |
110 |
} else if (keepContractions == UDPreferences.SURFACE) {
|
|
129 |
} else if (contractionsManagement == UDPreferences.SURFACE) {
|
|
111 | 130 |
if (split[0].contains("-")) { |
112 | 131 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-"))); |
113 | 132 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-"))); |
... | ... | |
116 | 135 |
split[0] = ""+n1 |
117 | 136 |
|
118 | 137 |
def splits = [] |
119 |
for (int j = 1 ; j <= n ;j++) {
|
|
120 |
def tmp = lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
|
138 |
for (int j = 0 ; j <= n ;j++) {
|
|
139 |
def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
121 | 140 |
if (tmp[0].contains(".")) { |
122 | 141 |
//println "PRE-REMOVE EMPTY NODE: $split : "+ |
123 | 142 |
lines.remove(i+j) |
... | ... | |
126 | 145 |
} |
127 | 146 |
splits << tmp |
128 | 147 |
} |
129 |
|
|
148 |
|
|
130 | 149 |
for (int j = 2 ; j < 8 ; j++) { |
131 | 150 |
split[j] = merge(split[j], splits.collect(){it[j]}) |
132 | 151 |
} |
133 |
|
|
134 |
if (split[9].length() > 0) split[9] += "|" |
|
135 |
split[9] += "expand="+splits.collect(){it[1]}.join("_") |
|
136 |
|
|
137 |
println "REMOVE non- $split" |
|
152 |
|
|
153 |
//println "REMOVE non- $split" |
|
138 | 154 |
for (int j = 0 ; j <= n ;j++) { |
139 | 155 |
lines.remove(i+1) |
140 | 156 |
} |
141 |
println "splits=$splits" |
|
157 |
//println "splits=$splits"
|
|
142 | 158 |
} |
143 | 159 |
} |
144 | 160 |
|
... | ... | |
262 | 278 |
|
263 | 279 |
def printNewLines = "true" == UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES, ""+UDPreferences.getInstance().getString(UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES)) |
264 | 280 |
|
281 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
282 |
|
|
265 | 283 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
266 | 284 |
|
267 | 285 |
println "Parsing CoNLL-U files..." |
... | ... | |
403 | 421 |
ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject) |
404 | 422 |
} |
405 | 423 |
|
424 |
if (printNewLines) { |
|
425 |
writer.writeStartElement("p") |
|
426 |
writer.writeAttribute("type", "sentence") |
|
427 |
} |
|
428 |
|
|
406 | 429 |
for (def word : words) { |
407 | 430 |
|
408 | 431 |
String id = null |
... | ... | |
431 | 454 |
writer.writeCharacters(" ") |
432 | 455 |
} |
433 | 456 |
|
434 |
if (printNewLines) writer.writeEmptyElement("lb")
|
|
457 |
if (printNewLines) writer.writeEndElement()
|
|
435 | 458 |
|
436 | 459 |
writer.writeCharacters("\n") |
437 | 460 |
writer.writeEndElement() // s |
... | ... | |
446 | 469 |
writer.writeCharacters("\n") |
447 | 470 |
writer.writeEndElement() // TEI |
448 | 471 |
writer.close() |
449 |
|
|
450 | 472 |
} |
451 | 473 |
|
452 | 474 |
cpb_texts.done() |
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 4012) | ||
---|---|---|
23 | 23 |
import org.txm.tigersearch.preferences.TigerSearchPreferences |
24 | 24 |
import org.txm.conllu.core.CallUD2TigerPerlScript |
25 | 25 |
|
26 |
import org.txm.conllu.core.FixDriverFile |
|
27 |
|
|
26 | 28 |
class CoNLLUImport extends XTZImport { |
27 | 29 |
|
28 | 30 |
public CoNLLUImport(Project params) { |
... | ... | |
102 | 104 |
} |
103 | 105 |
} |
104 | 106 |
|
105 |
// patch the subcorpus tags in the driver XML file with the right corpus order |
|
107 |
// patch the subcorpus tags in the driver XML file with the right corpus order : 1) the text order 2) the properties values
|
|
106 | 108 |
File driver = new File(this.binaryDirectory, "tiger-xml/"+driverFilename) |
107 | 109 |
String content = IOUtils.getText(driver, "UTF-8"); |
108 | 110 |
content = content.replaceAll("<subcorpus .+\n", ""); |
... | ... | |
112 | 114 |
subcorpusList += "<subcorpus name=\"$name\" external=\"file:${name}.xml\"/>\n" |
113 | 115 |
} |
114 | 116 |
content = content.replaceAll("<body>", "<body>\n"+subcorpusList+"\n"); // get the last main.xml content and patch it with the subcorpus tags |
117 |
|
|
118 |
// write the modified driver file |
|
115 | 119 |
IOUtils.setText(driver, content, "UTF-8"); |
120 |
|
|
121 |
|
|
122 |
FixDriverFile.fixFeatureValues(driver, tigerxmlFiles) |
|
123 |
|
|
124 |
|
|
125 |
|
|
126 |
|
|
127 |
|
|
116 | 128 |
// build TIGER indexes |
117 | 129 |
if (isSuccessful) { |
118 | 130 |
// read from the 'tiger-xml' and write to the 'tiger' directory |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 4012) | ||
---|---|---|
449 | 449 |
case "p": |
450 | 450 |
//case "lg": |
451 | 451 |
pagedWriter.write("\n") |
452 |
if ("p".equals(type)) type = null; |
|
452 | 453 |
pagedWriter.writeStartElement("p", ["class":rend, "type":type]) |
453 | 454 |
break; |
454 | 455 |
case "ab": |
Formats disponibles : Unified diff