Révision 4012
| TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 4012) | ||
|---|---|---|
| 108 | 108 |
gdata2.colspan = 3; // one line |
| 109 | 109 |
depsPropertiesText.setLayoutData(gdata2); |
| 110 | 110 |
|
| 111 |
printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Print a newline after a sentence in editions", SWT.CHECK);
|
|
| 111 |
printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Format sentences in edition/Formatage des phrases dans l'édition", SWT.CHECK);
|
|
| 112 | 112 |
gdata2 = getButtonLayoutData(); |
| 113 | 113 |
gdata2.colspan = 4; // one line |
| 114 | 114 |
printNewLinesInEditionsButton.setLayoutData(gdata2); |
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/FixDriverFile.java (revision 4012) | ||
|---|---|---|
| 1 |
package org.txm.conllu.core; |
|
| 2 |
|
|
| 3 |
import java.io.File; |
|
| 4 |
import java.io.IOException; |
|
| 5 |
import java.util.Arrays; |
|
| 6 |
import java.util.HashMap; |
|
| 7 |
import java.util.HashSet; |
|
| 8 |
import java.util.List; |
|
| 9 |
|
|
| 10 |
import javax.xml.parsers.ParserConfigurationException; |
|
| 11 |
|
|
| 12 |
import org.txm.utils.xml.DomUtils; |
|
| 13 |
import org.w3c.dom.Document; |
|
| 14 |
import org.w3c.dom.Element; |
|
| 15 |
import org.w3c.dom.NodeList; |
|
| 16 |
import org.xml.sax.SAXException; |
|
| 17 |
|
|
| 18 |
/** |
|
| 19 |
* |
|
| 20 |
* @author mdecorde |
|
| 21 |
* |
|
| 22 |
*/ |
|
| 23 |
public class FixDriverFile {
|
|
| 24 |
|
|
| 25 |
public static boolean fixFeatureValues(File driverFile, List<File> xmlFiles) throws ParserConfigurationException, SAXException, IOException {
|
|
| 26 |
|
|
| 27 |
HashMap<String, HashSet<String>> declaredFeatures = new HashMap<String, HashSet<String>>(); |
|
| 28 |
HashMap<String, HashSet<String>> missingFeatures = new HashMap<String, HashSet<String>>(); |
|
| 29 |
HashMap<String, Element> featuresElements = new HashMap<String, Element>(); |
|
| 30 |
|
|
| 31 |
Document doc = DomUtils.load(driverFile); |
|
| 32 |
NodeList featuresList = doc.getElementsByTagName("feature");
|
|
| 33 |
for (int i = 0 ; i < featuresList.getLength() ; i++) {
|
|
| 34 |
Element f = (Element) featuresList.item(i); |
|
| 35 |
|
|
| 36 |
featuresElements.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), f);
|
|
| 37 |
HashSet<String> values = new HashSet<String>(); |
|
| 38 |
declaredFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), values);
|
|
| 39 |
missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new HashSet<String>());
|
|
| 40 |
|
|
| 41 |
NodeList featureValuesList = f.getElementsByTagName("value");
|
|
| 42 |
for (int j = 0 ; j < featureValuesList.getLength() ; j++) {
|
|
| 43 |
Element v = (Element) featureValuesList.item(j); |
|
| 44 |
values.add(v.getAttribute("name"));
|
|
| 45 |
} |
|
| 46 |
} |
|
| 47 |
//System.out.println("Declared: "+declaredFeatures.keySet());
|
|
| 48 |
|
|
| 49 |
for (File xmlFile : xmlFiles) {
|
|
| 50 |
Document doc2 = DomUtils.load(xmlFile); |
|
| 51 |
NodeList tList = doc2.getElementsByTagName("t");
|
|
| 52 |
for (int i = 0 ; i < tList.getLength() ; i++) {
|
|
| 53 |
Element e = (Element) tList.item(i); |
|
| 54 |
//System.out.println("T="+e.getAttributes());
|
|
| 55 |
for (int j = 0 ; j < e.getAttributes().getLength() ; j++) {
|
|
| 56 |
|
|
| 57 |
String name = e.getAttributes().item(j).getLocalName()+ "\tT"; |
|
| 58 |
String value = e.getAttributes().item(j).getNodeValue(); |
|
| 59 |
|
|
| 60 |
if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) {
|
|
| 61 |
HashSet<String> existingValues = declaredFeatures.get(name); |
|
| 62 |
if (existingValues.contains(value)) {
|
|
| 63 |
// ok |
|
| 64 |
} else {
|
|
| 65 |
missingFeatures.get(name).add(value); |
|
| 66 |
} |
|
| 67 |
} |
|
| 68 |
} |
|
| 69 |
|
|
| 70 |
} |
|
| 71 |
tList = doc2.getElementsByTagName("nt");
|
|
| 72 |
for (int i = 0 ; i < tList.getLength() ; i++) {
|
|
| 73 |
Element e = (Element) tList.item(i); |
|
| 74 |
//System.out.println("NT="+e.getAttributes());
|
|
| 75 |
for (int j = 0 ; j < e.getAttributes().getLength() ; j++) {
|
|
| 76 |
|
|
| 77 |
String name = e.getAttributes().item(j).getLocalName()+ "\tNT"; |
|
| 78 |
String value = e.getAttributes().item(j).getNodeValue(); |
|
| 79 |
|
|
| 80 |
if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) {
|
|
| 81 |
HashSet<String> existingValues = declaredFeatures.get(name); |
|
| 82 |
if (existingValues.contains(value)) {
|
|
| 83 |
// ok |
|
| 84 |
} else {
|
|
| 85 |
missingFeatures.get(name).add(value); |
|
| 86 |
} |
|
| 87 |
} |
|
| 88 |
} |
|
| 89 |
} |
|
| 90 |
} |
|
| 91 |
|
|
| 92 |
//System.out.println("Missing values:");
|
|
| 93 |
for (String missingFeatureNamedomain : missingFeatures.keySet()) {
|
|
| 94 |
if (missingFeatures.get(missingFeatureNamedomain).size() == 0) continue; |
|
| 95 |
//System.out.println("\t"+missingFeatureNamedomain);
|
|
| 96 |
|
|
| 97 |
Element f = featuresElements.get(missingFeatureNamedomain); |
|
| 98 |
|
|
| 99 |
for (String v : missingFeatures.get(missingFeatureNamedomain)) {
|
|
| 100 |
//System.out.println("\t\t"+v);
|
|
| 101 |
Element missingFeatureElement = f.getOwnerDocument().createElement("value");
|
|
| 102 |
missingFeatureElement.setAttribute("name", v);
|
|
| 103 |
missingFeatureElement.setTextContent(v); |
|
| 104 |
f.appendChild(missingFeatureElement); |
|
| 105 |
} |
|
| 106 |
} |
|
| 107 |
|
|
| 108 |
return DomUtils.save(doc, driverFile); |
|
| 109 |
} |
|
| 110 |
|
|
| 111 |
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
|
|
| 112 |
File[] files = {new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1959.xml"),
|
|
| 113 |
new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1960.xml"),
|
|
| 114 |
new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1961.xml")};
|
|
| 115 |
|
|
| 116 |
FixDriverFile.fixFeatureValues(new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/corpus.xml"), Arrays.asList(files));
|
|
| 117 |
} |
|
| 118 |
} |
|
| 0 | 119 | |
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/CoNLLU2TIGER.java (revision 4012) | ||
|---|---|---|
| 67 | 67 |
|
| 68 | 68 |
for (File conlluFile : conlluFiles) {
|
| 69 | 69 |
|
| 70 |
|
|
| 71 | 70 |
String filename = FileUtils.stripExtension(conlluFile); |
| 72 | 71 |
File tigerXMLFile = new File(tigerDirectory, filename+".xml"); |
| 73 | 72 |
tigerOutput = new BufferedOutputStream(new FileOutputStream(tigerXMLFile), 16 * 1024); |
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4012) | ||
|---|---|---|
| 37 | 37 |
if (s != "" && s != "_") {
|
| 38 | 38 |
|
| 39 | 39 |
def ssset = new HashSet(sss); |
| 40 |
if (ssset.size() == 1) return ssset.join(" + ")
|
|
| 40 |
if (ssset.size() == 1) return ssset.join(".")
|
|
| 41 | 41 |
|
| 42 |
return sss.join(" + ")
|
|
| 42 |
return sss.join(".")
|
|
| 43 | 43 |
} |
| 44 | 44 |
} |
| 45 | 45 |
|
| ... | ... | |
| 67 | 67 |
files.sort() |
| 68 | 68 |
|
| 69 | 69 |
// Keep or not contractions |
| 70 |
String keepContractions = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
|
|
| 70 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
|
|
| 71 | 71 |
|
| 72 |
println "Contractions managment ($keepContractions) & add XmlId if necessary & remove empty nodes"
|
|
| 72 |
println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
|
|
| 73 | 73 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
| 74 | 74 |
for (File conlluFile : files) {
|
| 75 | 75 |
cpb_texts.tick() |
| 76 | 76 |
if (conlluFile.getName().endsWith(".conllu")) {
|
| 77 | 77 |
String textid = FileUtils.stripExtension(conlluFile) |
| 78 | 78 |
int wcounter = 1; |
| 79 |
|
|
| 79 | 80 |
ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8"); |
| 81 |
|
|
| 82 |
def temp_multiwords = [:] |
|
| 83 |
|
|
| 80 | 84 |
for (int i = 0 ; i < lines.size() ; i++) {
|
| 81 | 85 |
String line = lines[i] |
| 82 | 86 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
| ... | ... | |
| 98 | 102 |
continue; // next ! |
| 99 | 103 |
} |
| 100 | 104 |
|
| 101 |
if (keepContractions == UDPreferences.ALL) {
|
|
| 105 |
if (contractionsManagement == UDPreferences.ALL) {
|
|
| 102 | 106 |
// ok on fait rien |
| 103 |
} else if (keepContractions == UDPreferences.SYNTAX) {
|
|
| 107 |
} else if (contractionsManagement == UDPreferences.SYNTAX) {
|
|
| 104 | 108 |
if (split[0].contains("-")) {
|
| 109 |
|
|
| 110 |
// stores the syntactic word id and the orthographic word properties |
|
| 111 |
temp_multiwords = [:] |
|
| 112 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
|
|
| 113 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
|
| 114 |
for (int ii = n1 ; ii <= n2 ; ii++) {
|
|
| 115 |
temp_multiwords[""+ii] = split; |
|
| 116 |
} |
|
| 117 |
|
|
| 105 | 118 |
//println "REMOVE - $split" |
| 106 | 119 |
lines.remove(i) |
| 107 | 120 |
i-- |
| 108 | 121 |
continue; /// next ! |
| 122 |
} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word -> add the orthographic form in the misc field
|
|
| 123 |
def split_ortho = temp_multiwords.remove(split[0]) |
|
| 124 |
|
|
| 125 |
if (split[9].length() > 0) split[9] += "|" |
|
| 126 |
split[9] += "multiword="+split_ortho[1] // the orthographic form |
|
| 127 |
|
|
| 109 | 128 |
} |
| 110 |
} else if (keepContractions == UDPreferences.SURFACE) {
|
|
| 129 |
} else if (contractionsManagement == UDPreferences.SURFACE) {
|
|
| 111 | 130 |
if (split[0].contains("-")) {
|
| 112 | 131 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
|
| 113 | 132 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
| ... | ... | |
| 116 | 135 |
split[0] = ""+n1 |
| 117 | 136 |
|
| 118 | 137 |
def splits = [] |
| 119 |
for (int j = 1 ; j <= n ;j++) {
|
|
| 120 |
def tmp = lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 138 |
for (int j = 0 ; j <= n ;j++) {
|
|
| 139 |
def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
|
| 121 | 140 |
if (tmp[0].contains(".")) {
|
| 122 | 141 |
//println "PRE-REMOVE EMPTY NODE: $split : "+ |
| 123 | 142 |
lines.remove(i+j) |
| ... | ... | |
| 126 | 145 |
} |
| 127 | 146 |
splits << tmp |
| 128 | 147 |
} |
| 129 |
|
|
| 148 |
|
|
| 130 | 149 |
for (int j = 2 ; j < 8 ; j++) {
|
| 131 | 150 |
split[j] = merge(split[j], splits.collect(){it[j]})
|
| 132 | 151 |
} |
| 133 |
|
|
| 134 |
if (split[9].length() > 0) split[9] += "|" |
|
| 135 |
split[9] += "expand="+splits.collect(){it[1]}.join("_")
|
|
| 136 |
|
|
| 137 |
println "REMOVE non- $split" |
|
| 152 |
|
|
| 153 |
//println "REMOVE non- $split" |
|
| 138 | 154 |
for (int j = 0 ; j <= n ;j++) {
|
| 139 | 155 |
lines.remove(i+1) |
| 140 | 156 |
} |
| 141 |
println "splits=$splits" |
|
| 157 |
//println "splits=$splits"
|
|
| 142 | 158 |
} |
| 143 | 159 |
} |
| 144 | 160 |
|
| ... | ... | |
| 262 | 278 |
|
| 263 | 279 |
def printNewLines = "true" == UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES, ""+UDPreferences.getInstance().getString(UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES)) |
| 264 | 280 |
|
| 281 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
| 282 |
|
|
| 265 | 283 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
| 266 | 284 |
|
| 267 | 285 |
println "Parsing CoNLL-U files..." |
| ... | ... | |
| 403 | 421 |
ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject) |
| 404 | 422 |
} |
| 405 | 423 |
|
| 424 |
if (printNewLines) {
|
|
| 425 |
writer.writeStartElement("p")
|
|
| 426 |
writer.writeAttribute("type", "sentence")
|
|
| 427 |
} |
|
| 428 |
|
|
| 406 | 429 |
for (def word : words) {
|
| 407 | 430 |
|
| 408 | 431 |
String id = null |
| ... | ... | |
| 431 | 454 |
writer.writeCharacters(" ")
|
| 432 | 455 |
} |
| 433 | 456 |
|
| 434 |
if (printNewLines) writer.writeEmptyElement("lb")
|
|
| 457 |
if (printNewLines) writer.writeEndElement()
|
|
| 435 | 458 |
|
| 436 | 459 |
writer.writeCharacters("\n")
|
| 437 | 460 |
writer.writeEndElement() // s |
| ... | ... | |
| 446 | 469 |
writer.writeCharacters("\n")
|
| 447 | 470 |
writer.writeEndElement() // TEI |
| 448 | 471 |
writer.close() |
| 449 |
|
|
| 450 | 472 |
} |
| 451 | 473 |
|
| 452 | 474 |
cpb_texts.done() |
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 4012) | ||
|---|---|---|
| 23 | 23 |
import org.txm.tigersearch.preferences.TigerSearchPreferences |
| 24 | 24 |
import org.txm.conllu.core.CallUD2TigerPerlScript |
| 25 | 25 |
|
| 26 |
import org.txm.conllu.core.FixDriverFile |
|
| 27 |
|
|
| 26 | 28 |
class CoNLLUImport extends XTZImport {
|
| 27 | 29 |
|
| 28 | 30 |
public CoNLLUImport(Project params) {
|
| ... | ... | |
| 102 | 104 |
} |
| 103 | 105 |
} |
| 104 | 106 |
|
| 105 |
// patch the subcorpus tags in the driver XML file with the right corpus order |
|
| 107 |
// patch the subcorpus tags in the driver XML file with the right corpus order : 1) the text order 2) the properties values
|
|
| 106 | 108 |
File driver = new File(this.binaryDirectory, "tiger-xml/"+driverFilename) |
| 107 | 109 |
String content = IOUtils.getText(driver, "UTF-8"); |
| 108 | 110 |
content = content.replaceAll("<subcorpus .+\n", "");
|
| ... | ... | |
| 112 | 114 |
subcorpusList += "<subcorpus name=\"$name\" external=\"file:${name}.xml\"/>\n"
|
| 113 | 115 |
} |
| 114 | 116 |
content = content.replaceAll("<body>", "<body>\n"+subcorpusList+"\n"); // get the last main.xml content and patch it with the subcorpus tags
|
| 117 |
|
|
| 118 |
// write the modified driver file |
|
| 115 | 119 |
IOUtils.setText(driver, content, "UTF-8"); |
| 120 |
|
|
| 121 |
|
|
| 122 |
FixDriverFile.fixFeatureValues(driver, tigerxmlFiles) |
|
| 123 |
|
|
| 124 |
|
|
| 125 |
|
|
| 126 |
|
|
| 127 |
|
|
| 116 | 128 |
// build TIGER indexes |
| 117 | 129 |
if (isSuccessful) {
|
| 118 | 130 |
// read from the 'tiger-xml' and write to the 'tiger' directory |
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 4012) | ||
|---|---|---|
| 449 | 449 |
case "p": |
| 450 | 450 |
//case "lg": |
| 451 | 451 |
pagedWriter.write("\n")
|
| 452 |
if ("p".equals(type)) type = null;
|
|
| 452 | 453 |
pagedWriter.writeStartElement("p", ["class":rend, "type":type])
|
| 453 | 454 |
break; |
| 454 | 455 |
case "ab": |
Formats disponibles : Unified diff