Révision 4012

TXM/trunk/bundles/org.txm.conllu.rcp/src/org/txm/conllu/rcp/importsection/CoNLLUSection.java (revision 4012)
108 108
		gdata2.colspan = 3; // one line
109 109
		depsPropertiesText.setLayoutData(gdata2);
110 110

  
111
		printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Print a newline after a sentence in editions", SWT.CHECK);
111
		printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Format sentences in edition/Formatage des phrases dans l'édition", SWT.CHECK);
112 112
		gdata2 = getButtonLayoutData();
113 113
		gdata2.colspan = 4; // one line
114 114
		printNewLinesInEditionsButton.setLayoutData(gdata2);
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/FixDriverFile.java (revision 4012)
1
package org.txm.conllu.core;
2

  
3
import java.io.File;
4
import java.io.IOException;
5
import java.util.Arrays;
6
import java.util.HashMap;
7
import java.util.HashSet;
8
import java.util.List;
9

  
10
import javax.xml.parsers.ParserConfigurationException;
11

  
12
import org.txm.utils.xml.DomUtils;
13
import org.w3c.dom.Document;
14
import org.w3c.dom.Element;
15
import org.w3c.dom.NodeList;
16
import org.xml.sax.SAXException;
17

  
18
/**
19
 * 
20
 * @author mdecorde
21
 *
22
 */
23
public class FixDriverFile {
24

  
25
	public static boolean fixFeatureValues(File driverFile, List<File> xmlFiles) throws ParserConfigurationException, SAXException, IOException {
26

  
27
		HashMap<String, HashSet<String>> declaredFeatures = new HashMap<String, HashSet<String>>();
28
		HashMap<String, HashSet<String>> missingFeatures = new HashMap<String, HashSet<String>>();
29
		HashMap<String, Element> featuresElements = new HashMap<String, Element>();
30

  
31
		Document doc = DomUtils.load(driverFile);
32
		NodeList featuresList = doc.getElementsByTagName("feature");
33
		for (int i = 0 ; i < featuresList.getLength() ; i++) {
34
			Element f = (Element) featuresList.item(i);
35

  
36
			featuresElements.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), f);
37
			HashSet<String> values = new HashSet<String>();
38
			declaredFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), values);
39
			missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new HashSet<String>());
40

  
41
			NodeList featureValuesList = f.getElementsByTagName("value");
42
			for (int j = 0 ; j < featureValuesList.getLength() ; j++) {
43
				Element v = (Element) featureValuesList.item(j);
44
				values.add(v.getAttribute("name"));
45
			}
46
		}
47
		//System.out.println("Declared: "+declaredFeatures.keySet());
48

  
49
		for (File xmlFile : xmlFiles) {
50
			Document doc2 = DomUtils.load(xmlFile);
51
			NodeList tList = doc2.getElementsByTagName("t");
52
			for (int i = 0 ; i < tList.getLength() ; i++) {
53
				Element e = (Element) tList.item(i);
54
				//System.out.println("T="+e.getAttributes());
55
				for (int j = 0 ; j < e.getAttributes().getLength() ; j++) {
56
					
57
					String name = e.getAttributes().item(j).getLocalName()+ "\tT";
58
					String value = e.getAttributes().item(j).getNodeValue();
59
					
60
					if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) {
61
						HashSet<String> existingValues = declaredFeatures.get(name);
62
						if (existingValues.contains(value)) {
63
							// ok
64
						} else {
65
							missingFeatures.get(name).add(value);
66
						}
67
					}
68
				}
69

  
70
			}
71
			tList = doc2.getElementsByTagName("nt");
72
			for (int i = 0 ; i < tList.getLength() ; i++) {
73
				Element e = (Element) tList.item(i);
74
				//System.out.println("NT="+e.getAttributes());
75
				for (int j = 0 ; j < e.getAttributes().getLength() ; j++) {
76
					
77
					String name = e.getAttributes().item(j).getLocalName()+ "\tNT";
78
					String value = e.getAttributes().item(j).getNodeValue();
79
					
80
					if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) {
81
						HashSet<String> existingValues = declaredFeatures.get(name);
82
						if (existingValues.contains(value)) {
83
							// ok
84
						} else {
85
							missingFeatures.get(name).add(value);
86
						}
87
					}
88
				}
89
			}
90
		}
91
		
92
		//System.out.println("Missing values:");
93
		for (String missingFeatureNamedomain : missingFeatures.keySet()) {
94
			if (missingFeatures.get(missingFeatureNamedomain).size() == 0) continue;
95
			//System.out.println("\t"+missingFeatureNamedomain);
96
			
97
			Element f = featuresElements.get(missingFeatureNamedomain);
98
			
99
			for (String v : missingFeatures.get(missingFeatureNamedomain)) {
100
				//System.out.println("\t\t"+v);
101
				Element missingFeatureElement = f.getOwnerDocument().createElement("value");
102
				missingFeatureElement.setAttribute("name", v);
103
				missingFeatureElement.setTextContent(v);
104
				f.appendChild(missingFeatureElement);
105
			}
106
		}
107
		
108
		return DomUtils.save(doc, driverFile);
109
	}
110

  
111
	public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
112
		File[] files = {new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1959.xml"),
113
				new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1960.xml"),
114
				new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1961.xml")};
115
		
116
		FixDriverFile.fixFeatureValues(new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/corpus.xml"), Arrays.asList(files));
117
	}
118
}
0 119

  
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/CoNLLU2TIGER.java (revision 4012)
67 67
		
68 68
		for (File conlluFile : conlluFiles) {
69 69
			
70
			
71 70
			String filename = FileUtils.stripExtension(conlluFile);
72 71
			File tigerXMLFile = new File(tigerDirectory, filename+".xml");
73 72
			tigerOutput = new BufferedOutputStream(new FileOutputStream(tigerXMLFile), 16 * 1024);
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4012)
37 37
			if (s != "" && s != "_") {
38 38
				
39 39
				def ssset = new HashSet(sss);
40
				if (ssset.size() == 1) return ssset.join(" + ")
40
				if (ssset.size() == 1) return ssset.join(".")
41 41
				
42
				return sss.join(" + ")
42
				return sss.join(".")
43 43
			}
44 44
		}
45 45
		
......
67 67
		files.sort()
68 68
		
69 69
		// Keep or not contractions
70
		String keepContractions =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
70
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
71 71
		
72
			println "Contractions managment ($keepContractions) & add XmlId if necessary & remove empty nodes"
72
			println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
73 73
			ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
74 74
			for (File conlluFile : files) {
75 75
				cpb_texts.tick()
76 76
				if (conlluFile.getName().endsWith(".conllu")) {
77 77
					String textid = FileUtils.stripExtension(conlluFile)
78 78
					int wcounter = 1;
79
					
79 80
					ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
81
					
82
					def temp_multiwords = [:]
83
					
80 84
					for (int i = 0 ; i < lines.size() ; i++) {
81 85
						String line = lines[i]
82 86
						if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
......
98 102
							continue; // next !
99 103
						}
100 104
						
101
						if (keepContractions == UDPreferences.ALL) {
105
						if (contractionsManagement == UDPreferences.ALL) {
102 106
							// ok on fait rien
103
						} else if (keepContractions == UDPreferences.SYNTAX) {
107
						} else if (contractionsManagement == UDPreferences.SYNTAX) {
104 108
							if (split[0].contains("-")) {
109
								
110
								// stores the syntactic word id and the orthographic word properties
111
								temp_multiwords = [:]
112
								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
113
								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
114
								for (int ii = n1 ; ii <= n2 ; ii++) {
115
									temp_multiwords[""+ii] = split;
116
								}
117
								
105 118
								//println "REMOVE - $split"
106 119
								lines.remove(i)
107 120
								i--
108 121
								continue; /// next !
122
							} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word -> add the orthographic form in the misc field
123
								def split_ortho = temp_multiwords.remove(split[0])
124
								
125
								if (split[9].length() > 0) split[9] += "|"
126
								split[9] += "multiword="+split_ortho[1] // the orthographic form
127
								
109 128
							}
110
						} else if (keepContractions == UDPreferences.SURFACE) {
129
						} else if (contractionsManagement == UDPreferences.SURFACE) {
111 130
							if (split[0].contains("-")) {
112 131
								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
113 132
								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
......
116 135
								split[0] = ""+n1
117 136
								
118 137
								def splits = []
119
								for (int j = 1 ; j <= n ;j++) {
120
									def tmp = lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
138
								for (int j = 0 ; j <= n ;j++) {
139
									def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
121 140
									if (tmp[0].contains(".")) {
122 141
										//println "PRE-REMOVE EMPTY NODE: $split : "+
123 142
										lines.remove(i+j)
......
126 145
									}
127 146
									splits << tmp
128 147
								}
129
								
148

  
130 149
								for (int j = 2 ; j < 8 ; j++) {
131 150
									split[j] = merge(split[j], splits.collect(){it[j]})
132 151
								}
133
								
134
								if (split[9].length() > 0) split[9] += "|"
135
								split[9] += "expand="+splits.collect(){it[1]}.join("_")
136
								
137
								println "REMOVE non- $split"
152
																
153
								//println "REMOVE non- $split"
138 154
								for (int j = 0 ; j <= n ;j++) {
139 155
									lines.remove(i+1)
140 156
								}
141
								println "splits=$splits"
157
								//println "splits=$splits"
142 158
							}
143 159
						}
144 160
						
......
262 278
		
263 279
		def printNewLines = "true" == UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES, ""+UDPreferences.getInstance().getString(UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES))
264 280
		
281
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
282
		
265 283
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
266 284
		
267 285
		println "Parsing CoNLL-U files..."
......
403 421
					ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject)
404 422
				}
405 423
				
424
				if (printNewLines) {
425
					 writer.writeStartElement("p")
426
					 writer.writeAttribute("type", "sentence")
427
				}
428
				
406 429
				for (def word : words) {
407 430
					
408 431
					String id = null
......
431 454
					writer.writeCharacters(" ")
432 455
				}
433 456
				
434
				if (printNewLines) writer.writeEmptyElement("lb")
457
				if (printNewLines) writer.writeEndElement()
435 458
				
436 459
				writer.writeCharacters("\n")
437 460
				writer.writeEndElement() // s
......
446 469
			writer.writeCharacters("\n")
447 470
			writer.writeEndElement() // TEI
448 471
			writer.close()
449
			
450 472
		}
451 473
		
452 474
		cpb_texts.done()
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 4012)
23 23
import org.txm.tigersearch.preferences.TigerSearchPreferences
24 24
import org.txm.conllu.core.CallUD2TigerPerlScript
25 25

  
26
import org.txm.conllu.core.FixDriverFile
27

  
26 28
class CoNLLUImport extends XTZImport {
27 29
	
28 30
	public CoNLLUImport(Project params) {
......
102 104
				}
103 105
			}
104 106
			
105
			// patch the subcorpus tags in the driver XML file with the right corpus order
107
			// patch the subcorpus tags in the driver XML file with the right corpus order : 1) the text order 2) the properties values
106 108
			File driver = new File(this.binaryDirectory, "tiger-xml/"+driverFilename)
107 109
			String content = IOUtils.getText(driver, "UTF-8");
108 110
			content = content.replaceAll("<subcorpus .+\n", "");
......
112 114
				subcorpusList += "<subcorpus name=\"$name\" external=\"file:${name}.xml\"/>\n"
113 115
			}
114 116
			content = content.replaceAll("<body>", "<body>\n"+subcorpusList+"\n"); // get the last main.xml content and patch it with the subcorpus tags
117
			
118
			// write the modified driver file
115 119
			IOUtils.setText(driver, content, "UTF-8");
120
			
121
			
122
			FixDriverFile.fixFeatureValues(driver, tigerxmlFiles)
123
			
124
			
125
		
126
			
127
			
116 128
			// build TIGER indexes
117 129
			if (isSuccessful) {
118 130
				// read from the 'tiger-xml' and write to the 'tiger' directory
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 4012)
449 449
							case "p":
450 450
							//case "lg":
451 451
								pagedWriter.write("\n")
452
								if ("p".equals(type)) type = null;
452 453
								pagedWriter.writeStartElement("p", ["class":rend, "type":type])
453 454
								break;
454 455
							case "ab":

Formats disponibles : Unified diff