/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

     		gdata2.colspan = 3; // one line
     		depsPropertiesText.setLayoutData(gdata2);
     		printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Print a newline after a sentence in editions", SWT.CHECK);
     		printNewLinesInEditionsButton = toolkit.createButton(sectionClient, "Format sentences in edition/Formatage des phrases dans l'édition", SWT.CHECK);
     		gdata2 = getButtonLayoutData();
     		gdata2.colspan = 4; // one line
     		printNewLinesInEditionsButton.setLayoutData(gdata2);

     package org.txm.conllu.core;
     import java.io.File;
     import java.io.IOException;
     import java.util.Arrays;
     import java.util.HashMap;
     import java.util.HashSet;
     import java.util.List;
     import javax.xml.parsers.ParserConfigurationException;
     import org.txm.utils.xml.DomUtils;
     import org.w3c.dom.Document;
     import org.w3c.dom.Element;
     import org.w3c.dom.NodeList;
     import org.xml.sax.SAXException;
     /**
+     *
      * @author mdecorde
+     *
      */
     public class FixDriverFile {
     	public static boolean fixFeatureValues(File driverFile, List<File> xmlFiles) throws ParserConfigurationException, SAXException, IOException {
     		HashMap<String, HashSet<String>> declaredFeatures = new HashMap<String, HashSet<String>>();
     		HashMap<String, HashSet<String>> missingFeatures = new HashMap<String, HashSet<String>>();
     		HashMap<String, Element> featuresElements = new HashMap<String, Element>();
     		Document doc = DomUtils.load(driverFile);
     		NodeList featuresList = doc.getElementsByTagName("feature");
     		for (int i = 0 ; i < featuresList.getLength() ; i++) {
     			Element f = (Element) featuresList.item(i);
     			featuresElements.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), f);
     			HashSet<String> values = new HashSet<String>();
     			declaredFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), values);
     			missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new HashSet<String>());
     			NodeList featureValuesList = f.getElementsByTagName("value");
     			for (int j = 0 ; j < featureValuesList.getLength() ; j++) {
     				Element v = (Element) featureValuesList.item(j);
     				values.add(v.getAttribute("name"));
+    			}
+    		}
     		//System.out.println("Declared: "+declaredFeatures.keySet());
     		for (File xmlFile : xmlFiles) {
     			Document doc2 = DomUtils.load(xmlFile);
     			NodeList tList = doc2.getElementsByTagName("t");
     			for (int i = 0 ; i < tList.getLength() ; i++) {
     				Element e = (Element) tList.item(i);
     				//System.out.println("T="+e.getAttributes());
     				for (int j = 0 ; j < e.getAttributes().getLength() ; j++) {
     					String name = e.getAttributes().item(j).getLocalName()+ "\tT";
     					String value = e.getAttributes().item(j).getNodeValue();
     					if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) {
     						HashSet<String> existingValues = declaredFeatures.get(name);
     						if (existingValues.contains(value)) {
     							// ok
     						} else {
     							missingFeatures.get(name).add(value);
+    						}
+    					}
+    				}
+    			}
     			tList = doc2.getElementsByTagName("nt");
     			for (int i = 0 ; i < tList.getLength() ; i++) {
     				Element e = (Element) tList.item(i);
     				//System.out.println("NT="+e.getAttributes());
     				for (int j = 0 ; j < e.getAttributes().getLength() ; j++) {
     					String name = e.getAttributes().item(j).getLocalName()+ "\tNT";
     					String value = e.getAttributes().item(j).getNodeValue();
     					if (declaredFeatures.containsKey(name) && declaredFeatures.get(name).size() > 0) {
     						HashSet<String> existingValues = declaredFeatures.get(name);
     						if (existingValues.contains(value)) {
     							// ok
     						} else {
     							missingFeatures.get(name).add(value);
+    						}
+    					}
+    				}
+    			}
+    		}
     		//System.out.println("Missing values:");
     		for (String missingFeatureNamedomain : missingFeatures.keySet()) {
     			if (missingFeatures.get(missingFeatureNamedomain).size() == 0) continue;
     			//System.out.println("\t"+missingFeatureNamedomain);
     			Element f = featuresElements.get(missingFeatureNamedomain);
     			for (String v : missingFeatures.get(missingFeatureNamedomain)) {
     				//System.out.println("\t\t"+v);
     				Element missingFeatureElement = f.getOwnerDocument().createElement("value");
     				missingFeatureElement.setAttribute("name", v);
     				missingFeatureElement.setTextContent(v);
     				f.appendChild(missingFeatureElement);
+    			}
+    		}
     		return DomUtils.save(doc, driverFile);
+    	}
     	public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
     		File[] files = {new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1959.xml"),
     				new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1960.xml"),
     				new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/1961.xml")};
     		FixDriverFile.fixFeatureValues(new File("/home/mdecorde/runtime-rcpapplication.product/corpora/VOEUX-CONLLU/tiger-xml/corpus.xml"), Arrays.asList(files));
+    	}
+    }

TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/CoNLLU2TIGER.java (revision 4012)
67	67
68	68	for (File conlluFile : conlluFiles) {
69	69
70
71	70	String filename = FileUtils.stripExtension(conlluFile);
72	71	File tigerXMLFile = new File(tigerDirectory, filename+".xml");
73	72	tigerOutput = new BufferedOutputStream(new FileOutputStream(tigerXMLFile), 16 * 1024);

     			if (s != "" && s != "_") {
     				def ssset = new HashSet(sss);
     				if (ssset.size() == 1) return ssset.join(" + ")
     				if (ssset.size() == 1) return ssset.join(".")
     				return sss.join(" + ")
     				return sss.join(".")
+    			}
+    		}
-...
     		files.sort()
     		// Keep or not contractions
     		String keepContractions =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
     		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
     			println "Contractions managment ($keepContractions) & add XmlId if necessary & remove empty nodes"
     			println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
     			ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     			for (File conlluFile : files) {
     				cpb_texts.tick()
     				if (conlluFile.getName().endsWith(".conllu")) {
     					String textid = FileUtils.stripExtension(conlluFile)
     					int wcounter = 1;
     					ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
     					def temp_multiwords = [:]
     					for (int i = 0 ; i < lines.size() ; i++) {
     						String line = lines[i]
     						if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
-...
     							continue; // next !
+    						}
     						if (keepContractions == UDPreferences.ALL) {
     						if (contractionsManagement == UDPreferences.ALL) {
     							// ok on fait rien
     						} else if (keepContractions == UDPreferences.SYNTAX) {
     						} else if (contractionsManagement == UDPreferences.SYNTAX) {
     							if (split[0].contains("-")) {
     								// stores the syntactic word id and the orthographic word properties
     								temp_multiwords = [:]
     								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
     								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
     								for (int ii = n1 ; ii <= n2 ; ii++) {
     									temp_multiwords[""+ii] = split;
+    								}
     								//println "REMOVE - $split"
     								lines.remove(i)
     								i--
     								continue; /// next !
     							} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word -> add the orthographic form in the misc field
     								def split_ortho = temp_multiwords.remove(split[0])
     								if (split[9].length() > 0) split[9] += "|"
     								split[9] += "multiword="+split_ortho[1] // the orthographic form
+    							}
     						} else if (keepContractions == UDPreferences.SURFACE) {
     						} else if (contractionsManagement == UDPreferences.SURFACE) {
     							if (split[0].contains("-")) {
     								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
     								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
-...
     								split[0] = ""+n1
     								def splits = []
     								for (int j = 1 ; j <= n ;j++) {
     									def tmp = lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     								for (int j = 0 ; j <= n ;j++) {
     									def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     									if (tmp[0].contains(".")) {
     										//println "PRE-REMOVE EMPTY NODE: $split : "+
     										lines.remove(i+j)
-...
+    									}
     									splits << tmp
+    								}
     								for (int j = 2 ; j < 8 ; j++) {
     									split[j] = merge(split[j], splits.collect(){it[j]})
+    								}
     								if (split[9].length() > 0) split[9] += "|"
     								split[9] += "expand="+splits.collect(){it[1]}.join("_")
     								println "REMOVE non- $split"
     								//println "REMOVE non- $split"
     								for (int j = 0 ; j <= n ;j++) {
     									lines.remove(i+1)
+    								}
     								println "splits=$splits"
     								//println "splits=$splits"
+    							}
+    						}
-...
     		def printNewLines = "true" == UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES, ""+UDPreferences.getInstance().getString(UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES))
     		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
     		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     		println "Parsing CoNLL-U files..."
-...
     					ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject)
+    				}
     				if (printNewLines) {
     					 writer.writeStartElement("p")
     					 writer.writeAttribute("type", "sentence")
+    				}
     				for (def word : words) {
     					String id = null
-...
     					writer.writeCharacters(" ")
+    				}
     				if (printNewLines) writer.writeEmptyElement("lb")
     				if (printNewLines) writer.writeEndElement()
     				writer.writeCharacters("\n")
     				writer.writeEndElement() // s
-...
     			writer.writeCharacters("\n")
     			writer.writeEndElement() // TEI
     			writer.close()
+    		}
     		cpb_texts.done()

     import org.txm.tigersearch.preferences.TigerSearchPreferences
     import org.txm.conllu.core.CallUD2TigerPerlScript
     import org.txm.conllu.core.FixDriverFile
     class CoNLLUImport extends XTZImport {
     	public CoNLLUImport(Project params) {
-...
+    				}
+    			}
     			// patch the subcorpus tags in the driver XML file with the right corpus order
     			// patch the subcorpus tags in the driver XML file with the right corpus order : 1) the text order 2) the properties values
     			File driver = new File(this.binaryDirectory, "tiger-xml/"+driverFilename)
     			String content = IOUtils.getText(driver, "UTF-8");
     			content = content.replaceAll("<subcorpus .+\n", "");
-...
     				subcorpusList += "<subcorpus name=\"$name\" external=\"file:${name}.xml\"/>\n"
+    			}
     			content = content.replaceAll("<body>", "<body>\n"+subcorpusList+"\n"); // get the last main.xml content and patch it with the subcorpus tags
     			// write the modified driver file
     			IOUtils.setText(driver, content, "UTF-8");
     			FixDriverFile.fixFeatureValues(driver, tigerxmlFiles)
     			// build TIGER indexes
     			if (isSuccessful) {
     				// read from the 'tiger-xml' and write to the 'tiger' directory

TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 4012)
449	449	case "p":
450	450	//case "lg":
451	451	pagedWriter.write("\n")
	452	if ("p".equals(type)) type = null;
452	453	pagedWriter.writeStartElement("p", ["class":rend, "type":type])
453	454	break;
454	455	case "ab":

Laboratoire ICAR » Plateforme TXM

Révision 4012