/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 4019

     	public final String merge(String orig, def sss) {
     		int n = 0;
     		for (String s : sss) {
     			if (s != "" && s != "_") {
     		if (orig.equals("") || orig.equals("_")) {
     				def ssset = new HashSet(sss);
     				if (ssset.size() == 1) return ssset.join(".")
     				return sss.join(".")
+    			}
     		} else {
     			sss.add(0, orig)
+    		}
     		return orig;
     		def ssset = new LinkedHashSet(sss)
     		return ssset.join(".")
+    	}
     	@Override
-...
     		def files = conlluSrcDirectory.listFiles()
     		files.sort()
     		// Keep or not contractions
     		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
     		println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
     		println "Add XmlId if necessary & remove empty nodes"
     		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     		for (File conlluFile : files) {
     			cpb_texts.tick()
-...
     				ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
     				for (int i = 0 ; i < lines.size() ; i++) {
     					String line = lines[i]
     					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
     					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     					if (split[0].contains(".")) {
     						//println "REMOVE EMPTY NODE: $split : "+
-...
+    						}
+    					}
     					if (contractionsManagement == UDPreferences.ALL) {
     						// ok on fait rien
     					} else if (contractionsManagement == UDPreferences.SYNTAX) {
     						if (split[0].contains("-")) {
     					lines[i] = split.join("\t") // rebuild the line
+    				}
     				IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
+    			}
+    		}
     		cpb_texts.done()
     							// stores the syntatic word id and the ortographic word properties
     							temp_multiwords = [:]
     							int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
     							int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
     							for (int ii = n1 ; ii <= n2 ; ii++) {
     								temp_multiwords[""+ii] = split;
+    							}
     							//println "REMOVE - $split"
     							lines.remove(i)
     							i--
     							continue; /// next !
     						} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
     							def split_ortho = temp_multiwords.remove(split[0])
     		// Keep or not contractions
     		File conlluSrcForTXMDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu-fortxm")
     		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
     							if (split[9].length() > 0) split[9] += "|"
     							split[9] += "multiword="+split_ortho[1] // the orthographic form
+    						}
     					} else if (contractionsManagement == UDPreferences.SURFACE) {
     						if (split[0].contains("-")) {
     							int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
     							int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
     							int n =  n2 - n1
     		if (contractionsManagement == UDPreferences.ALL) {
     			conlluSrcForTXMDirectory = conlluSrcDirectory; // use the same directory as TIGER since no word modifications have been done
     		} else {
     			conlluSrcForTXMDirectory.deleteDir()
     			conlluSrcForTXMDirectory.mkdirs()
     			println "Contractions managment mode is '$contractionsManagement'"
     			cpb_texts = new ConsoleProgressBar(files.size())
     			for (File conlluFile : files) {
     				cpb_texts.tick()
     							//split[0] = ""+n1
     				if (conlluFile.getName().endsWith(".conllu")) {
     							// before merging and deleting words, check if they are the right ones
     							if (lines[i+1].startsWith(""+n1+"\t") && lines[i+n+1].startsWith(""+n2+"\t")) {
     								def splits = []
     								for (int j = 0 ; j <= n ;j++) {
     									def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     									splits << tmp
+    								}
     					File conlluFile2 = new File(conlluSrcForTXMDirectory, conlluFile.getName())
     								for (int j = 2 ; j < 8 ; j++) {
     									split[j] = merge(split[j], splits.collect(){it[j]})
     					String textid = FileUtils.stripExtension(conlluFile)
     					int wcounter = 1;
     					ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
     					def temp_multiwords = [:]
     					for (int i = 0 ; i < lines.size() ; i++) {
     						String line = lines[i]
     						if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
     						def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     						if (contractionsManagement == UDPreferences.SYNTAX) {
     							if (split[0].contains("-")) {
     								// stores the syntatic word id and the ortographic word properties
     								temp_multiwords = [:]
     								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
     								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
     								for (int ii = n1 ; ii <= n2 ; ii++) {
     									temp_multiwords[""+ii] = split;
+    								}
     								//println "REMOVE non- $split"
     								for (int j = 0 ; j <= n ;j++) {
     									lines.remove(i+1)
     								//println "REMOVE - $split"
     								lines.remove(i)
     								i--
     								continue; /// next !
     							} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
     								def split_ortho = temp_multiwords.remove(split[0])
     								if (split[9].length() > 0) split[9] += "|"
     								split[9] += "multiword="+split_ortho[1] // the orthographic form
+    							}
     						} else if (contractionsManagement == UDPreferences.SURFACE) {
     							if (split[0].contains("-")) {
     								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
     								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
     								int n =  n2 - n1
     								//split[0] = ""+n1
     								// before merging and deleting words, check if they are the right ones
     								if (lines[i+1].startsWith(""+n1+"\t") && lines[i+n+1].startsWith(""+n2+"\t")) {
     									def splits = []
     									for (int j = 0 ; j <= n ;j++) {
     										def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     										splits << tmp
+    									}
     									for (int j = 1 ; j < 8 ; j++) {
     										split[j] = merge(split[j], splits.collect(){it[j]})
+    									}
     									//println "REMOVE non- $split"
     									for (int j = 0 ; j <= n ;j++) {
     										lines.remove(i+1)
+    									}
+    								}
     								//println "splits=$splits"
+    							}
     							//println "splits=$splits"
+    						}
     						lines[i] = split.join("\t") // rebuild the line
+    					}
     					lines[i] = split.join("\t") // rebuild the line
     					IOUtils.write(conlluFile2, lines.join("\n") + "\n") // CoNLLU needs the last line
+    				}
     				IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
+    			}
     			cpb_texts.done()
+    		}
     		cpb_texts.done()
     		//		// Fix missing XmlId in conllu files
     		//		println "Setting word XmlID if necessary"
     		//		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     		//		for (File conlluFile : files) {
     		//			cpb_texts.tick()
     		//			if (conlluFile.getName().endsWith(".conllu")) {
     		//				String textid = FileUtils.stripExtension(conlluFile)
     		//				int wcounter = 1;
     		//				ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
     		//				for (int i = 0 ; i < lines.size() ; i++) {
     		//					String line = lines[i]
     		//					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
     		//
     		//					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     		//					if (split[-1] != null && !split[-1].contains("XmlId=")) {
     		//						split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
     		//						lines[i] = split.join("\t")
     		//					}
     		//
     		//				}
     		//				IOUtils.write(conlluFile, lines.join("\n") + "\n")
     		//			}
     		//		}
     		//		cpb_texts.done()
     		File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory)
     		File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
     		srcDirectory.deleteDir()
-...
+    		}
     		println "Convert CoNLL-U to XML-TEI..."
     		convertCoNLLU2TEI(conlluSrcDirectory, srcDirectory, project)
     		convertCoNLLU2TEI(conlluSrcForTXMDirectory, srcDirectory, project)
     		inputDirectory = srcDirectory // switch files source directory
-...
+    					}
     				} else {
     					LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>()
     					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
-...
     				for (def word : words) {
     					//println "UD-ID="+word["id"]
     					if (word["id"].contains("-")) {
     						writer.writeStartElement("seg")
     						writer.writeCharacters("******")
     						writer.writeEndElement() // span
+    					}
     					String id = null
     					wordCounter++
     					writer.writeStartElement ("w")
-...
     						writer.writeAttribute("id", "w_"+text_id+"_"+wordCounter)
+    					}
     					writer.writeCharacters(word["form"])
     					int idx = word["form"].indexOf(".", 1);
     					if (word["id"].contains("-") && idx > 0) {
     						writer.writeCharacters(word["form"].substring(0, idx))
     					} else {
     						writer.writeCharacters(word["form"])
+    					}
     					writer.writeEndElement() // w
     					writer.writeCharacters(" ")
+    				}

     	my $commentlines = 0;                                                                             #added by AL
     	#  my $contractions = 0; #added by AL
     	my $contractions = 0; #added by AL
     	#  my $text_id = "unknown_text";
     	my $text_id = $infilename;
     	my $sent_id = "0";
-...
     			$commentlines++;
     			next;
+    		}
     	#	# Added by AL for contractions
     	#	elsif ( $words[$w] =~ /^\d+-\d+/ ) {
+    	#
     	#		#	print LOG "Contraction line loop 1: $words[$w]\n";
     	#		$commentlines++;
+    	#
     	#		#	$contractions++;
     	#		next;
     	#	}
     		# Added by AL for contractions
     		elsif ( $words[$w] =~ /^\d+-\d+/ ) {
     			#	print LOG "Contraction line loop 1: $words[$w]\n";
     			$commentlines++;
     			#	$contractions++;
     			next;
+    		}
     		else {
     			if ( defined($opt_c) ) {
     				$words[$w] =~ s/coord(\d+)-//g;
-...
     			next;
+    		}
     #		#Added AL for contractions
     #		if ( $words[$i] =~ /^\d+-\d+/ ) {
+    #
     #			#       print LOG "Contraction loop 2 : $words[$i]\n";
     #			next;
     #		}
     		#Added AL for contractions
     		if ( $words[$i] =~ /^\d+-\d+/ ) {
     			#       print LOG "Contraction loop 2 : $words[$i]\n";
     			next;
+    		}
     		else {
     			@cols = split( /\t/, $words[$i] );
-...
     			$print_nt_features = $nt_features;
+    		}
+    	}
     	my $cat = $_[0];
     	if ( $cat eq '' ) {
     		$cat = '__UNDEF__';
+    	}
     	printf XML
     	  "      <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n",
     	  $., $w, $dupl, $_[0], $dom, $type, $vform, $vlemma, $print_nt_features,
     	  $., $w, $dupl, $cat, $dom, $type, $vform, $vlemma, $print_nt_features,
     	  notes("$._$w"), $.;
     	printf XML "        <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., $w, $dupl;

     import java.io.File;
     import java.io.IOException;
     import java.util.Arrays;
     import java.util.HashMap;
     import java.util.HashSet;
     import java.util.LinkedHashMap;
     import java.util.LinkedHashSet;
     import java.util.List;
     import javax.xml.parsers.ParserConfigurationException;
-...
     	public static boolean fixFeatureValues(File driverFile, List<File> xmlFiles) throws ParserConfigurationException, SAXException, IOException {
     		HashMap<String, HashSet<String>> declaredFeatures = new HashMap<String, HashSet<String>>();
     		HashMap<String, HashSet<String>> missingFeatures = new HashMap<String, HashSet<String>>();
     		HashMap<String, Element> featuresElements = new HashMap<String, Element>();
     		LinkedHashMap<String, LinkedHashSet<String>> declaredFeatures = new LinkedHashMap<String, LinkedHashSet<String>>();
     		LinkedHashMap<String, LinkedHashSet<String>> missingFeatures = new LinkedHashMap<String, LinkedHashSet<String>>();
     		LinkedHashMap<String, Element> featuresElements = new LinkedHashMap<String, Element>();
     		Document doc = DomUtils.load(driverFile);
     		NodeList featuresList = doc.getElementsByTagName("feature");
-...
     			Element f = (Element) featuresList.item(i);
     			featuresElements.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), f);
     			HashSet<String> values = new HashSet<String>();
     			LinkedHashSet<String> values = new LinkedHashSet<String>();
     			declaredFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), values);
     			missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new HashSet<String>());
     			missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new LinkedHashSet<String>());
     			NodeList featureValuesList = f.getElementsByTagName("value");
     			for (int j = 0 ; j < featureValuesList.getLength() ; j++) {
-...
     			//System.out.println("\t"+missingFeatureNamedomain);
     			Element f = featuresElements.get(missingFeatureNamedomain);
     			//missingFeatures.get(missingFeatureNamedomain).add("");
     			for (String v : missingFeatures.get(missingFeatureNamedomain)) {
     				//System.out.println("\t\t"+v);

     					int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
     					int n =  n2 - n1;
     					ArrayList<String[]> newlines = new ArrayList<>();
     					for (int j = 0 ; j <= n ; j++) {
     						newlines.add(new String[split.length]);
     						for (int p = 0 ; p < split.length ; p++) {
     							newlines.get(j)[p] = "_";
+    						}
+    					}
     					//System.out.println("Word "+Arrays.toString(split));
     					//System.out.println("lines to insert: "+n);
     					if ( !(splittedLines.get(i+1)[0].equals(""+n1)) || !(splittedLines.get(i+n+1)[0].equals(""+n2)) ) {
     						System.out.println("FIXING "+conll.get(i));
     						for (int p = 2 ; p < split.length - 1 ; p++) {
     							String[] splittedValues = split[p].split(".");
     						ArrayList<String[]> newlines = new ArrayList<>();
     						for (int j = 0 ; j <= n ; j++) {
     							newlines.add(new String[split.length]);
     							newlines.get(j)[0] = ""+(n1+j);
     							for (int j = 0 ; j <= n ; j++) {
     								if (p >= splittedValues.length) {
     								} else {
     									newlines.get(j)[p] = splittedValues[p];
     							for (int p = 1 ; p < split.length ; p++) {
     								newlines.get(j)[p] = "_";
+    							}
+    						}
     						//System.out.println("FIXING "+split);
     						for (int p = 1 ; p < split.length - 1 ; p++) {
     							String v = split[p];
     							String[] splittedValues = v.split("\\.");
     							if (splittedValues.length == newlines.size()) {
     								for (int j = 0 ; j <= n ; j++) {
     									newlines.get(j)[p] = splittedValues[j];
+    								}
     							} else if ((splittedValues.length - 1) == newlines.size()) {
     								for (int j = 0 ; j <= n ; j++) {
     									newlines.get(j)[p] = splittedValues[j+1];
+    								}
     							} else {
     								for (int j = 0 ; j <= n ; j++) {
     									newlines.get(j)[p] = split[p];
+    								}
+    							}
+    						}
     						for (int j = 0 ; j <= n ; j++) {
     							splittedLines.add(i+j+1, newlines.get(j));
+    						}
     						i = i + newlines.size();
     					} else {
     						System.out.println("NOT FIXING "+conll.get(i));
     						//System.out.println("NOT FIXING "+conll.get(i));
+    					}
+    				}
+    			}
     			for (int i = 0 ; i < splittedLines.size() ; i++) {
     				String split[] = splittedLines.get(i);
     				conll2.add(StringUtils.join(split, "\t"));
+    			}
     			for (String l : conll2) System.out.println(l);
     			String bundle_id = "org.txm.conllu.core";

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 4019