Révision 4019

TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4019)
32 32

  
33 33
	public final String merge(String orig, def sss) {
34 34

  
35
		int n = 0;
36
		for (String s : sss) {
37
			if (s != "" && s != "_") {
35
		if (orig.equals("") || orig.equals("_")) {
38 36

  
39
				def ssset = new HashSet(sss);
40
				if (ssset.size() == 1) return ssset.join(".")
41

  
42
				return sss.join(".")
43
			}
37
		} else {
38
			sss.add(0, orig)
44 39
		}
45 40

  
46
		return orig;
41
		def ssset = new LinkedHashSet(sss)
42

  
43
		return ssset.join(".")
47 44
	}
48 45

  
49 46
	@Override
......
66 63
		def files = conlluSrcDirectory.listFiles()
67 64
		files.sort()
68 65

  
69
		// Keep contractions or not
70
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
71

  
72
		println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
66
		println "Add XmlId if necessary & remove empty nodes"
73 67
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
74 68
		for (File conlluFile : files) {
75 69
			cpb_texts.tick()
......
80 74
				ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
81 75
				for (int i = 0 ; i < lines.size() ; i++) {
82 76
					String line = lines[i]
83
					
77

  
84 78
					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
85
					
79

  
86 80
					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
87 81
					if (split[0].contains(".")) {
88 82
						//println "REMOVE EMPTY NODE: $split : "+
......
107 101
						}
108 102
					}
109 103

  
110
					if (contractionsManagement == UDPreferences.ALL) {
111
						// ok on fait rien
112
					} else if (contractionsManagement == UDPreferences.SYNTAX) {
113
						if (split[0].contains("-")) {
104
					lines[i] = split.join("\t") // rebuild the line
105
				}
106
				IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
107
			}
108
		}
109
		cpb_texts.done()
114 110

  
115
							// stores the syntactic word id and the orthographic word properties
116
							temp_multiwords = [:]
117
							int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
118
							int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
119
							for (int ii = n1 ; ii <= n2 ; ii++) {
120
								temp_multiwords[""+ii] = split;
121
							}
122 111

  
123
							//println "REMOVE - $split"
124
							lines.remove(i)
125
							i--
126
							continue; /// next !
127
						} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
128
							def split_ortho = temp_multiwords.remove(split[0])
112
		// Keep contractions or not
113
		File conlluSrcForTXMDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu-fortxm")
114
		
115
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
129 116

  
130
							if (split[9].length() > 0) split[9] += "|"
131
							split[9] += "multiword="+split_ortho[1] // the orthographic form
132
						}
133
					} else if (contractionsManagement == UDPreferences.SURFACE) {
134
						if (split[0].contains("-")) {
135
							int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
136
							int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
137
							int n =  n2 - n1
117
		if (contractionsManagement == UDPreferences.ALL) {
118
			conlluSrcForTXMDirectory = conlluSrcDirectory; // use the same directory as TIGER since no word modifications have been done
119
		} else {
120
			
121
			conlluSrcForTXMDirectory.deleteDir()
122
			conlluSrcForTXMDirectory.mkdirs()
123
			
124
			println "Contractions managment mode is '$contractionsManagement'"
125
			cpb_texts = new ConsoleProgressBar(files.size())
126
			for (File conlluFile : files) {
127
				cpb_texts.tick()
138 128

  
139
							//split[0] = ""+n1
129
				if (conlluFile.getName().endsWith(".conllu")) {
140 130

  
141
							// before merging and deleting words, check if they are the right ones 
142
							if (lines[i+1].startsWith(""+n1+"\t") && lines[i+n+1].startsWith(""+n2+"\t")) {
143
								def splits = []
144
								for (int j = 0 ; j <= n ;j++) {
145
									def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
146
									splits << tmp
147
								}
131
					File conlluFile2 = new File(conlluSrcForTXMDirectory, conlluFile.getName())
148 132

  
149
								for (int j = 2 ; j < 8 ; j++) {
150
									split[j] = merge(split[j], splits.collect(){it[j]})
133
					String textid = FileUtils.stripExtension(conlluFile)
134
					int wcounter = 1;
135

  
136
					ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
137

  
138
					def temp_multiwords = [:]
139

  
140
					for (int i = 0 ; i < lines.size() ; i++) {
141
						String line = lines[i]
142
						if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
143

  
144
						def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
145

  
146
						if (contractionsManagement == UDPreferences.SYNTAX) {
147
							if (split[0].contains("-")) {
148

  
149
								// stores the syntactic word id and the orthographic word properties
150
								temp_multiwords = [:]
151
								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
152
								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
153
								for (int ii = n1 ; ii <= n2 ; ii++) {
154
									temp_multiwords[""+ii] = split;
151 155
								}
152 156

  
153
								//println "REMOVE non- $split"
154
								for (int j = 0 ; j <= n ;j++) {
155
									lines.remove(i+1)
157
								//println "REMOVE - $split"
158
								lines.remove(i)
159
								i--
160
								continue; /// next !
161
							} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
162
								def split_ortho = temp_multiwords.remove(split[0])
163

  
164
								if (split[9].length() > 0) split[9] += "|"
165
								split[9] += "multiword="+split_ortho[1] // the orthographic form
166
							}
167
						} else if (contractionsManagement == UDPreferences.SURFACE) {
168
							if (split[0].contains("-")) {
169
								int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
170
								int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
171
								int n =  n2 - n1
172

  
173
								//split[0] = ""+n1
174

  
175
								// before merging and deleting words, check if they are the right ones
176
								if (lines[i+1].startsWith(""+n1+"\t") && lines[i+n+1].startsWith(""+n2+"\t")) {
177
									def splits = []
178
									for (int j = 0 ; j <= n ;j++) {
179
										def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
180
										splits << tmp
181
									}
182

  
183
									for (int j = 1 ; j < 8 ; j++) {
184
										split[j] = merge(split[j], splits.collect(){it[j]})
185
									}
186

  
187
									//println "REMOVE non- $split"
188
									for (int j = 0 ; j <= n ;j++) {
189
										lines.remove(i+1)
190
									}
156 191
								}
192
								//println "splits=$splits"
157 193
							}
158
							//println "splits=$splits"
159 194
						}
195

  
196
						lines[i] = split.join("\t") // rebuild the line
160 197
					}
161

  
162
					lines[i] = split.join("\t") // rebuild the line
198
					IOUtils.write(conlluFile2, lines.join("\n") + "\n") // CoNLLU needs the last line
163 199
				}
164
				IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
165 200
			}
201
			cpb_texts.done()
166 202
		}
167
		cpb_texts.done()
168

  
169

  
170
		//		// Fix missing XmlId in conllu files
171
		//		println "Setting word XmlID if necessary"
172
		//		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
173
		//		for (File conlluFile : files) {
174
		//			cpb_texts.tick()
175
		//			if (conlluFile.getName().endsWith(".conllu")) {
176
		//				String textid = FileUtils.stripExtension(conlluFile)
177
		//				int wcounter = 1;
178
		//				ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
179
		//				for (int i = 0 ; i < lines.size() ; i++) {
180
		//					String line = lines[i]
181
		//					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
182
		//
183
		//					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
184
		//					if (split[-1] != null && !split[-1].contains("XmlId=")) {
185
		//						split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
186
		//						lines[i] = split.join("\t")
187
		//					}
188
		//
189
		//				}
190
		//				IOUtils.write(conlluFile, lines.join("\n") + "\n")
191
		//			}
192
		//		}
193
		//		cpb_texts.done()
194

  
195 203
		File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory)
196 204
		File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
197 205
		srcDirectory.deleteDir()
......
203 211
		}
204 212

  
205 213
		println "Convert CoNLL-U to XML-TEI..."
206
		convertCoNLLU2TEI(conlluSrcDirectory, srcDirectory, project)
214
		convertCoNLLU2TEI(conlluSrcForTXMDirectory, srcDirectory, project)
207 215

  
208 216
		inputDirectory = srcDirectory // switch files source directory
209 217

  
......
322 330
					}
323 331

  
324 332
				} else {
325

  
326 333
					LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>()
327 334

  
328 335
					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
......
430 437

  
431 438
				for (def word : words) {
432 439

  
433
					//println "UD-ID="+word["id"]
434
					if (word["id"].contains("-")) {
435
						writer.writeStartElement("seg")
436
						writer.writeCharacters("******")
437
						writer.writeEndElement() // span
438
					}
439

  
440 440
					String id = null
441 441
					wordCounter++
442 442
					writer.writeStartElement ("w")
......
458 458
						writer.writeAttribute("id", "w_"+text_id+"_"+wordCounter)
459 459
					}
460 460

  
461
					writer.writeCharacters(word["form"])
461
					int idx = word["form"].indexOf(".", 1);
462
					if (word["id"].contains("-") && idx > 0) {
463
						writer.writeCharacters(word["form"].substring(0, idx))
464
					} else {
465
						writer.writeCharacters(word["form"])
466
					}
462 467
					writer.writeEndElement() // w
463 468
					writer.writeCharacters(" ")
464 469
				}
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/conll2tiger-ud.pl (revision 4019)
242 242

  
243 243
	my $commentlines = 0;                                                                             #added by AL
244 244

  
245
	#  my $contractions = 0; #added by AL
245
	my $contractions = 0; #added by AL
246 246
	#  my $text_id = "unknown_text";
247 247
	my $text_id = $infilename;
248 248
	my $sent_id = "0";
......
268 268
			$commentlines++;
269 269
			next;
270 270
		}
271
	#	# Added by AL for contractions
272
	#	elsif ( $words[$w] =~ /^\d+-\d+/ ) {
273
	#
274
	#		#	print LOG "Contraction line loop 1: $words[$w]\n";
275
	#		$commentlines++;
276
	#
277
	#		#	$contractions++;
278
	#		next;
279
	#	}
271
		# Added by AL for contractions
272
		elsif ( $words[$w] =~ /^\d+-\d+/ ) {
273
	
274
			#	print LOG "Contraction line loop 1: $words[$w]\n";
275
			$commentlines++;
276
	
277
			#	$contractions++;
278
			next;
279
		}
280 280
		else {
281 281
			if ( defined($opt_c) ) {
282 282
				$words[$w] =~ s/coord(\d+)-//g;
......
437 437
			next;
438 438
		}
439 439

  
440
#		#Added AL for contractions
441
#		if ( $words[$i] =~ /^\d+-\d+/ ) {
442
#
443
#			#       print LOG "Contraction loop 2 : $words[$i]\n";
444
#			next;
445
#		}
440
		#Added AL for contractions
441
		if ( $words[$i] =~ /^\d+-\d+/ ) {
446 442

  
443
			#       print LOG "Contraction loop 2 : $words[$i]\n";
444
			next;
445
		}
446

  
447 447
		else {
448 448

  
449 449
			@cols = split( /\t/, $words[$i] );
......
1103 1103
			$print_nt_features = $nt_features;
1104 1104
		}
1105 1105
	}
1106
	
1107
	my $cat = $_[0];
1108
	if ( $cat eq '' ) { 
1109
		$cat = '__UNDEF__';
1110
	}
1106 1111
	printf XML
1107 1112
	  "      <nt id=\"n%d_%d%s\" cat=\"%s\" coord=\"--\" dom=\"%s\" type=\"%s\" vform=\"%s\" vlemma=\"%s\"%s note=\"%s\" snr=\"%d\">\n",
1108
	  $., $w, $dupl, $_[0], $dom, $type, $vform, $vlemma, $print_nt_features,
1113
	  $., $w, $dupl, $cat, $dom, $type, $vform, $vlemma, $print_nt_features,
1109 1114
	  notes("$._$w"), $.;
1110 1115
	printf XML "        <edge idref=\"s%d_%d%s\" label=\"L\"/>\n", $., $w, $dupl;
1111 1116

  
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/FixDriverFile.java (revision 4019)
3 3
import java.io.File;
4 4
import java.io.IOException;
5 5
import java.util.Arrays;
6
import java.util.HashMap;
7 6
import java.util.HashSet;
7
import java.util.LinkedHashMap;
8
import java.util.LinkedHashSet;
8 9
import java.util.List;
9 10

  
10 11
import javax.xml.parsers.ParserConfigurationException;
......
24 25

  
25 26
	public static boolean fixFeatureValues(File driverFile, List<File> xmlFiles) throws ParserConfigurationException, SAXException, IOException {
26 27

  
27
		HashMap<String, HashSet<String>> declaredFeatures = new HashMap<String, HashSet<String>>();
28
		HashMap<String, HashSet<String>> missingFeatures = new HashMap<String, HashSet<String>>();
29
		HashMap<String, Element> featuresElements = new HashMap<String, Element>();
28
		LinkedHashMap<String, LinkedHashSet<String>> declaredFeatures = new LinkedHashMap<String, LinkedHashSet<String>>();
29
		LinkedHashMap<String, LinkedHashSet<String>> missingFeatures = new LinkedHashMap<String, LinkedHashSet<String>>();
30
		LinkedHashMap<String, Element> featuresElements = new LinkedHashMap<String, Element>();
30 31

  
31 32
		Document doc = DomUtils.load(driverFile);
32 33
		NodeList featuresList = doc.getElementsByTagName("feature");
......
34 35
			Element f = (Element) featuresList.item(i);
35 36

  
36 37
			featuresElements.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), f);
37
			HashSet<String> values = new HashSet<String>();
38
			LinkedHashSet<String> values = new LinkedHashSet<String>();
38 39
			declaredFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), values);
39
			missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new HashSet<String>());
40
			missingFeatures.put(f.getAttribute("name")+"\t"+f.getAttribute("domain"), new LinkedHashSet<String>());
40 41

  
41 42
			NodeList featureValuesList = f.getElementsByTagName("value");
42 43
			for (int j = 0 ; j < featureValuesList.getLength() ; j++) {
......
95 96
			//System.out.println("\t"+missingFeatureNamedomain);
96 97
			
97 98
			Element f = featuresElements.get(missingFeatureNamedomain);
99
			//missingFeatures.get(missingFeatureNamedomain).add("");
98 100
			
99 101
			for (String v : missingFeatures.get(missingFeatureNamedomain)) {
100 102
				//System.out.println("\t\t"+v);
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/BratPrintTree.java (revision 4019)
30 30
					int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
31 31
					int n =  n2 - n1;
32 32
					
33
					ArrayList<String[]> newlines = new ArrayList<>();
34
					for (int j = 0 ; j <= n ; j++) {
35
						newlines.add(new String[split.length]);
36
						for (int p = 0 ; p < split.length ; p++) {
37
							newlines.get(j)[p] = "_";
38
						}
39
					}
33
					//System.out.println("Word "+Arrays.toString(split));
34
					//System.out.println("lines to insert: "+n);
40 35
					if ( !(splittedLines.get(i+1)[0].equals(""+n1)) || !(splittedLines.get(i+n+1)[0].equals(""+n2)) ) {
41
						System.out.println("FIXING "+conll.get(i));
42
						for (int p = 2 ; p < split.length - 1 ; p++) {
43
							String[] splittedValues = split[p].split(".");
36
						
37
						ArrayList<String[]> newlines = new ArrayList<>();
38
						for (int j = 0 ; j <= n ; j++) {
39
							newlines.add(new String[split.length]);
44 40
							
41
							newlines.get(j)[0] = ""+(n1+j);
45 42
							
46
							for (int j = 0 ; j <= n ; j++) {
47
								if (p >= splittedValues.length) {
48
									
49
								} else {
50
									newlines.get(j)[p] = splittedValues[p];
43
							for (int p = 1 ; p < split.length ; p++) {
44
								newlines.get(j)[p] = "_";
45
							}
46
						}
47
						
48
						//System.out.println("FIXING "+split);
49
						for (int p = 1 ; p < split.length - 1 ; p++) {
50
							String v = split[p];
51
							String[] splittedValues = v.split("\\.");
52
							if (splittedValues.length == newlines.size()) {
53
								for (int j = 0 ; j <= n ; j++) {
54
									newlines.get(j)[p] = splittedValues[j];
51 55
								}
56
							} else if ((splittedValues.length - 1) == newlines.size()) {
57
								for (int j = 0 ; j <= n ; j++) {
58
									newlines.get(j)[p] = splittedValues[j+1];
59
								}
60
							} else {
61
								for (int j = 0 ; j <= n ; j++) {
62
									newlines.get(j)[p] = split[p];
63
								}
52 64
							}
53 65
						}
54 66
						
67
						for (int j = 0 ; j <= n ; j++) {
68
							splittedLines.add(i+j+1, newlines.get(j));
69
							
70
						}
71
						i = i + newlines.size();
72
						
55 73
					} else {
56
						System.out.println("NOT FIXING "+conll.get(i));
74
						//System.out.println("NOT FIXING "+conll.get(i));
57 75
					}
76
					
77
					
58 78
				}
59
				
79
			}
80
			
81
			for (int i = 0 ; i < splittedLines.size() ; i++) {
82
				String split[] = splittedLines.get(i);
60 83
				conll2.add(StringUtils.join(split, "\t"));
61
				
62 84
			}
85
			
63 86
			for (String l : conll2) System.out.println(l);
64 87
			
65 88
			String bundle_id = "org.txm.conllu.core";

Formats disponibles : Unified diff