Révision 3545

TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 3545)
31 31
	/**
	 * Prepares the CoNLL-U sources and then delegates to the parent import.
	 *
	 * When the IMPORT_USE_NEW_DOC_ID preference is enabled, the input files are first
	 * split per "# newdoc id" into a fresh "conllu" directory located two levels above
	 * outputDirectory; if that split step fails the method returns without running the
	 * rest of the import. The (possibly split) CoNLL-U files are then converted into a
	 * fresh "conllu2tei" directory, which becomes the new inputDirectory before
	 * super.process() is called.
	 */
	@Override
	public void process() {
		
		File connluSrcDirectory = inputDirectory
		
		// Parse the preference value explicitly: with Groovy truth any non-empty String
		// (including "false") coerces to true, so the option could never be disabled.
		boolean usenewdocid = Boolean.parseBoolean(UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID)); // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE //
		
		if (usenewdocid) {
			// start from an empty directory so files of a previous import are not mixed in
			connluSrcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu")
			connluSrcDirectory.deleteDir();
			connluSrcDirectory.mkdirs();
			
			println "Convert CoNLL-U to XML-TEI..."
			if (!splitCoNLLUFiles(inputDirectory, connluSrcDirectory, project)) {
				return; // nothing to import: the split step already reported the problem
			}
		}
		File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
		srcDirectory.deleteDir();
		srcDirectory.mkdirs();
		
		println "Convert CoNLL-U to XML-TEI..."
		// NOTE(review): the boolean result of the conversion is ignored here - confirm this is intended
		convertCoNLLU2TEI(connluSrcDirectory, srcDirectory, project)
		
		inputDirectory = srcDirectory // switch source directory
		
		super.process();
	}
45 59
	
46
	public static def convert(File inputDirectory, File srcDirectory, def project) {
60
	/**
	 * Splits the ".conllu" files of a directory into one file per document.
	 *
	 * Each input file is read line by line. Lines are copied to
	 * "&lt;current id&gt;.conllu" in srcDirectory, where the current id starts as the input
	 * file name and changes each time a "# newdoc id = " line announces a different id.
	 * Output files are opened in append mode, so documents sharing the same id end up
	 * in the same file.
	 *
	 * @param inputDirectory the directory containing the CoNLL-U files to split
	 * @param srcDirectory the directory receiving the split files
	 * @param project not used by this method
	 * @return false if inputDirectory cannot be listed, true otherwise
	 */
	public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) {
		def files = inputDirectory.listFiles()
		
		if (files == null) {
			println "Aborting. No CONLL file found in $inputDirectory."
			return false
		}
		
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
		
		println "Splitting CoNLL-U files..."
		for (File master : files) {
			
			cpb_texts.tick()
			
			if (!master.getName().endsWith(".conllu")) {
				continue;
			}
			
			// NOTE(review): the initial id is the full file name, so the first output file
			// is named "<name>.conllu.conllu" - kept as is, confirm this is intended
			String current_text_id = master.getName()
			File conlluFile = new File(srcDirectory, current_text_id+".conllu")
			def writer = conlluFile.newWriter("UTF-8", true)
			
			try {
				master.eachLine("UTF-8") { line ->
					if (line.startsWith("# newdoc id = ")) {
						
						String text_id = line.substring("# newdoc id = ".length())
						if (!text_id.equals(current_text_id)) {
							// a new document starts: switch to its own output file
							writer.close()
							current_text_id = text_id
							conlluFile = new File(srcDirectory, current_text_id+".conllu")
							writer = conlluFile.newWriter("UTF-8", true)
						}
					}
					
					writer.println(line)
				}
			} finally {
				// always close the last opened writer, otherwise its buffered lines may
				// never reach the disk and the file handle leaks
				writer.close()
			}
		}
		cpb_texts.done()
		return true
	}
102
	
103
	public static def convertCoNLLU2TEI(File inputDirectory, File srcDirectory, def project) {
104
		
48 105
		def files = inputDirectory.listFiles()
49 106
		
50 107
		if (files == null) {
......
52 109
			return false
53 110
		}
54 111
		
55
		def content = new LinkedHashMap(); // /text/par/sent
56 112
		
113
		
57 114
		def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES)
58 115
		
59 116
		String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX));
......
62 119
		
63 120
		boolean keepContractions =  UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS)
64 121
		
65
		boolean usenewdocid =  false; // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE // UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID)
66
		
67 122
		def headPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT).split(",") as Set
68 123
		
69 124
		def depsPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT).split(",") as Set
70 125
		
71 126
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
72 127
		
73
		def docToMaster = [:]
74 128
		println "Parsing CoNLL-U files..."
75 129
		for (File master : files) {
76 130
			
......
80 134
				continue;
81 135
			}
82 136
			
137
			def content = []; // list of sentence
138
			
83 139
			String text_id = master.getName();
84 140
			String sent_id = "";
85 141
			String par_id = "1";
142
			def comments = []; // /text/par/sent
143
			def words = []
86 144
			
87 145
			master.eachLine("UTF-8") { line ->
88
				if (usenewdocid && line.startsWith("# newdoc id = ")) {
89
					text_id = line.substring("# newdoc id = ".length())
90
					
91
					if (!docToMaster.containsKey(text_id)) docToMaster[text_id] = new HashSet();
92
					docToMaster[text_id].add(master.getName())
93
					
146
				
147
				if (line.startsWith("# newdoc id = ")) {
148
					// already set or ignored
94 149
				} else if (line.startsWith("# sent_id = ")) {
95 150
					sent_id = line.substring("# sent_id = ".length())
96 151
				} else if (line.startsWith("# newpar id = ")) {
97 152
					par_id = line.substring("# newpar id = ".length())
98 153
				} else if (line.startsWith("#")) {
99
					//sent_id = line.substring("# sent_id = ".length())
154
					comments << line
155
				} else if (line.trim().isEmpty()) {
156
					if (words.size() > 0) {
157
						def sentence = [par_id, sent_id, words, comments]
158
						content.add(sentence)
159
					}
100 160
				} else {
101
					if (text_id != null && sent_id != null) {
102
						if (!content.containsKey(text_id)) {
103
							content[text_id] = new LinkedHashMap()
161
					
162
					HashMap<String, String> wProperties = new HashMap<String, String>()
163
					
164
					def split = line.split("\t")
165
					if (split.size() == properties.size()) {
166
						String id = split[0]
167
						for (int i = 0 ; i < split.size() ; i++) {
168
							wProperties[properties[i]] = split[i]
104 169
						}
105
						def text = content[text_id]
106
						if (!text.containsKey(par_id)) {
107
							text[par_id] = new LinkedHashMap()
108
						}
109
						if (!text[par_id].containsKey(sent_id)) {
110
							LinkedHashMap<String, HashMap<String, String>> sentenceProperties = new LinkedHashMap<String, HashMap<String, String>>();
111
							text[par_id][sent_id] = sentenceProperties
112
						}
113
						HashMap<String, String> wProperties = new HashMap<String, String>()
114 170
						
115
						def split = line.split("\t")
116
						if (split.size() == properties.size()) {
117
							String id = split[0]
118
							for (int i = 0 ; i < split.size() ; i++) {
119
								wProperties[properties[i]] = split[i]
171
						if (wProperties.get("id").equals("1")) { // it's a new sentence, store the current if any and starts a new sentence
172
							if (words.size() > 0) {
173
								def sentence = [par_id, sent_id, words, comments]
174
								content.add(sentence)
120 175
							}
121 176
							
122
							text[par_id][sent_id][id] = wProperties
177
							sent_id = "";
178
							par_id = "1";
179
							comments = [];
180
							words = []
123 181
						}
182
						
183
						words << wProperties
184
					} else {
185
						//println "Warning: not a line: "+line
124 186
					}
125 187
				}
126 188
			}
127
		}
128
		cpb_texts.done()
129
		
130
		println "Writing XML files..."
131
		cpb_texts = new ConsoleProgressBar(content.size())
132
		for (def text_id2 : content.keySet()) {
133 189
			
134
			cpb_texts.tick()
190
			if (content.size() == 0) {
191
				continue;
192
			}
135 193
			
136
			File xmlFile = new File(srcDirectory, text_id2+".xml")
194
			File xmlFile = new File(srcDirectory, text_id+".xml")
137 195
			// println "xmlFile=$xmlFile"
138 196
			BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
139 197
			XMLOutputFactory factory = XMLOutputFactory.newInstance()
......
148 206
			writer.writeEndElement()
149 207
			writer.writeCharacters("\n")
150 208
			writer.writeStartElement ("text")
151
			if (docToMaster.containsKey(text_id2)) {
152
				writer.writeAttribute ("filename", docToMaster[text_id2].join(", "))
153
			}
209
			
154 210
			writer.writeCharacters("\n")
155 211
			
156
			def text = content[text_id2]
157
			for (def par_id2 : text.keySet()) { // for all paragraph of the current text
212
			String current_par_id = null
213
			
214
			for (def sentence : content) { // for all paragraph of the current text
158 215
				
159
				writer.writeStartElement ("p");
160
				writer.writeAttribute("id", par_id2)
216
				par_id = sentence[0]
217
				sent_id = sentence[1]
218
				words = sentence[2]
219
				comments = sentence[3]
220
				
221
				if (current_par_id == null || par_id != current_par_id) {
222
					if (current_par_id != null) {
223
						writer.writeEndElement() // p
224
					}
225
					writer.writeStartElement ("p");
226
					writer.writeAttribute("id", par_id)
227
					writer.writeCharacters("\n")
228
					
229
					current_par_id = par_id
230
				}
231
				
232
				writer.writeStartElement ("s")
233
				writer.writeAttribute("id", sent_id)
161 234
				writer.writeCharacters("\n")
162 235
				
163
				for (def sent_id2 : text[par_id2].keySet()) { // for all sentence of the current paragraph
164
					
165
					writer.writeStartElement ("s")
166
					writer.writeAttribute("id", sent_id2)
236
				for (def comment : comments) {
237
					writer.writeComment(comment)
167 238
					writer.writeCharacters("\n")
168
					
169
					def sentence = text[par_id2][sent_id2]
170
					
171
					if (!keepContractions) { // merge properties in the "-" word and remove the parts
172
						for (String wordid : sentence.keySet()) {
173
							def word = sentence[wordid]
174
							if (word == null) continue;
239
				}
240
				
241
				if (!keepContractions) { // merge properties in the "-" word and remove the parts
242
					for (int i = 0 ; i < words.size() ; i++) {
243
						def word = words[i]
244
						String id = word[0]
245
						
246
						if (id.contains("-")) { // multi-word line
247
							int index = id.indexOf("-")
248
							String id1 = id.substring(0, index)
249
							String id2 = id.substring(index+1)
250
							def token1 = sentence[id1]
251
							def token2 = sentence[id2]
175 252
							
176
							String id = word[0]
253
							if (token1 == null || token2 == null) {
254
								println "Error: text $text_id paragraph $par_id sent $sent_id word $id has wrong token ids $id1 and $id2 -> $token1 and $token2"
255
								continue
256
							}
177 257
							
178
							if (id.contains("-")) { // multi-word line
179
								int index = id.indexOf("-")
180
								String id1 = id.substring(0, index)
181
								String id2 = id.substring(index+1)
182
								def token1 = sentence[id1]
183
								def token2 = sentence[id2]
184
								
185
								if (token1 == null || token2 == null) {
186
									println "Error: text $text_id2 paragraph $par_id2 sent $sent_id2 word $id has wrong token ids $id1 and $id2 -> $token1 and $token2"
187
									continue
188
								}
189
								
190
								for (String p : properties) {
191
									if (p == "id") continue // don't merge the form property
192
										if (p == "form") continue // don't merge the form property
193
										word[p] =  token1[p] + "+" + token2[p]
194
								}
195
								sentence.remove(id1) // remove the token
196
								sentence.remove(id2) // remove the token
258
							for (String p : properties) {
259
								if (p == "id") continue // don't merge the form property
260
									if (p == "form") continue // don't merge the form property
261
									word[p] =  token1[p] + "+" + token2[p]
197 262
							}
263
							words.remove(i+1) // remove the token
264
							words.remove(i+1) // remove the token
198 265
						}
199 266
					}
267
				}
268
				
269
				if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
270
					LinkedHashMap sentencehash = new LinkedHashMap()
271
					//println "WORDS="+words
272
					for (def word : words) {
273
						sentencehash[word["id"]] = word
274
					}
275
					//println "SENTENCE="+sentencehash
276
					ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject)
277
				}
278
				
279
				for (def word : words) {
200 280
					
201
					if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
202
						ImportCoNLLUAnnotations.buildPropertiesProjections(sentence, headPropertiesToProject, depsPropertiesToProject)
281
					def id = word["id"]
282
					
283
					writer.writeStartElement ("w")
284
					for (String p : word.keySet()) {
285
						if (p == "feats") word[p] = "|"+word[p]+"|"
286
						//println "WORD="+word
287
						writer.writeAttribute(prefix+p, word[p])
203 288
					}
204 289
					
205
					for (String wordid : sentence.keySet()) {
206
						def word = sentence[wordid]
207
						
208
						def id = word["id"]
209
						
210
						writer.writeStartElement ("w")
211
						for (String p : word.keySet()) {
212
							if (p == "feats") word[p] = "|"+word[p]+"|"
213
							//println "WORD="+word
214
							writer.writeAttribute(prefix+p, word[p])
215
						}
216
						
217
						writer.writeCharacters(word["form"])
218
						writer.writeEndElement() // w
219
						writer.writeCharacters(" ")
220
					}
221
					writer.writeCharacters("\n")
222
					writer.writeEndElement() // s
290
					writer.writeCharacters(word["form"])
291
					writer.writeEndElement() // w
292
					writer.writeCharacters(" ")
223 293
				}
224 294
				writer.writeCharacters("\n")
295
				writer.writeEndElement() // s
296
			}
297
			
298
			if (current_par_id != null) {
225 299
				writer.writeEndElement() // p
300
				writer.writeCharacters("\n")
226 301
			}
227 302
			
228 303
			writer.writeEndElement() // text
......
236 311
		
237 312
		return true
238 313
	}
314
	
239 315
}
TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3545)
48 48
		// build the TIGER-XML file
49 49
		CallUD2TigerPerlScript cutps = new CallUD2TigerPerlScript();
50 50
		
51
		boolean buildtigerindexes =  UDPreferences.getInstance().getString(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES); // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE //
52
		if (!buildtigerindexes) {
53
			println "Skipping TIGER conversion step."
54
			return;
55
		}
56
		
51 57
		if (cutps.canBuildTSFiles()) {
52 58
			
53 59
			println "Converting CoNLL-U files to TIGER-XML files..."
......
108 114
				TIGERSearchEngine.buildTIGERCorpus(tigerXMLDirectory, this.binaryDirectory, corpusName);
109 115
			}
110 116
		} else {
111
			println "Skipping TIGER conversion step."
117
			println "Can not do the TIGER indexes step."
112 118
		}
113 119
	}
114 120
}
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 3545)
662 662
							
663 663
								String interpvalue = null;
664 664
								def tooltipProperties = pager.project.getEditionDefinition("default").get(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, "*");
665
								println tooltipProperties
666 665
								if (tooltipProperties.equals("*")) {
667 666
									interpvalue = "- "+anaValues.entrySet().join("\n- ")+"\n- "+wordid
668 667
								} else {

Formats disponibles : Unified diff