Révision 4018

TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4018)
25 25
 *
26 26
 */
27 27
class CoNLLUImporter extends XTZImporter {
28
	
28

  
29 29
	/**
	 * Creates the importer for the given import module.
	 *
	 * @param module the import module, handed over unchanged to the parent importer constructor
	 */
	public CoNLLUImporter(ImportModule module) {
		super(module)
	}
32
	
32

  
33 33
	/**
	 * Merges the values of one column collected from several token lines into a single value.
	 *
	 * If at least one collected value is neither the empty string nor "_", the values are joined
	 * with "." (or reduced to the single value when they are all equal). Otherwise the original
	 * value is kept.
	 *
	 * @param orig the value to return when no collected value carries information
	 * @param sss the collected values (a collection of strings)
	 * @return the merged value, or orig when every collected value is "" or "_"
	 */
	public final String merge(String orig, def sss) {
		// nothing was collected: keep the original value
		if (sss == null) return orig

		// look for at least one value that is neither empty nor the "_" placeholder
		boolean hasValue = false
		for (String s : sss) {
			if (s != "" && s != "_") {
				hasValue = true
				break
			}
		}
		if (!hasValue) return orig

		// identical values are written once, different values are joined in their original order
		def distinct = new HashSet(sss)
		if (distinct.size() == 1) return distinct.join(".")
		return sss.join(".")
	}
48
	
48

  
49 49
	/**
	 * Prepares the CoNLL-U source files, converts them to XML-TEI and then runs the parent import.
	 *
	 * Steps:
	 * 1. when the IMPORT_USE_NEW_DOC_ID preference is "true", the input files are first split into
	 *    a fresh "conllu" working directory by splitCoNLLUFiles; otherwise the input directory is used as is;
	 * 2. every ".conllu" file of that directory is rewritten in place: lines whose ID contains "."
	 *    are removed, an XmlId is added to the last column when missing, and range lines
	 *    (ID containing "-") are handled according to the CONTRACTIONS_MANAGEMENT preference;
	 * 3. the metadata file (if any) is copied and the files are converted into the "conllu2tei" directory,
	 *    which becomes the input directory of the parent process().
	 */
	@Override
	public void process() {

		File conlluSrcDirectory = inputDirectory

		boolean usenewdocid =  "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_USE_NEW_DOC_ID)); // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE //

		if (usenewdocid) {
			conlluSrcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu")
			conlluSrcDirectory.deleteDir()
			conlluSrcDirectory.mkdirs()

			if (!splitCoNLLUFiles(inputDirectory, conlluSrcDirectory, project)) {
				return
			}
		}

		def files = conlluSrcDirectory.listFiles()
		if (files == null) { // listFiles() returns null when the directory cannot be listed
			println "Aborting. No CONLL file found in $conlluSrcDirectory."
			return
		}
		files.sort()

		// Keep or not contractions
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));

		println "Contractions managment ($contractionsManagement) & add XmlId if necessary & remove empty nodes"
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
		for (File conlluFile : files) {
			cpb_texts.tick()
			if (conlluFile.getName().endsWith(".conllu")) {
				String textid = FileUtils.stripExtension(conlluFile)
				int wcounter = 1;

				ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");

				// first pass: drop the lines whose ID contains "." so that the look-ahead below only sees regular tokens
				for (int i = 0 ; i < lines.size() ; i++) {
					String line = lines[i]

					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;

					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
					if (split[0].contains(".")) {
						lines.remove(i)
						i-- // stay on the same index, the next line has shifted into it
						continue; // next !
					}
				}

				// ID -> columns of the range line covering that ID (SYNTAX mode only)
				def temp_multiwords = [:]

				// second pass: add the XmlId and manage the contractions
				for (int i = 0 ; i < lines.size() ; i++) {
					String line = lines[i]
					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;

					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);

					// add an XmlId to the last column if there is none yet
					if (split[-1] != null && !split[-1].contains("XmlId=")) {
						if (split[-1] == "_") {
							split[-1] = "XmlId=w_"+textid+"_"+(wcounter++)
						} else {
							split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
						}
					}

					if (contractionsManagement == UDPreferences.ALL) {
						// nothing to do, every line is kept
					} else if (contractionsManagement == UDPreferences.SYNTAX) {
						if (split[0].contains("-")) {

							// stores the syntactic word ids and the orthographic word properties
							temp_multiwords = [:]
							int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
							int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
							for (int ii = n1 ; ii <= n2 ; ii++) {
								temp_multiwords[""+ii] = split;
							}

							// the range line itself is removed
							lines.remove(i)
							i--
							continue; // next !
						} else if (temp_multiwords.containsKey(split[0])) { // it's a syntactic word of an orthographic word
							def split_ortho = temp_multiwords.remove(split[0])

							if (split[9].length() > 0) split[9] += "|"
							split[9] += "multiword="+split_ortho[1] // the orthographic form
						}
					} else if (contractionsManagement == UDPreferences.SURFACE) {
						if (split[0].contains("-")) {
							int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
							int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
							int n =  n2 - n1

							// before merging and deleting words, check that the n+1 following lines exist
							// (an out-of-range list index yields null) and that they are the right ones
							if (n >= 0 && i + n + 1 < lines.size()
									&& lines[i+1].startsWith(""+n1+"\t") && lines[i+n+1].startsWith(""+n2+"\t")) {
								def splits = []
								for (int j = 0 ; j <= n ;j++) {
									def tmp = lines[i+j+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
									splits << tmp
								}

								// columns 2 to 7 of the range line receive the merged values of the covered words
								for (int j = 2 ; j < 8 ; j++) {
									split[j] = merge(split[j], splits.collect(){it[j]})
								}

								// remove the covered words, only the range line remains
								for (int j = 0 ; j <= n ;j++) {
									lines.remove(i+1)
								}
							}
						}
					}

					lines[i] = split.join("\t") // rebuild the line
				}
				IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
			}
		}
		cpb_texts.done()

		File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory)
		File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
		srcDirectory.deleteDir()
		srcDirectory.mkdirs()

		if (metadataFile != null && metadataFile.exists()) {
			File metadataFile2 = new File(srcDirectory, metadataFile.getName())
			FileCopy.copy(metadataFile, metadataFile2)
		}

		println "Convert CoNLL-U to XML-TEI..."
		convertCoNLLU2TEI(conlluSrcDirectory, srcDirectory, project)

		inputDirectory = srcDirectory // switch files source directory

		super.process()
	}
210
	
212

  
211 213
	public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) {
212 214
		def files = inputDirectory.listFiles(new FilenameFilter() {
213
			boolean accept(File dir, String name) {
214
				return name.toLowerCase().endsWith(".conllu")
215
			}
216
		});
217
		
215
					boolean accept(File dir, String name) {
216
						return name.toLowerCase().endsWith(".conllu")
217
					}
218
				});
219

  
218 220
		if (files == null) {
219 221
			println "Aborting. No CONLL file found in $inputDirectory."
220 222
			return false
221 223
		}
222 224
		files.sort()
223 225
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
224
		
226

  
225 227
		println "Splitting CoNLL-U files..."
226 228
		for (File master : files) {
227
			
229

  
228 230
			cpb_texts.tick()
229
			
231

  
230 232
			if (!master.getName().endsWith(".conllu")) {
231 233
				continue
232 234
			}
233
			
235

  
234 236
			String orig_text_id = FileUtils.stripExtension(master)
235 237
			String current_text_id = FileUtils.stripExtension(master)
236 238
			File conlluFile = new File(srcDirectory, current_text_id+".conllu")
237 239
			def writer = conlluFile.newWriter("UTF-8", true)
238
			
240

  
239 241
			master.eachLine("UTF-8") { line ->
240 242
				if (line.startsWith("# newdoc id = ")) {
241
					
243

  
242 244
					String text_id = line.substring("# newdoc id = ".length())
243 245
					if (!text_id.equals(current_text_id)) {
244 246
						writer.close()
......
247 249
						writer = conlluFile.newWriter("UTF-8", true)
248 250
					}
249 251
				}
250
				
252

  
251 253
				writer.println(line)
252 254
			}
253 255
			writer.close()
......
255 257
		cpb_texts.done()
256 258
		return true
257 259
	}
258
	
260

  
259 261
	public static def convertCoNLLU2TEI(File inputDirectory, File srcDirectory, def project) {
260
		
262

  
261 263
		def files = inputDirectory.listFiles()
262
		
264

  
263 265
		if (files == null) {
264 266
			println "Aborting. No CONLL file found in $inputDirectory."
265 267
			return false
266 268
		}
267 269
		files.sort()
268 270
		def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES)
269
		
271

  
270 272
		String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX));
271
		
273

  
272 274
		UDPreferences.getInstance().setProjectPreferenceValue(project, UDPreferences.UDPREFIX, prefix); // copy the current preference into the corpus preference
273
		
275

  
274 276
		def headPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT)).split(",") as Set
275
		
277

  
276 278
		def depsPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT)).split(",") as Set
277
		
279

  
278 280
		def formatSentences = "true" == UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES, ""+UDPreferences.getInstance().getString(UDPreferences.IMPORT_PRINT_NEWLINES_AFTER_SENTENCES))
279
		
281

  
280 282
		String contractionsManagement =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT));
281
		
283

  
282 284
		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
283
		
285

  
284 286
		println "Parsing CoNLL-U files..."
285 287
		for (File master : files) {
286
			
288

  
287 289
			cpb_texts.tick()
288
			
290

  
289 291
			if (!master.getName().endsWith(".conllu")) {
290 292
				continue
291 293
			}
292
			
294

  
293 295
			def content = [] // list of sentence
294
			
296

  
295 297
			String text_id = FileUtils.stripExtension(master)
296 298
			String sent_id = ""
297 299
			String par_id = "1"
298 300
			def comments = [] // /text/par/sent
299 301
			def words = []
300
			
302

  
301 303
			master.eachLine("UTF-8") { line ->
302
				
304

  
303 305
				if (line.startsWith("# newdoc id = ")) {
304 306
					// already set or ignored
305 307
				} else if (line.startsWith("# sent_id = ")) {
......
312 314
					if (words.size() > 0) {
313 315
						def sentence = [par_id, sent_id, words, comments]
314 316
						content.add(sentence)
315
						
317

  
316 318
						sent_id = ""
317 319
						par_id = "1"
318 320
						comments = []
319 321
						words = []
320 322
					}
321
					
323

  
322 324
				} else {
323
					
325

  
324 326
					LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>()
325
					
327

  
326 328
					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
327 329
					if (split.size() == properties.size()) {
328 330
						String id = split[0]
329 331
						for (int i = 0 ; i < split.size() ; i++) {
330 332
							wProperties[properties[i]] = split[i]
331 333
						}
332
						
333
//						if (wProperties.get("id").equals("1") || wProperties.get("id").startsWith("1-")) { // it's a new sentence, store the current if any and starts a new sentence
334
//							if (words.size() > 0) {
335
//								def sentence = [par_id, sent_id, words, comments]
336
//								content.add(sentence)
337
//								
338
//								sent_id = ""
339
//								par_id = "1"
340
//								comments = []
341
//								words = []
342
//							}
343
//						}
334

  
335
						//						if (wProperties.get("id").equals("1") || wProperties.get("id").startsWith("1-")) { // it's a new sentence, store the current if any and starts a new sentence
336
						//							if (words.size() > 0) {
337
						//								def sentence = [par_id, sent_id, words, comments]
338
						//								content.add(sentence)
339
						//
340
						//								sent_id = ""
341
						//								par_id = "1"
342
						//								comments = []
343
						//								words = []
344
						//							}
345
						//						}
344 346
						if (wProperties[properties[0]].contains(".")) { // id
345 347
							// empty node
346 348
						} else {
......
351 353
					}
352 354
				}
353 355
			}
354
			
356

  
355 357
			if (words.size() > 0) { // last sentence ?
356 358
				def sentence = [par_id, sent_id, words, comments]
357 359
				content.add(sentence)
358 360
			}
359
			
361

  
360 362
			if (content.size() == 0) {
361 363
				continue;
362 364
			}
363
			
365

  
364 366
			//println "${content.size()} sentences found."
365
			
367

  
366 368
			File xmlFile = new File(srcDirectory, text_id+".xml")
367 369
			// println "xmlFile=$xmlFile"
368 370
			BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
369 371
			XMLOutputFactory factory = XMLOutputFactory.newInstance()
370 372
			XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")
371
			
373

  
372 374
			writer.writeStartDocument("UTF-8","1.0")
373 375
			writer.writeStartElement ("TEI")
374 376
			writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0")
......
378 380
			writer.writeEndElement()
379 381
			writer.writeCharacters("\n")
380 382
			writer.writeStartElement ("text")
381
			
383

  
382 384
			writer.writeCharacters("\n")
383
			
385

  
384 386
			String current_par_id = null
385 387
			int wordCounter = 0
386 388
			for (def sentence : content) { // for all paragraph of the current text
387
				
389

  
388 390
				par_id = sentence[0]
389 391
				sent_id = sentence[1]
390 392
				words = sentence[2]
391 393
				comments = sentence[3]
392
				
394

  
393 395
				if (current_par_id == null || par_id != current_par_id) {
394 396
					if (current_par_id != null) {
395 397
						writer.writeEndElement() // p
......
397 399
					writer.writeStartElement ("p")
398 400
					writer.writeAttribute("id", par_id)
399 401
					writer.writeCharacters("\n")
400
					
402

  
401 403
					current_par_id = par_id
402 404
				}
403
				
405

  
404 406
				writer.writeStartElement ("s")
405 407
				writer.writeAttribute("id", sent_id)
406 408
				writer.writeCharacters("\n")
407
				
409

  
408 410
				for (def comment : comments) {
409 411
					writer.writeComment(comment.replace("--", "&#x2212;&#x2212;"))
410 412
					writer.writeCharacters("\n")
411 413
				}
412
				
414

  
413 415
				if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
414 416
					LinkedHashMap sentencehash = new LinkedHashMap()
415 417
					//println "WORDS="+words
......
419 421
					//println "SENTENCE="+sentencehash
420 422
					ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject)
421 423
				}
422
				
424

  
423 425
				if (formatSentences) {
424
					 writer.writeStartElement("p")
425
					 writer.writeAttribute("type", "sentence")
426
					 writer.writeAttribute("style", "--before-content:'$sent_id';")
426
					writer.writeStartElement("p")
427
					writer.writeAttribute("type", "sentence")
428
					writer.writeAttribute("style", "--before-content:'$sent_id';")
427 429
				}
428
				
430

  
429 431
				for (def word : words) {
430
					
431
					println "UD-ID="+word["id"]
432

  
433
					//println "UD-ID="+word["id"]
432 434
					if (word["id"].contains("-")) {
433 435
						writer.writeStartElement("seg")
434 436
						writer.writeCharacters("******")
435 437
						writer.writeEndElement() // span
436 438
					}
437
					
439

  
438 440
					String id = null
439 441
					wordCounter++
440 442
					writer.writeStartElement ("w")
......
449 451
						//println "WORD="+word
450 452
						writer.writeAttribute(prefix+p, word[p])
451 453
					}
452
					
454

  
453 455
					if (id != null) {
454
						writer.writeAttribute("id", id)	
456
						writer.writeAttribute("id", id)
455 457
					} else {
456 458
						writer.writeAttribute("id", "w_"+text_id+"_"+wordCounter)
457 459
					}
458
					
460

  
459 461
					writer.writeCharacters(word["form"])
460 462
					writer.writeEndElement() // w
461 463
					writer.writeCharacters(" ")
462 464
				}
463
				
465

  
464 466
				if (formatSentences) writer.writeEndElement()
465
				
467

  
466 468
				writer.writeCharacters("\n")
467 469
				writer.writeEndElement() // s
468 470
			}
469
			
471

  
470 472
			if (current_par_id != null) {
471 473
				writer.writeEndElement() // p
472 474
				writer.writeCharacters("\n")
473 475
			}
474
			
476

  
475 477
			writer.writeEndElement() // text
476 478
			writer.writeCharacters("\n")
477 479
			writer.writeEndElement() // TEI
478 480
			writer.close()
479 481
		}
480
		
482

  
481 483
		cpb_texts.done()
482
		
484

  
483 485
		return true
484 486
	}
485
	
487

  
486 488
}
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/BratPrintTree.java (revision 4018)
2 2

  
3 3
import java.io.File;
4 4
import java.io.IOException;
5
import java.util.ArrayList;
5 6
import java.util.List;
6 7

  
7 8
import org.apache.commons.lang.StringUtils;
......
13 14
	public static File print(File file, List<String> conll, String[] Tvalues, String[] NTvalues) {
14 15
		
15 16
		try {
17
			ArrayList<String[]> splittedLines = new ArrayList<>();
18
			for (int i = 0 ; i < conll.size() ; i++) {
19
				String l = conll.get(i);
20
				String split[] = l.split("\t");
21
				splittedLines.add(split);
22
			}
23
			
24
			ArrayList<String> conll2 = new ArrayList<>();
25
			for (int i = 0 ; i < splittedLines.size() ; i++) {
26
				String split[] = splittedLines.get(i);
27
				
28
				if (split[0].contains("-")) {
29
					int n1 = Integer.parseInt(split[0].substring(0,  split[0].indexOf("-")));
30
					int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
31
					int n =  n2 - n1;
32
					
33
					ArrayList<String[]> newlines = new ArrayList<>();
34
					for (int j = 0 ; j <= n ; j++) {
35
						newlines.add(new String[split.length]);
36
						for (int p = 0 ; p < split.length ; p++) {
37
							newlines.get(j)[p] = "_";
38
						}
39
					}
40
					if ( !(splittedLines.get(i+1)[0].equals(""+n1)) || !(splittedLines.get(i+n+1)[0].equals(""+n2)) ) {
41
						System.out.println("FIXING "+conll.get(i));
42
						for (int p = 2 ; p < split.length - 1 ; p++) {
43
							String[] splittedValues = split[p].split(".");
44
							
45
							
46
							for (int j = 0 ; j <= n ; j++) {
47
								if (p >= splittedValues.length) {
48
									
49
								} else {
50
									newlines.get(j)[p] = splittedValues[p];
51
								}
52
							}
53
						}
54
						
55
					} else {
56
						System.out.println("NOT FIXING "+conll.get(i));
57
					}
58
				}
59
				
60
				conll2.add(StringUtils.join(split, "\t"));
61
				
62
			}
63
			for (String l : conll2) System.out.println(l);
64
			
16 65
			String bundle_id = "org.txm.conllu.core";
17 66
			File HTMLTEMPATE = BundleUtils.getFile(bundle_id, "template", "/", "index.html");
18 67
			File root = HTMLTEMPATE.getParentFile();
19 68
			
20 69
			String content = IOUtils.getText(HTMLTEMPATE);
21 70
			content = content.replace("HTMLROOTDIRECTORY", root.getAbsolutePath());
22
			content = content.replace("CONLLUSENTENCE", StringUtils.join(conll, "\n"));
71
			content = content.replace("CONLLUSENTENCE", StringUtils.join(conll2, "\n"));
23 72
			
24 73
			IOUtils.write(file, content);
25 74
			//BundleUtils.copyFiles(bundle_id, "groovy", "org/txm/scripts/importer", "", scriptsPackageDirectory, true);

Formats disponibles : Unified diff