Revision 2553 tmp/org.txm.core/src/java/org/txm/scripts/importer/XMLTXM2WTC.groovy

XMLTXM2WTC.groovy (revision 2553)
46 46

  
47 47
class XMLTXM2CQP
48 48
{
49

  
49
	
50 50
	/** The url. */
51 51
	private def url;
52

  
52
	
53 53
	/** The input data. */
54 54
	private def inputData;
55

  
55
	
56 56
	/** The factory. */
57 57
	private def factory;
58

  
58
	
59 59
	/** The parser. */
60 60
	private XMLStreamReader parser;
61

  
61
	
62 62
	/** The output. */
63 63
	private def output;
64

  
64
	
65 65
	/** The hashmaps of txm:form values and txm:ana values, and the hashmap of word attributes. */
66 66
	LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>();
67 67
	LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>();
68 68
	LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>();
69

  
69
	
70 70
	/** The tags ("balises") found while parsing, each mapped to the list of attribute names seen on it. */
71 71
	HashMap<String, List<String>> balisesfound;// = new HashMap<String, List<String>>();
72

  
72
	
73 73
	/** The names of the tags ("balises") to keep: only these are written to the output and recorded as found. */
74 74
	List<String> balisesToKeep;
75

  
75
	
76 76
	/** The send to p attributes. */
77 77
	HashMap <String, List<String>> sendToPAttributes;// = new HashMap<String, List<String>>();
78

  
78
	
79 79
	/** The injected p attributes. */
80 80
	List<String> injectedPAttributes = new ArrayList<String>();
81

  
81
	
82 82
	/** The default reference : a pattern + the properties to use */
83 83
	List<String> defaultReferences = new ArrayList<String>();
84 84
	String defaultReferencePattern;
85

  
85
	
86 86
	/** The injected p attributes values. */
87 87
	HashMap <String, String> injectedPAttributesValues;// = new ArrayList<String>();
88

  
88
	
89 89
	/** Whether to write additional info attributes (e.g. id, project) on the "text" element. */
90 90
	boolean addinfos = false;
91

  
91
	
92 92
	/** The txtname. */
93 93
	String txtname;
94

  
94
	
95 95
	/** The base. */
96 96
	String base;
97

  
97
	
98 98
	/** The project. */
99 99
	String project;
100

  
100
	
101 101
	/** The lang. */
102 102
	public String lang= "fr";
103 103
	public String currentForm;
104 104
	public String currentAna;
105

  
105
	
106 106
	/**
107 107
	 * Sets the lang.
108 108
	 *
......
113 113
	{
114 114
		this.lang = lang;
115 115
	}
116

  
116
	
117 117
	/**
118 118
	 * Instantiates a new XMLTXM2CQP.
119 119
	 *
......
124 124
			this.url = url;
125 125
			inputData = url.openStream();
126 126
			factory = XMLInputFactory.newInstance();
127

  
127
			
128 128
			parser = factory.createXMLStreamReader(inputData);
129

  
130

  
129
			
130
			
131 131
		} catch (XMLStreamException ex) {
132 132
			System.out.println(ex);
133 133
		}catch (IOException ex) {
134 134
			System.out.println("IOException while parsing ");
135 135
		}
136 136
	}
137

  
137
	
138 138
	/**
139 139
	 * Sets the text info.
140 140
	 *
......
149 149
		this.base = base;
150 150
		this.project = project;
151 151
	}
152

  
152
	
153 153
	/**
154 154
	 * Creates the output.
155 155
	 *
......
166 166
			return false;
167 167
		}
168 168
	}
169

  
169
	
170 170
	/** The haspb. */
171 171
	boolean haspb = false;
172

  
172
	
173 173
	/** The haslb. */
174 174
	boolean haslb = false;
175

  
175
	
176 176
	/**
177 177
	 * Transform file.
178 178
	 *
......
186 186
			println "no element has been defined to be keeped"
187 187
			return false;
188 188
		}
189

  
189
		
190 190
		haspb = false;
191 191
		haslb = false;
192

  
192
		
193 193
		boolean flagAna;
194 194
		boolean flagForm;
195 195
		boolean flagWord;
196 196
		String vWord = "";
197 197
		String vForm = "";
198 198
		String vAna = "";
199

  
199
		
200 200
		String lb_id = "";
201 201
		String pb_id = "";
202

  
202
		
203 203
		wordattributes = [:];
204 204
		balisesfound = new HashMap<String, List<String>>();
205

  
206

  
205
		
206
		
207 207
		if(!createOutput(outfile))
208 208
			return false;
209

  
209
		
210 210
		if(sendToPAttributes != null)
211 211
		{
212 212
			for(String tag: sendToPAttributes.keySet())
......
214 214
					injectedPAttributes.add(tag+attr);
215 215
			injectedPAttributesValues = [:];
216 216
		}
217

  
217
		
218 218
		//output.write("<txmcorpus lang=\""+lang+"\">\n");
219 219
		balisesfound.put("txmcorpus",["lang"]);
220 220
		try {
......
222 222
				switch (event) {
223 223
					case XMLStreamConstants.START_ELEMENT:
224 224
						String localname = parser.getLocalName().toLowerCase();
225

  
225
					
226 226
					// we will only declare found tags in cwb registry
227 227
						if(balisesToKeep.contains(localname)) {
228 228
							if(!balisesfound.containsKey(localname)) {
229 229
								balisesfound.put(localname, []);
230 230
							}
231

  
231
							
232 232
							List<String> attrlist = balisesfound.get(localname);
233 233
							for (int i= 0 ; i < parser.getAttributeCount() ;i++ )
234 234
								if(!attrlist.contains(parser.getAttributeLocalName(i)))
235 235
									attrlist.add(parser.getAttributeLocalName(i));
236 236
						}
237

  
237
					
238 238
						switch (localname) {
239 239
							case "w": // get word id !!
240 240
								wordattributes.put("id", parser.getAttributeValue(null, "id"));
241 241
								break;
242

  
242
							
243 243
							case "form":
244 244
								flagForm = true;
245 245
								currentForm = parser.getAttributeValue(null, "type");
......
247 247
									currentForm = "default";
248 248
								vForm = "";
249 249
								break;
250

  
250
							
251 251
							case "ana":
252 252
								flagAna = true;
253 253
								vAna ="";
254

  
254
							
255 255
								currentAna = (parser.getAttributeValue(null,"type"));
256 256
								if(currentAna != null)
257 257
									currentAna = currentAna.substring(1)// remove the #
258 258
								else
259 259
									flagAna = false;
260 260
								break;
261

  
261
							
262 262
							default:
263

  
263
							
264 264
								if (sendToPAttributes != null) {
265 265
									//println "should store $localname ? with "+sendToPAttributes.keySet()
266 266
									if (sendToPAttributes.keySet().contains(localname)) {
......
273 273
										}
274 274
									}
275 275
								}
276

  
276
							
277 277
								if (balisesToKeep.contains(localname)) {
278 278
									output.write("<"+localname);
279 279
									//println "write <"+localname+"..."
......
292 292
										
293 293
										output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("&", "&amp;").replace("\"", "&quot;")+"\"" );
294 294
									}
295

  
295
									
296 296
									if (localname.equals("text"))
297 297
										if (addinfos) {
298 298
											List<String> attrlist = balisesfound.get(localname);
299

  
299
											
300 300
											if (!idwritten) {
301 301
												output.write(" id=\""+txtname+"\"")
302 302
												attrlist.add("id");
......
310 310
												attrlist.add("project");
311 311
											}
312 312
										}
313

  
313
									
314 314
									// finalize tag
315 315
									output.write(">\n");
316 316
								}
317 317
						}
318 318
						break;
319

  
319
					
320 320
					case XMLStreamConstants.END_ELEMENT:
321 321
						String localname = parser.getLocalName().toLowerCase();
322 322
						switch (localname) {
......
325 325
									formhash.put(currentForm, vForm);
326 326
								flagForm = false;
327 327
								break;
328

  
328
							
329 329
							case "ana":
330 330
								if(flagAna)
331 331
									anahash.put(currentAna, vAna);
332 332
								flagAna = false;
333 333
								break;
334

  
334
							
335 335
							case "w":
336 336
								vWord = "";
337 337
								vWord = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); // get default form
338 338
								for (String form : formhash.keySet()) // and the others
339 339
									if (form != "default")
340 340
										vWord += "\t"+formhash.get(form);
341

  
341
							
342 342
								for (String type : wordattributes.keySet()) // only word id ?
343 343
									vWord+="\t"+wordattributes.get(type)
344

  
344
							
345 345
								if (sendToPAttributes != null) // word attributes from structure properties
346 346
								{
347 347
									//println "injectedPAttributesValues: "+injectedPAttributesValues
348 348
									for(String pattr : injectedPAttributes)
349 349
										vWord+="\t"+injectedPAttributesValues.get(pattr) ;//les attributs injecter
350 350
								}
351

  
351
							
352 352
								for (String type : anahash.keySet()) // word annotations in txm:ana
353 353
									vWord+="\t"+anahash.get(type)
354

  
354
							
355 355
								output.write(vWord+"\n");
356 356
								vWord= "";
357 357
								break;
358

  
358
							
359 359
							default:
360 360
								if (sendToPAttributes != null) // reset structure properties
361 361
								{
......
365 365
										}
366 366
									}
367 367
								}
368

  
368
							
369 369
								if (balisesToKeep.contains(localname)) {
370 370
									output.write("</"+localname+">\n");
371 371
								}
372 372
						}
373 373
						break;
374

  
374
					
375 375
					case XMLStreamConstants.CHARACTERS:
376 376
						if (flagForm) {
377 377
							vForm += parser.getText().trim();
......
385 385
			//output.write("</txmcorpus>\n");
386 386
			output.close();
387 387
			if (parser != null) parser.close();
388
		if (inputData != null) inputData.close();
388
			if (inputData != null) inputData.close();
389 389
		} catch (Exception ex) {
390 390
			println "Error while parsing $url : "+ex
391 391
			ex.printStackTrace();
......
395 395
		}
396 396
		return true;
397 397
	}
398

  
398
	
399 399
	/**
400 400
	 * Gets the p-attributes.
401 401
	 *
......
404 404
	public List<String> getpAttributs()
405 405
	{
406 406
		def pAttributs = [];
407

  
407
		
408 408
		for (String wordattr : wordattributes.keySet()) {
409 409
			pAttributs.add(wordattr);
410 410
		}
411

  
411
		
412 412
		if (sendToPAttributes != null)
413 413
			for (String pAttr : this.injectedPAttributes)
414 414
				pAttributs.add(pAttr);
415

  
415
		
416 416
		for (String anakey : anahash.keySet()) {
417 417
			pAttributs.add(anakey);
418 418
		}
419

  
419
		
420 420
		return pAttributs;
421 421
	}
422

  
422
	
423 423
	/**
424 424
	 * Gets the s-attributes.
425 425
	 *
......
427 427
	 */
428 428
	public List<String> getsAttributs()
429 429
	{
430
		println balisesfound
431 430
		def sAttributs = [];
432 431
		for (String balise : this.balisesfound.keySet()) {
433 432
			List<String> sAtt = this.balisesfound.get(balise);
......
435 434
			for (String attr : sAtt) {
436 435
				attributes+="+"+attr;
437 436
			}
438

  
437
			
439 438
			if (sAtt.size() > 0)
440 439
				sAttributs.add(balise +":"+attributes);
441 440
			else
......
443 442
		}
444 443
		return sAttributs;
445 444
	}
446

  
445
	
447 446
	/**
448 447
	 * Sets the balises to keep.
449 448
	 *
......
456 455
		else
457 456
			println("Warning: the list of elements to keep is null")
458 457
	}
459

  
458
	
460 459
	/**
461 460
	 * Sets the default reference pattern
462 461
	 * TODO: not implemented
......
470 469
			defaultReferencePattern = pattern;
471 470
		}
472 471
	}
473

  
474

  
472
	
475 473
	/**
476 474
	 * Sets the send to p attributes.
477 475
	 *
......
484 482
		else
485 483
			println("Warning: the pAttributes to inject is null")
486 484
	}
487

  
488

  
485
	
486
	
489 487
	/**
490 488
	 * The main method.
491 489
	 *
492 490
	 * @param args the arguments
493 491
	 */
494 492
	public static void main(String[] args) {
495

  
493
		
496 494
		String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER";
497

  
495
		
498 496
		File srcfile = new File(rootDir,"CORNEILLEP_AGESILAS_1666.xml");
499 497
		println srcfile.exists()
500 498
		File cqpfile = new File(rootDir, "out/CORNEILLEP_AGESILAS_1666.cqp");
501 499
		new File(rootDir,"out").deleteDir()
502 500
		new File(rootDir,"out").mkdir()
503

  
501
		
504 502
		System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile);
505 503
		def builder = new XMLTXM2CQP(srcfile.toURL());
506 504
		def balises = ["text", "s"];
507 505
		builder.setBalisesToKeep(balises);
508 506
		builder.transformFile(cqpfile);
509

  
507
		
510 508
		println("SATTRIBUTS: "+builder.getsAttributs());
511 509
		println("PATTRIBUTS: "+builder.getpAttributs());
512 510
		return;

Also available in: Unified diff