Revision 1094 tmp/org.txm.core/src/java/org/txm/scripts/importer/XMLTXM2WTC.groovy

XMLTXM2WTC.groovy (revision 1094)
2 2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 3
// Lyon 2, University of Franche-Comté, University of Nice
4 4
// Sophia Antipolis, University of Paris 3.
5
// 
5
//
6 6
// The TXM platform is free software: you can redistribute it
7 7
// and/or modify it under the terms of the GNU General Public
8 8
// License as published by the Free Software Foundation,
9 9
// either version 2 of the License, or (at your option) any
10 10
// later version.
11
// 
11
//
12 12
// The TXM platform is distributed in the hope that it will be
13 13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 15
// PURPOSE. See the GNU General Public License for more
16 16
// details.
17
// 
17
//
18 18
// You should have received a copy of the GNU General
19 19
// Public License along with the TXM platform. If not, see
20 20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
21
//
22
//
23
//
24 24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25 25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $ 
26
// $LastChangedBy: mdecorde $
27 27
//
28 28
package org.txm.scripts.importer
29 29

  
......
46 46

  
47 47
class XMLTXM2CQP
48 48
{
49
	
49

  
50 50
	/** The url. */
51 51
	private def url;
52
	
52

  
53 53
	/** The input data. */
54 54
	private def inputData;
55
	
55

  
56 56
	/** The factory. */
57 57
	private def factory;
58
	
58

  
59 59
	/** The parser. */
60 60
	private XMLStreamReader parser;
61
	
61

  
62 62
	/** The output. */
63 63
	private def output;
64
	
64

  
65 65
	/** The hash maps of txm:form values, txm:ana values and word attributes. */
66 66
	LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>();
67 67
	LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>();
68 68
	LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>();
69
	
69

  
70 70
	/** The balisesfound. */
71 71
	HashMap<String, List<String>> balisesfound;// = new HashMap<String, List<String>>();
72 72

  
73 73
	/** The balises to keep. */
74 74
	List<String> balisesToKeep;
75
	
75

  
76 76
	/** The send to p attributes. */
77 77
	HashMap <String, List<String>> sendToPAttributes;// = new HashMap<String, List<String>>();
78
	
78

  
79 79
	/** The injected p attributes. */
80 80
	List<String> injectedPAttributes = new ArrayList<String>();
81
	
81

  
82 82
	/** The default reference : a pattern + the properties to use */
83 83
	List<String> defaultReferences = new ArrayList<String>();
84 84
	String defaultReferencePattern;
85
	
85

  
86 86
	/** The injected p attributes values. */
87 87
	HashMap <String, String> injectedPAttributesValues;// = new ArrayList<String>();
88
	
88

  
89 89
	/** The addinfos. */
90 90
	boolean addinfos = false;
91
	
91

  
92 92
	/** The txtname. */
93 93
	String txtname;
94
	
94

  
95 95
	/** The base. */
96 96
	String base;
97
	
97

  
98 98
	/** The project. */
99 99
	String project;
100
	
100

  
101 101
	/** The lang. */
102 102
	public String lang= "fr";
103 103
	public String currentForm;
104 104
	public String currentAna;
105
	
105

  
106 106
	/**
107 107
	 * Sets the lang.
108 108
	 *
......
113 113
	{
114 114
		this.lang = lang;
115 115
	}
116
	
116

  
117 117
	/**
118 118
	 * Instantiates a new XMLTXM2CQP.
119 119
	 *
......
124 124
			this.url = url;
125 125
			inputData = url.openStream();
126 126
			factory = XMLInputFactory.newInstance();
127
			
127

  
128 128
			parser = factory.createXMLStreamReader(inputData);
129
			
130
			
129

  
130

  
131 131
		} catch (XMLStreamException ex) {
132 132
			System.out.println(ex);
133 133
		}catch (IOException ex) {
134 134
			System.out.println("IOException while parsing ");
135 135
		}
136 136
	}
137
	
137

  
138 138
	/**
139 139
	 * Sets the text info.
140 140
	 *
......
149 149
		this.base = base;
150 150
		this.project = project;
151 151
	}
152
	
152

  
153 153
	/**
154 154
	 * Creates the output.
155 155
	 *
......
166 166
			return false;
167 167
		}
168 168
	}
169
	
169

  
170 170
	/** The haspb. */
171 171
	boolean haspb = false;
172
	
172

  
173 173
	/** The haslb. */
174 174
	boolean haslb = false;
175
	
175

  
176 176
	/**
177 177
	 * Transform file.
178 178
	 *
......
186 186
			println "no element has been defined to be keeped"
187 187
			return false;
188 188
		}
189
		
189

  
190 190
		haspb = false;
191
		haslb = false;	
192
		
191
		haslb = false;
192

  
193 193
		boolean flagAna;
194 194
		boolean flagForm;
195 195
		boolean flagWord;
196 196
		String vWord = "";
197 197
		String vForm = "";
198 198
		String vAna = "";
199
		
199

  
200 200
		String lb_id = "";
201 201
		String pb_id = "";
202
		
202

  
203 203
		wordattributes = [:];
204 204
		balisesfound = new HashMap<String, List<String>>();
205
		
206
		
205

  
206

  
207 207
		if(!createOutput(outfile))
208 208
			return false;
209
		
209

  
210 210
		if(sendToPAttributes != null)
211 211
		{
212 212
			for(String tag: sendToPAttributes.keySet())
......
214 214
					injectedPAttributes.add(tag+attr);
215 215
			injectedPAttributesValues = [:];
216 216
		}
217
		
217

  
218 218
		//output.write("<txmcorpus lang=\""+lang+"\">\n");
219 219
		balisesfound.put("txmcorpus",["lang"]);
220
		
221
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
222
			switch (event) 
223
			{
224
				case XMLStreamConstants.START_ELEMENT:
225
					String localname = parser.getLocalName().toLowerCase();
226
				
227
				// we will only declare found tags in cwb registry
228
					if(balisesToKeep.contains(localname)) {
229
						if(!balisesfound.containsKey(localname)) {
230
							balisesfound.put(localname, []);
220
		try {
221
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
222
				switch (event) {
223
					case XMLStreamConstants.START_ELEMENT:
224
						String localname = parser.getLocalName().toLowerCase();
225

  
226
					// we will only declare found tags in cwb registry
227
						if(balisesToKeep.contains(localname)) {
228
							if(!balisesfound.containsKey(localname)) {
229
								balisesfound.put(localname, []);
230
							}
231

  
232
							List<String> attrlist = balisesfound.get(localname);
233
							for (int i= 0 ; i < parser.getAttributeCount() ;i++ )
234
								if(!attrlist.contains(parser.getAttributeLocalName(i)))
235
									attrlist.add(parser.getAttributeLocalName(i));
231 236
						}
232
						
233
						List<String> attrlist = balisesfound.get(localname);
234
						for (int i= 0 ; i < parser.getAttributeCount() ;i++ )
235
							if(!attrlist.contains(parser.getAttributeLocalName(i)))
236
								attrlist.add(parser.getAttributeLocalName(i));
237
					}
238
					
239
					switch (localname) {						
240
						case "w": // get word id !!
241
							wordattributes.put("id", parser.getAttributeValue(null, "id"));
242
							break;
243
						
244
						case "form":
245
							flagForm = true;
246
							currentForm = parser.getAttributeValue(null, "type");
247
							if(currentForm == null)
248
								currentForm = "default";
249
							vForm = "";
250
							break;
251
						
252
						case "ana":
253
							flagAna = true;
254
							vAna ="";
255
						
256
							currentAna = (parser.getAttributeValue(null,"type"));
257
							if(currentAna != null)
258
								currentAna = currentAna.substring(1)// remove the #
259
							else
260
								flagAna = false;
261
							break;
262
						
263
						default:
264
							
265
							if (sendToPAttributes != null) {
266
								//println "should store $localname ? with "+sendToPAttributes.keySet()
267
								if (sendToPAttributes.keySet().contains(localname)) {
268
									//println "store attr of "+localname
269
									List<String> attrs = sendToPAttributes.get(localname);
270
									for (int i= 0 ; i < parser.getAttributeCount() ;i++ ) {
271
										if (attrs.contains(parser.getAttributeLocalName(i))) {
272
											injectedPAttributesValues.put(localname+parser.getAttributeLocalName(i).toLowerCase(),parser.getAttributeValue(i)) 
237

  
238
						switch (localname) {
239
							case "w": // get word id !!
240
								wordattributes.put("id", parser.getAttributeValue(null, "id"));
241
								break;
242

  
243
							case "form":
244
								flagForm = true;
245
								currentForm = parser.getAttributeValue(null, "type");
246
								if(currentForm == null)
247
									currentForm = "default";
248
								vForm = "";
249
								break;
250

  
251
							case "ana":
252
								flagAna = true;
253
								vAna ="";
254

  
255
								currentAna = (parser.getAttributeValue(null,"type"));
256
								if(currentAna != null)
257
									currentAna = currentAna.substring(1)// remove the #
258
								else
259
									flagAna = false;
260
								break;
261

  
262
							default:
263

  
264
								if (sendToPAttributes != null) {
265
									//println "should store $localname ? with "+sendToPAttributes.keySet()
266
									if (sendToPAttributes.keySet().contains(localname)) {
267
										//println "store attr of "+localname
268
										List<String> attrs = sendToPAttributes.get(localname);
269
										for (int i= 0 ; i < parser.getAttributeCount() ;i++ ) {
270
											if (attrs.contains(parser.getAttributeLocalName(i))) {
271
												injectedPAttributesValues.put(localname+parser.getAttributeLocalName(i).toLowerCase(),parser.getAttributeValue(i))
272
											}
273 273
										}
274 274
									}
275 275
								}
276
							}
277
							
278
							if (balisesToKeep.contains(localname)) {
279
								output.write("<"+localname);
280
								//println "write <"+localname+"..."
281
								//write attributes
282
								boolean idwritten = false;
283
								boolean basewritten = false;
284
								boolean projectwritten = false;
285
								for (int i= 0 ; i < parser.getAttributeCount() ;i++ ) {
286
									String attrname = parser.getAttributeLocalName(i).toLowerCase();
287
									if (attrname == "id")
288
										idwritten = true;
289
									if (attrname == "base")
290
										basewritten = true;
291
									if (attrname == "project")
292
										projectwritten = true;
293
									output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("\"", "'")+"\"" );
276

  
277
								if (balisesToKeep.contains(localname)) {
278
									output.write("<"+localname);
279
									//println "write <"+localname+"..."
280
									//write attributes
281
									boolean idwritten = false;
282
									boolean basewritten = false;
283
									boolean projectwritten = false;
284
									for (int i= 0 ; i < parser.getAttributeCount() ;i++ ) {
285
										String attrname = parser.getAttributeLocalName(i).toLowerCase();
286
										if (attrname == "id")
287
											idwritten = true;
288
										if (attrname == "base")
289
											basewritten = true;
290
										if (attrname == "project")
291
											projectwritten = true;
292
										output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("\"", "'")+"\"" );
293
									}
294

  
295
									if (localname.equals("text"))
296
										if (addinfos) {
297
											List<String> attrlist = balisesfound.get(localname);
298

  
299
											if (!idwritten) {
300
												output.write(" id=\""+txtname+"\"")
301
												attrlist.add("id");
302
											}
303
											if (!basewritten) {
304
												output.write(" base=\""+base+"\"");
305
												attrlist.add("base");
306
											}
307
											if (!projectwritten) {
308
												output.write(" project=\""+project+"\"");
309
												attrlist.add("project");
310
											}
311
										}
312

  
313
									// finalize tag
314
									output.write(">\n");
294 315
								}
295
								
296
								if (localname.equals("text"))
297
									if (addinfos) {
298
										List<String> attrlist = balisesfound.get(localname);
299
	
300
										if (!idwritten) {
301
											output.write(" id=\""+txtname+"\"")
302
											attrlist.add("id");
316
						}
317
						break;
318

  
319
					case XMLStreamConstants.END_ELEMENT:
320
						String localname = parser.getLocalName().toLowerCase();
321
						switch (localname) {
322
							case "form":
323
								if(flagForm)
324
									formhash.put(currentForm, vForm);
325
								flagForm = false;
326
								break;
327

  
328
							case "ana":
329
								if(flagAna)
330
									anahash.put(currentAna, vAna);
331
								flagAna = false;
332
								break;
333

  
334
							case "w":
335
								vWord = "";
336
								vWord = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); // get default form
337
								for (String form : formhash.keySet()) // and the others
338
									if (form != "default")
339
										vWord += "\t"+formhash.get(form);
340

  
341
								for (String type : wordattributes.keySet()) // only word id ?
342
									vWord+="\t"+wordattributes.get(type)
343

  
344
								if (sendToPAttributes != null) // word attributes from structure properties
345
								{
346
									//println "injectedPAttributesValues: "+injectedPAttributesValues
347
									for(String pattr : injectedPAttributes)
348
										vWord+="\t"+injectedPAttributesValues.get(pattr) ;//les attributs injecter
349
								}
350

  
351
								for (String type : anahash.keySet()) // word annotations in txm:ana
352
									vWord+="\t"+anahash.get(type)
353

  
354
								output.write(vWord+"\n");
355
								vWord= "";
356
								break;
357

  
358
							default:
359
								if (sendToPAttributes != null) // reset structure properties
360
								{
361
									if (sendToPAttributes.keySet().contains(localname)) {
362
										for (String attr : sendToPAttributes.get(localname)) {
363
											injectedPAttributesValues.put(attr, "N/A")
303 364
										}
304
										if (!basewritten) {
305
											output.write(" base=\""+base+"\"");
306
											attrlist.add("base");
307
										}
308
										if (!projectwritten) {
309
											output.write(" project=\""+project+"\"");
310
											attrlist.add("project");
311
										}											
312
									}	
313
											
314
								// finalize tag
315
								output.write(">\n");
316
							}
317
					}
318
					break;
319
				
320
				case XMLStreamConstants.END_ELEMENT:
321
					String localname = parser.getLocalName().toLowerCase();
322
					switch (localname) {
323
						case "form":
324
							if(flagForm)
325
								formhash.put(currentForm, vForm);
326
							flagForm = false; 
327
							break;
328
						
329
						case "ana":
330
							if(flagAna)
331
								anahash.put(currentAna, vAna);
332
							flagAna = false;
333
							break;
334
						
335
						case "w":
336
							vWord = "";
337
							vWord = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); // get default form
338
							for (String form : formhash.keySet()) // and the others
339
								if (form != "default")
340
									vWord += "\t"+formhash.get(form);
341
							
342
							for (String type : wordattributes.keySet()) // only word id ?
343
								vWord+="\t"+wordattributes.get(type)
344
						
345
							if (sendToPAttributes != null) // word attributes from structure properties
346
							{
347
								//println "injectedPAttributesValues: "+injectedPAttributesValues
348
								for(String pattr : injectedPAttributes)
349
									vWord+="\t"+injectedPAttributesValues.get(pattr) ;//les attributs injecter
350
							}
351
							
352
							for (String type : anahash.keySet()) // word annotations in txm:ana
353
								vWord+="\t"+anahash.get(type)
354
							
355
							output.write(vWord+"\n");
356
							vWord= "";
357
							break;
358
						
359
						default:
360
							if (sendToPAttributes != null) // reset structure properties
361
							{
362
								if (sendToPAttributes.keySet().contains(localname)) {
363
									for (String attr : sendToPAttributes.get(localname)) {
364
										injectedPAttributesValues.put(attr, "N/A") 
365 365
									}
366 366
								}
367
							}
368
							
369
							if (balisesToKeep.contains(localname)) {
370
								output.write("</"+localname+">\n");
371
							}
372
					}
373
					break;
374
				
375
				case XMLStreamConstants.CHARACTERS:
376
					if (flagForm) {
377
						vForm += parser.getText().trim();
378
					}
379
					if (flagAna) {
380
						vAna += parser.getText().trim();
381
					}
382
					break;
367

  
368
								if (balisesToKeep.contains(localname)) {
369
									output.write("</"+localname+">\n");
370
								}
371
						}
372
						break;
373

  
374
					case XMLStreamConstants.CHARACTERS:
375
						if (flagForm) {
376
							vForm += parser.getText().trim();
377
						}
378
						if (flagAna) {
379
							vAna += parser.getText().trim();
380
						}
381
						break;
382
				}
383 383
			}
384
			//output.write("</txmcorpus>\n");
385
			output.close();
386
			parser.close();
387
		} catch (Exception ex) {
388
			println "Error while parsing $url : "+ex
389
			ex.printStackTrace();
390
			return false;
384 391
		}
385
		//output.write("</txmcorpus>\n"); 
386
		output.close();
387
		parser.close();
388
		
389 392
		return true;
390 393
	}
391
	
394

  
392 395
	/**
393 396
	 * Gets the p attributes.
394 397
	 *
......
397 400
	public List<String> getpAttributs()
398 401
	{
399 402
		def pAttributs = [];
400
		
403

  
401 404
		for (String wordattr : wordattributes.keySet()) {
402 405
			pAttributs.add(wordattr);
403 406
		}
404
		
407

  
405 408
		if (sendToPAttributes != null)
406 409
			for (String pAttr : this.injectedPAttributes)
407 410
				pAttributs.add(pAttr);
408
		
411

  
409 412
		for (String anakey : anahash.keySet()) {
410 413
			pAttributs.add(anakey);
411 414
		}
412
		
415

  
413 416
		return pAttributs;
414 417
	}
415
	
418

  
416 419
	/**
417 420
	 * Gets the s attributes.
418 421
	 *
......
428 431
			for (String attr : sAtt) {
429 432
				attributes+="+"+attr;
430 433
			}
431
			
434

  
432 435
			if (sAtt.size() > 0)
433 436
				sAttributs.add(balise +":"+attributes);
434 437
			else
......
436 439
		}
437 440
		return sAttributs;
438 441
	}
439
	
442

  
440 443
	/**
441 444
	 * Sets the balises to keep.
442 445
	 *
......
449 452
		else
450 453
			println("Warning: the list of elements to keep is null")
451 454
	}
452
	
455

  
453 456
	/**
454 457
	 * Sets the default reference pattern
455 458
	 * TODO: not implemented
......
463 466
			defaultReferencePattern = pattern;
464 467
		}
465 468
	}
466
	
467
	
469

  
470

  
468 471
	/**
469 472
	 * Sets the send to p attributes.
470 473
	 *
......
477 480
		else
478 481
			println("Warning: the pAttributes to inject is null")
479 482
	}
480
	
481
	
483

  
484

  
482 485
	/**
483 486
	 * The main method.
484 487
	 *
485 488
	 * @param args the arguments
486 489
	 */
487 490
	public static void main(String[] args) {
488
		
491

  
489 492
		String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER";
490
		
493

  
491 494
		File srcfile = new File(rootDir,"CORNEILLEP_AGESILAS_1666.xml");
492 495
		println srcfile.exists()
493 496
		File cqpfile = new File(rootDir, "out/CORNEILLEP_AGESILAS_1666.cqp");
494 497
		new File(rootDir,"out").deleteDir()
495 498
		new File(rootDir,"out").mkdir()
496
		
499

  
497 500
		System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile);
498 501
		def builder = new XMLTXM2CQP(srcfile.toURL());
499 502
		def balises = ["text", "s"];
500 503
		builder.setBalisesToKeep(balises);
501 504
		builder.transformFile(cqpfile);
502
		
505

  
503 506
		println("SATTRIBUTS: "+builder.getsAttributs());
504 507
		println("PATTRIBUTS: "+builder.getpAttributs());
505 508
		return;

Also available in: Unified diff