Révision 2553
tmp/org.txm.core/src/java/org/txm/scripts/importer/XMLTXM2WTC.groovy (revision 2553) | ||
---|---|---|
46 | 46 |
|
47 | 47 |
class XMLTXM2CQP |
48 | 48 |
{ |
49 |
|
|
49 |
|
|
50 | 50 |
/** The url. */ |
51 | 51 |
private def url; |
52 |
|
|
52 |
|
|
53 | 53 |
/** The input data. */ |
54 | 54 |
private def inputData; |
55 |
|
|
55 |
|
|
56 | 56 |
/** The factory. */ |
57 | 57 |
private def factory; |
58 |
|
|
58 |
|
|
59 | 59 |
/** The parser. */ |
60 | 60 |
private XMLStreamReader parser; |
61 |
|
|
61 |
|
|
62 | 62 |
/** The output. */ |
63 | 63 |
private def output; |
64 |
|
|
64 |
|
|
65 | 65 |
/** The hashmap of txm:form and txm:ana values and the attributes hash*/ |
66 | 66 |
LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>(); |
67 | 67 |
LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>(); |
68 | 68 |
LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>(); |
69 |
|
|
69 |
|
|
70 | 70 |
/** Found element names, mapped to the list of attribute names seen on them. */ |
71 | 71 |
HashMap<String, List<String>> balisesfound;// = new HashMap<String, List<String>>(); |
72 |
|
|
72 |
|
|
73 | 73 |
/** The names of the elements (tags) to keep in the output. */ |
74 | 74 |
List<String> balisesToKeep; |
75 |
|
|
75 |
|
|
76 | 76 |
/** The send to p attributes. */ |
77 | 77 |
HashMap <String, List<String>> sendToPAttributes;// = new HashMap<String, List<String>>(); |
78 |
|
|
78 |
|
|
79 | 79 |
/** The injected p attributes. */ |
80 | 80 |
List<String> injectedPAttributes = new ArrayList<String>(); |
81 |
|
|
81 |
|
|
82 | 82 |
/** The default reference : a pattern + the properties to use */ |
83 | 83 |
List<String> defaultReferences = new ArrayList<String>(); |
84 | 84 |
String defaultReferencePattern; |
85 |
|
|
85 |
|
|
86 | 86 |
/** The injected p attributes values. */ |
87 | 87 |
HashMap <String, String> injectedPAttributesValues;// = new ArrayList<String>(); |
88 |
|
|
88 |
|
|
89 | 89 |
/** The addinfos. */ |
90 | 90 |
boolean addinfos = false; |
91 |
|
|
91 |
|
|
92 | 92 |
/** The txtname. */ |
93 | 93 |
String txtname; |
94 |
|
|
94 |
|
|
95 | 95 |
/** The base. */ |
96 | 96 |
String base; |
97 |
|
|
97 |
|
|
98 | 98 |
/** The project. */ |
99 | 99 |
String project; |
100 |
|
|
100 |
|
|
101 | 101 |
/** The lang. */ |
102 | 102 |
public String lang= "fr"; |
103 | 103 |
public String currentForm; |
104 | 104 |
public String currentAna; |
105 |
|
|
105 |
|
|
106 | 106 |
/** |
107 | 107 |
* Sets the lang. |
108 | 108 |
* |
... | ... | |
113 | 113 |
{ |
114 | 114 |
this.lang = lang; |
115 | 115 |
} |
116 |
|
|
116 |
|
|
117 | 117 |
/** |
118 | 118 |
* Instantiates a new XMLTXM2CQP. |
119 | 119 |
* |
... | ... | |
124 | 124 |
this.url = url; |
125 | 125 |
inputData = url.openStream(); |
126 | 126 |
factory = XMLInputFactory.newInstance(); |
127 |
|
|
127 |
|
|
128 | 128 |
parser = factory.createXMLStreamReader(inputData); |
129 |
|
|
130 |
|
|
129 |
|
|
130 |
|
|
131 | 131 |
} catch (XMLStreamException ex) { |
132 | 132 |
System.out.println(ex); |
133 | 133 |
}catch (IOException ex) { |
134 | 134 |
System.out.println("IOException while parsing "); |
135 | 135 |
} |
136 | 136 |
} |
137 |
|
|
137 |
|
|
138 | 138 |
/** |
139 | 139 |
* Sets the text info. |
140 | 140 |
* |
... | ... | |
149 | 149 |
this.base = base; |
150 | 150 |
this.project = project; |
151 | 151 |
} |
152 |
|
|
152 |
|
|
153 | 153 |
/** |
154 | 154 |
* Creates the output. |
155 | 155 |
* |
... | ... | |
166 | 166 |
return false; |
167 | 167 |
} |
168 | 168 |
} |
169 |
|
|
169 |
|
|
170 | 170 |
/** The haspb. */ |
171 | 171 |
boolean haspb = false; |
172 |
|
|
172 |
|
|
173 | 173 |
/** The haslb. */ |
174 | 174 |
boolean haslb = false; |
175 |
|
|
175 |
|
|
176 | 176 |
/** |
177 | 177 |
* Transform file. |
178 | 178 |
* |
... | ... | |
186 | 186 |
println "no element has been defined to be keeped" |
187 | 187 |
return false; |
188 | 188 |
} |
189 |
|
|
189 |
|
|
190 | 190 |
haspb = false; |
191 | 191 |
haslb = false; |
192 |
|
|
192 |
|
|
193 | 193 |
boolean flagAna; |
194 | 194 |
boolean flagForm; |
195 | 195 |
boolean flagWord; |
196 | 196 |
String vWord = ""; |
197 | 197 |
String vForm = ""; |
198 | 198 |
String vAna = ""; |
199 |
|
|
199 |
|
|
200 | 200 |
String lb_id = ""; |
201 | 201 |
String pb_id = ""; |
202 |
|
|
202 |
|
|
203 | 203 |
wordattributes = [:]; |
204 | 204 |
balisesfound = new HashMap<String, List<String>>(); |
205 |
|
|
206 |
|
|
205 |
|
|
206 |
|
|
207 | 207 |
if(!createOutput(outfile)) |
208 | 208 |
return false; |
209 |
|
|
209 |
|
|
210 | 210 |
if(sendToPAttributes != null) |
211 | 211 |
{ |
212 | 212 |
for(String tag: sendToPAttributes.keySet()) |
... | ... | |
214 | 214 |
injectedPAttributes.add(tag+attr); |
215 | 215 |
injectedPAttributesValues = [:]; |
216 | 216 |
} |
217 |
|
|
217 |
|
|
218 | 218 |
//output.write("<txmcorpus lang=\""+lang+"\">\n"); |
219 | 219 |
balisesfound.put("txmcorpus",["lang"]); |
220 | 220 |
try { |
... | ... | |
222 | 222 |
switch (event) { |
223 | 223 |
case XMLStreamConstants.START_ELEMENT: |
224 | 224 |
String localname = parser.getLocalName().toLowerCase(); |
225 |
|
|
225 |
|
|
226 | 226 |
// we will only declare found tags in cwb registry |
227 | 227 |
if(balisesToKeep.contains(localname)) { |
228 | 228 |
if(!balisesfound.containsKey(localname)) { |
229 | 229 |
balisesfound.put(localname, []); |
230 | 230 |
} |
231 |
|
|
231 |
|
|
232 | 232 |
List<String> attrlist = balisesfound.get(localname); |
233 | 233 |
for (int i= 0 ; i < parser.getAttributeCount() ;i++ ) |
234 | 234 |
if(!attrlist.contains(parser.getAttributeLocalName(i))) |
235 | 235 |
attrlist.add(parser.getAttributeLocalName(i)); |
236 | 236 |
} |
237 |
|
|
237 |
|
|
238 | 238 |
switch (localname) { |
239 | 239 |
case "w": // get word id !! |
240 | 240 |
wordattributes.put("id", parser.getAttributeValue(null, "id")); |
241 | 241 |
break; |
242 |
|
|
242 |
|
|
243 | 243 |
case "form": |
244 | 244 |
flagForm = true; |
245 | 245 |
currentForm = parser.getAttributeValue(null, "type"); |
... | ... | |
247 | 247 |
currentForm = "default"; |
248 | 248 |
vForm = ""; |
249 | 249 |
break; |
250 |
|
|
250 |
|
|
251 | 251 |
case "ana": |
252 | 252 |
flagAna = true; |
253 | 253 |
vAna =""; |
254 |
|
|
254 |
|
|
255 | 255 |
currentAna = (parser.getAttributeValue(null,"type")); |
256 | 256 |
if(currentAna != null) |
257 | 257 |
currentAna = currentAna.substring(1)// remove the # |
258 | 258 |
else |
259 | 259 |
flagAna = false; |
260 | 260 |
break; |
261 |
|
|
261 |
|
|
262 | 262 |
default: |
263 |
|
|
263 |
|
|
264 | 264 |
if (sendToPAttributes != null) { |
265 | 265 |
//println "should store $localname ? with "+sendToPAttributes.keySet() |
266 | 266 |
if (sendToPAttributes.keySet().contains(localname)) { |
... | ... | |
273 | 273 |
} |
274 | 274 |
} |
275 | 275 |
} |
276 |
|
|
276 |
|
|
277 | 277 |
if (balisesToKeep.contains(localname)) { |
278 | 278 |
output.write("<"+localname); |
279 | 279 |
//println "write <"+localname+"..." |
... | ... | |
292 | 292 |
|
293 | 293 |
output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("&", "&amp;").replace("\"", "&quot;")+"\"" ); |
294 | 294 |
} |
295 |
|
|
295 |
|
|
296 | 296 |
if (localname.equals("text")) |
297 | 297 |
if (addinfos) { |
298 | 298 |
List<String> attrlist = balisesfound.get(localname); |
299 |
|
|
299 |
|
|
300 | 300 |
if (!idwritten) { |
301 | 301 |
output.write(" id=\""+txtname+"\"") |
302 | 302 |
attrlist.add("id"); |
... | ... | |
310 | 310 |
attrlist.add("project"); |
311 | 311 |
} |
312 | 312 |
} |
313 |
|
|
313 |
|
|
314 | 314 |
// finalize tag |
315 | 315 |
output.write(">\n"); |
316 | 316 |
} |
317 | 317 |
} |
318 | 318 |
break; |
319 |
|
|
319 |
|
|
320 | 320 |
case XMLStreamConstants.END_ELEMENT: |
321 | 321 |
String localname = parser.getLocalName().toLowerCase(); |
322 | 322 |
switch (localname) { |
... | ... | |
325 | 325 |
formhash.put(currentForm, vForm); |
326 | 326 |
flagForm = false; |
327 | 327 |
break; |
328 |
|
|
328 |
|
|
329 | 329 |
case "ana": |
330 | 330 |
if(flagAna) |
331 | 331 |
anahash.put(currentAna, vAna); |
332 | 332 |
flagAna = false; |
333 | 333 |
break; |
334 |
|
|
334 |
|
|
335 | 335 |
case "w": |
336 | 336 |
vWord = ""; |
337 | 337 |
vWord = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); // get default form |
338 | 338 |
for (String form : formhash.keySet()) // and the others |
339 | 339 |
if (form != "default") |
340 | 340 |
vWord += "\t"+formhash.get(form); |
341 |
|
|
341 |
|
|
342 | 342 |
for (String type : wordattributes.keySet()) // only word id ? |
343 | 343 |
vWord+="\t"+wordattributes.get(type) |
344 |
|
|
344 |
|
|
345 | 345 |
if (sendToPAttributes != null) // word attributes from structure properties |
346 | 346 |
{ |
347 | 347 |
//println "injectedPAttributesValues: "+injectedPAttributesValues |
348 | 348 |
for(String pattr : injectedPAttributes) |
349 | 349 |
vWord+="\t"+injectedPAttributesValues.get(pattr) ;//les attributs injecter |
350 | 350 |
} |
351 |
|
|
351 |
|
|
352 | 352 |
for (String type : anahash.keySet()) // word annotations in txm:ana |
353 | 353 |
vWord+="\t"+anahash.get(type) |
354 |
|
|
354 |
|
|
355 | 355 |
output.write(vWord+"\n"); |
356 | 356 |
vWord= ""; |
357 | 357 |
break; |
358 |
|
|
358 |
|
|
359 | 359 |
default: |
360 | 360 |
if (sendToPAttributes != null) // reset structure properties |
361 | 361 |
{ |
... | ... | |
365 | 365 |
} |
366 | 366 |
} |
367 | 367 |
} |
368 |
|
|
368 |
|
|
369 | 369 |
if (balisesToKeep.contains(localname)) { |
370 | 370 |
output.write("</"+localname+">\n"); |
371 | 371 |
} |
372 | 372 |
} |
373 | 373 |
break; |
374 |
|
|
374 |
|
|
375 | 375 |
case XMLStreamConstants.CHARACTERS: |
376 | 376 |
if (flagForm) { |
377 | 377 |
vForm += parser.getText().trim(); |
... | ... | |
385 | 385 |
//output.write("</txmcorpus>\n"); |
386 | 386 |
output.close(); |
387 | 387 |
if (parser != null) parser.close(); |
388 |
if (inputData != null) inputData.close(); |
|
388 |
if (inputData != null) inputData.close();
|
|
389 | 389 |
} catch (Exception ex) { |
390 | 390 |
println "Error while parsing $url : "+ex |
391 | 391 |
ex.printStackTrace(); |
... | ... | |
395 | 395 |
} |
396 | 396 |
return true; |
397 | 397 |
} |
398 |
|
|
398 |
|
|
399 | 399 |
/** |
400 | 400 |
* Gets the p attributs. |
401 | 401 |
* |
... | ... | |
404 | 404 |
public List<String> getpAttributs() {
	// Collects the word-level (positional) attribute names in a fixed order:
	// 1. the keys of wordattributes,
	// 2. the injected p-attributes, only when sendToPAttributes is set,
	// 3. the keys of anahash.
	// NOTE(review): transformFile also writes the non-default entries of
	// formhash as columns, but they are not listed here - confirm this is intended.
	List<String> names = [];

	names.addAll(wordattributes.keySet());

	if (sendToPAttributes != null) {
		names.addAll(this.injectedPAttributes);
	}

	names.addAll(anahash.keySet());

	return names;
}
422 |
|
|
422 |
|
|
423 | 423 |
/** |
424 | 424 |
* Gets the s attributs. |
425 | 425 |
* |
... | ... | |
427 | 427 |
*/ |
428 | 428 |
public List<String> getsAttributs() |
429 | 429 |
{ |
430 |
println balisesfound |
|
431 | 430 |
def sAttributs = []; |
432 | 431 |
for (String balise : this.balisesfound.keySet()) { |
433 | 432 |
List<String> sAtt = this.balisesfound.get(balise); |
... | ... | |
435 | 434 |
for (String attr : sAtt) { |
436 | 435 |
attributes+="+"+attr; |
437 | 436 |
} |
438 |
|
|
437 |
|
|
439 | 438 |
if (sAtt.size() > 0) |
440 | 439 |
sAttributs.add(balise +":"+attributes); |
441 | 440 |
else |
... | ... | |
443 | 442 |
} |
444 | 443 |
return sAttributs; |
445 | 444 |
} |
446 |
|
|
445 |
|
|
447 | 446 |
/** |
448 | 447 |
* Sets the balises to keep. |
449 | 448 |
* |
... | ... | |
456 | 455 |
else |
457 | 456 |
println("Warning: the list of elements to keep is null") |
458 | 457 |
} |
459 |
|
|
458 |
|
|
460 | 459 |
/** |
461 | 460 |
* Sets the default reference pattern |
462 | 461 |
* TODO: not implemented |
... | ... | |
470 | 469 |
defaultReferencePattern = pattern; |
471 | 470 |
} |
472 | 471 |
} |
473 |
|
|
474 |
|
|
472 |
|
|
475 | 473 |
/** |
476 | 474 |
* Sets the send to p attributes. |
477 | 475 |
* |
... | ... | |
484 | 482 |
else |
485 | 483 |
println("Warning: the pAttributes to inject is null") |
486 | 484 |
} |
487 |
|
|
488 |
|
|
485 |
|
|
486 |
|
|
489 | 487 |
/** |
490 | 488 |
* The main method. |
491 | 489 |
* |
492 | 490 |
* @param args the arguments |
493 | 491 |
*/ |
494 | 492 |
public static void main(String[] args) { |
495 |
|
|
493 |
|
|
496 | 494 |
String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER"; |
497 |
|
|
495 |
|
|
498 | 496 |
File srcfile = new File(rootDir,"CORNEILLEP_AGESILAS_1666.xml"); |
499 | 497 |
println srcfile.exists() |
500 | 498 |
File cqpfile = new File(rootDir, "out/CORNEILLEP_AGESILAS_1666.cqp"); |
501 | 499 |
new File(rootDir,"out").deleteDir() |
502 | 500 |
new File(rootDir,"out").mkdir() |
503 |
|
|
501 |
|
|
504 | 502 |
System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile); |
505 | 503 |
def builder = new XMLTXM2CQP(srcfile.toURL()); |
506 | 504 |
def balises = ["text", "s"]; |
507 | 505 |
builder.setBalisesToKeep(balises); |
508 | 506 |
builder.transformFile(cqpfile); |
509 |
|
|
507 |
|
|
510 | 508 |
println("SATTRIBUTS: "+builder.getsAttributs()); |
511 | 509 |
println("PATTRIBUTS: "+builder.getpAttributs()); |
512 | 510 |
return; |
Formats disponibles : Unified diff