Révision 2553
| tmp/org.txm.core/src/java/org/txm/scripts/importer/XMLTXM2WTC.groovy (revision 2553) | ||
|---|---|---|
| 46 | 46 |
|
| 47 | 47 |
class XMLTXM2CQP |
| 48 | 48 |
{
|
| 49 |
|
|
| 49 |
|
|
| 50 | 50 |
/** The url. */ |
| 51 | 51 |
private def url; |
| 52 |
|
|
| 52 |
|
|
| 53 | 53 |
/** The input data. */ |
| 54 | 54 |
private def inputData; |
| 55 |
|
|
| 55 |
|
|
| 56 | 56 |
/** The factory. */ |
| 57 | 57 |
private def factory; |
| 58 |
|
|
| 58 |
|
|
| 59 | 59 |
/** The parser. */ |
| 60 | 60 |
private XMLStreamReader parser; |
| 61 |
|
|
| 61 |
|
|
| 62 | 62 |
/** The output. */ |
| 63 | 63 |
private def output; |
| 64 |
|
|
| 64 |
|
|
| 65 | 65 |
/** The hash maps of txm:form values, txm:ana values and word attributes. */ |
| 66 | 66 |
LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>(); |
| 67 | 67 |
LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>(); |
| 68 | 68 |
LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>(); |
| 69 |
|
|
| 69 |
|
|
| 70 | 70 |
/** The elements (balises) found, each mapped to the list of its attribute names. */ |
| 71 | 71 |
HashMap<String, List<String>> balisesfound;// = new HashMap<String, List<String>>(); |
| 72 |
|
|
| 72 |
|
|
| 73 | 73 |
/** The balises to keep. */ |
| 74 | 74 |
List<String> balisesToKeep; |
| 75 |
|
|
| 75 |
|
|
| 76 | 76 |
/** The send to p attributes. */ |
| 77 | 77 |
HashMap <String, List<String>> sendToPAttributes;// = new HashMap<String, List<String>>(); |
| 78 |
|
|
| 78 |
|
|
| 79 | 79 |
/** The injected p attributes. */ |
| 80 | 80 |
List<String> injectedPAttributes = new ArrayList<String>(); |
| 81 |
|
|
| 81 |
|
|
| 82 | 82 |
/** The default reference : a pattern + the properties to use */ |
| 83 | 83 |
List<String> defaultReferences = new ArrayList<String>(); |
| 84 | 84 |
String defaultReferencePattern; |
| 85 |
|
|
| 85 |
|
|
| 86 | 86 |
/** The injected p attributes values. */ |
| 87 | 87 |
HashMap <String, String> injectedPAttributesValues;// = new ArrayList<String>(); |
| 88 |
|
|
| 88 |
|
|
| 89 | 89 |
/** The addinfos. */ |
| 90 | 90 |
boolean addinfos = false; |
| 91 |
|
|
| 91 |
|
|
| 92 | 92 |
/** The txtname. */ |
| 93 | 93 |
String txtname; |
| 94 |
|
|
| 94 |
|
|
| 95 | 95 |
/** The base. */ |
| 96 | 96 |
String base; |
| 97 |
|
|
| 97 |
|
|
| 98 | 98 |
/** The project. */ |
| 99 | 99 |
String project; |
| 100 |
|
|
| 100 |
|
|
| 101 | 101 |
/** The lang. */ |
| 102 | 102 |
public String lang= "fr"; |
| 103 | 103 |
public String currentForm; |
| 104 | 104 |
public String currentAna; |
| 105 |
|
|
| 105 |
|
|
| 106 | 106 |
/** |
| 107 | 107 |
* Sets the lang. |
| 108 | 108 |
* |
| ... | ... | |
| 113 | 113 |
{
|
| 114 | 114 |
this.lang = lang; |
| 115 | 115 |
} |
| 116 |
|
|
| 116 |
|
|
| 117 | 117 |
/** |
| 118 | 118 |
* Instantiates a new XMLTXM2CQP. |
| 119 | 119 |
* |
| ... | ... | |
| 124 | 124 |
this.url = url; |
| 125 | 125 |
inputData = url.openStream(); |
| 126 | 126 |
factory = XMLInputFactory.newInstance(); |
| 127 |
|
|
| 127 |
|
|
| 128 | 128 |
parser = factory.createXMLStreamReader(inputData); |
| 129 |
|
|
| 130 |
|
|
| 129 |
|
|
| 130 |
|
|
| 131 | 131 |
} catch (XMLStreamException ex) {
|
| 132 | 132 |
System.out.println(ex); |
| 133 | 133 |
}catch (IOException ex) {
|
| 134 | 134 |
System.out.println("IOException while parsing ");
|
| 135 | 135 |
} |
| 136 | 136 |
} |
| 137 |
|
|
| 137 |
|
|
| 138 | 138 |
/** |
| 139 | 139 |
* Sets the text info. |
| 140 | 140 |
* |
| ... | ... | |
| 149 | 149 |
this.base = base; |
| 150 | 150 |
this.project = project; |
| 151 | 151 |
} |
| 152 |
|
|
| 152 |
|
|
| 153 | 153 |
/** |
| 154 | 154 |
* Creates the output. |
| 155 | 155 |
* |
| ... | ... | |
| 166 | 166 |
return false; |
| 167 | 167 |
} |
| 168 | 168 |
} |
| 169 |
|
|
| 169 |
|
|
| 170 | 170 |
/** The haspb. */ |
| 171 | 171 |
boolean haspb = false; |
| 172 |
|
|
| 172 |
|
|
| 173 | 173 |
/** The haslb. */ |
| 174 | 174 |
boolean haslb = false; |
| 175 |
|
|
| 175 |
|
|
| 176 | 176 |
/** |
| 177 | 177 |
* Transform file. |
| 178 | 178 |
* |
| ... | ... | |
| 186 | 186 |
println "no element has been defined to be keeped" |
| 187 | 187 |
return false; |
| 188 | 188 |
} |
| 189 |
|
|
| 189 |
|
|
| 190 | 190 |
haspb = false; |
| 191 | 191 |
haslb = false; |
| 192 |
|
|
| 192 |
|
|
| 193 | 193 |
boolean flagAna; |
| 194 | 194 |
boolean flagForm; |
| 195 | 195 |
boolean flagWord; |
| 196 | 196 |
String vWord = ""; |
| 197 | 197 |
String vForm = ""; |
| 198 | 198 |
String vAna = ""; |
| 199 |
|
|
| 199 |
|
|
| 200 | 200 |
String lb_id = ""; |
| 201 | 201 |
String pb_id = ""; |
| 202 |
|
|
| 202 |
|
|
| 203 | 203 |
wordattributes = [:]; |
| 204 | 204 |
balisesfound = new HashMap<String, List<String>>(); |
| 205 |
|
|
| 206 |
|
|
| 205 |
|
|
| 206 |
|
|
| 207 | 207 |
if(!createOutput(outfile)) |
| 208 | 208 |
return false; |
| 209 |
|
|
| 209 |
|
|
| 210 | 210 |
if(sendToPAttributes != null) |
| 211 | 211 |
{
|
| 212 | 212 |
for(String tag: sendToPAttributes.keySet()) |
| ... | ... | |
| 214 | 214 |
injectedPAttributes.add(tag+attr); |
| 215 | 215 |
injectedPAttributesValues = [:]; |
| 216 | 216 |
} |
| 217 |
|
|
| 217 |
|
|
| 218 | 218 |
//output.write("<txmcorpus lang=\""+lang+"\">\n");
|
| 219 | 219 |
balisesfound.put("txmcorpus",["lang"]);
|
| 220 | 220 |
try {
|
| ... | ... | |
| 222 | 222 |
switch (event) {
|
| 223 | 223 |
case XMLStreamConstants.START_ELEMENT: |
| 224 | 224 |
String localname = parser.getLocalName().toLowerCase(); |
| 225 |
|
|
| 225 |
|
|
| 226 | 226 |
// we will only declare found tags in cwb registry |
| 227 | 227 |
if(balisesToKeep.contains(localname)) {
|
| 228 | 228 |
if(!balisesfound.containsKey(localname)) {
|
| 229 | 229 |
balisesfound.put(localname, []); |
| 230 | 230 |
} |
| 231 |
|
|
| 231 |
|
|
| 232 | 232 |
List<String> attrlist = balisesfound.get(localname); |
| 233 | 233 |
for (int i= 0 ; i < parser.getAttributeCount() ;i++ ) |
| 234 | 234 |
if(!attrlist.contains(parser.getAttributeLocalName(i))) |
| 235 | 235 |
attrlist.add(parser.getAttributeLocalName(i)); |
| 236 | 236 |
} |
| 237 |
|
|
| 237 |
|
|
| 238 | 238 |
switch (localname) {
|
| 239 | 239 |
case "w": // get word id !! |
| 240 | 240 |
wordattributes.put("id", parser.getAttributeValue(null, "id"));
|
| 241 | 241 |
break; |
| 242 |
|
|
| 242 |
|
|
| 243 | 243 |
case "form": |
| 244 | 244 |
flagForm = true; |
| 245 | 245 |
currentForm = parser.getAttributeValue(null, "type"); |
| ... | ... | |
| 247 | 247 |
currentForm = "default"; |
| 248 | 248 |
vForm = ""; |
| 249 | 249 |
break; |
| 250 |
|
|
| 250 |
|
|
| 251 | 251 |
case "ana": |
| 252 | 252 |
flagAna = true; |
| 253 | 253 |
vAna =""; |
| 254 |
|
|
| 254 |
|
|
| 255 | 255 |
currentAna = (parser.getAttributeValue(null,"type")); |
| 256 | 256 |
if(currentAna != null) |
| 257 | 257 |
currentAna = currentAna.substring(1)// remove the # |
| 258 | 258 |
else |
| 259 | 259 |
flagAna = false; |
| 260 | 260 |
break; |
| 261 |
|
|
| 261 |
|
|
| 262 | 262 |
default: |
| 263 |
|
|
| 263 |
|
|
| 264 | 264 |
if (sendToPAttributes != null) {
|
| 265 | 265 |
//println "should store $localname ? with "+sendToPAttributes.keySet() |
| 266 | 266 |
if (sendToPAttributes.keySet().contains(localname)) {
|
| ... | ... | |
| 273 | 273 |
} |
| 274 | 274 |
} |
| 275 | 275 |
} |
| 276 |
|
|
| 276 |
|
|
| 277 | 277 |
if (balisesToKeep.contains(localname)) {
|
| 278 | 278 |
output.write("<"+localname);
|
| 279 | 279 |
//println "write <"+localname+"..." |
| ... | ... | |
| 292 | 292 |
|
| 293 | 293 |
output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("&", "&amp;").replace("\"", "&quot;")+"\"" );
|
| 294 | 294 |
} |
| 295 |
|
|
| 295 |
|
|
| 296 | 296 |
if (localname.equals("text"))
|
| 297 | 297 |
if (addinfos) {
|
| 298 | 298 |
List<String> attrlist = balisesfound.get(localname); |
| 299 |
|
|
| 299 |
|
|
| 300 | 300 |
if (!idwritten) {
|
| 301 | 301 |
output.write(" id=\""+txtname+"\"")
|
| 302 | 302 |
attrlist.add("id");
|
| ... | ... | |
| 310 | 310 |
attrlist.add("project");
|
| 311 | 311 |
} |
| 312 | 312 |
} |
| 313 |
|
|
| 313 |
|
|
| 314 | 314 |
// finalize tag |
| 315 | 315 |
output.write(">\n");
|
| 316 | 316 |
} |
| 317 | 317 |
} |
| 318 | 318 |
break; |
| 319 |
|
|
| 319 |
|
|
| 320 | 320 |
case XMLStreamConstants.END_ELEMENT: |
| 321 | 321 |
String localname = parser.getLocalName().toLowerCase(); |
| 322 | 322 |
switch (localname) {
|
| ... | ... | |
| 325 | 325 |
formhash.put(currentForm, vForm); |
| 326 | 326 |
flagForm = false; |
| 327 | 327 |
break; |
| 328 |
|
|
| 328 |
|
|
| 329 | 329 |
case "ana": |
| 330 | 330 |
if(flagAna) |
| 331 | 331 |
anahash.put(currentAna, vAna); |
| 332 | 332 |
flagAna = false; |
| 333 | 333 |
break; |
| 334 |
|
|
| 334 |
|
|
| 335 | 335 |
case "w": |
| 336 | 336 |
vWord = ""; |
| 337 | 337 |
vWord = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); // get default form
|
| 338 | 338 |
for (String form : formhash.keySet()) // and the others |
| 339 | 339 |
if (form != "default") |
| 340 | 340 |
vWord += "\t"+formhash.get(form); |
| 341 |
|
|
| 341 |
|
|
| 342 | 342 |
for (String type : wordattributes.keySet()) // only word id ? |
| 343 | 343 |
vWord+="\t"+wordattributes.get(type) |
| 344 |
|
|
| 344 |
|
|
| 345 | 345 |
if (sendToPAttributes != null) // word attributes from structure properties |
| 346 | 346 |
{
|
| 347 | 347 |
//println "injectedPAttributesValues: "+injectedPAttributesValues |
| 348 | 348 |
for(String pattr : injectedPAttributes) |
| 349 | 349 |
vWord+="\t"+injectedPAttributesValues.get(pattr) ;//les attributs injecter |
| 350 | 350 |
} |
| 351 |
|
|
| 351 |
|
|
| 352 | 352 |
for (String type : anahash.keySet()) // word annotations in txm:ana |
| 353 | 353 |
vWord+="\t"+anahash.get(type) |
| 354 |
|
|
| 354 |
|
|
| 355 | 355 |
output.write(vWord+"\n"); |
| 356 | 356 |
vWord= ""; |
| 357 | 357 |
break; |
| 358 |
|
|
| 358 |
|
|
| 359 | 359 |
default: |
| 360 | 360 |
if (sendToPAttributes != null) // reset structure properties |
| 361 | 361 |
{
|
| ... | ... | |
| 365 | 365 |
} |
| 366 | 366 |
} |
| 367 | 367 |
} |
| 368 |
|
|
| 368 |
|
|
| 369 | 369 |
if (balisesToKeep.contains(localname)) {
|
| 370 | 370 |
output.write("</"+localname+">\n");
|
| 371 | 371 |
} |
| 372 | 372 |
} |
| 373 | 373 |
break; |
| 374 |
|
|
| 374 |
|
|
| 375 | 375 |
case XMLStreamConstants.CHARACTERS: |
| 376 | 376 |
if (flagForm) {
|
| 377 | 377 |
vForm += parser.getText().trim(); |
| ... | ... | |
| 385 | 385 |
//output.write("</txmcorpus>\n");
|
| 386 | 386 |
output.close(); |
| 387 | 387 |
if (parser != null) parser.close(); |
| 388 |
if (inputData != null) inputData.close(); |
|
| 388 |
if (inputData != null) inputData.close();
|
|
| 389 | 389 |
} catch (Exception ex) {
|
| 390 | 390 |
println "Error while parsing $url : "+ex |
| 391 | 391 |
ex.printStackTrace(); |
| ... | ... | |
| 395 | 395 |
} |
| 396 | 396 |
return true; |
| 397 | 397 |
} |
| 398 |
|
|
| 398 |
|
|
| 399 | 399 |
/** |
| 400 | 400 |
* Gets the p attributes. |
| 401 | 401 |
* |
| ... | ... | |
| 404 | 404 |
public List<String> getpAttributs() |
| 405 | 405 |
{
|
| 406 | 406 |
def pAttributs = []; |
| 407 |
|
|
| 407 |
|
|
| 408 | 408 |
for (String wordattr : wordattributes.keySet()) {
|
| 409 | 409 |
pAttributs.add(wordattr); |
| 410 | 410 |
} |
| 411 |
|
|
| 411 |
|
|
| 412 | 412 |
if (sendToPAttributes != null) |
| 413 | 413 |
for (String pAttr : this.injectedPAttributes) |
| 414 | 414 |
pAttributs.add(pAttr); |
| 415 |
|
|
| 415 |
|
|
| 416 | 416 |
for (String anakey : anahash.keySet()) {
|
| 417 | 417 |
pAttributs.add(anakey); |
| 418 | 418 |
} |
| 419 |
|
|
| 419 |
|
|
| 420 | 420 |
return pAttributs; |
| 421 | 421 |
} |
| 422 |
|
|
| 422 |
|
|
| 423 | 423 |
/** |
| 424 | 424 |
* Gets the s attributes. |
| 425 | 425 |
* |
| ... | ... | |
| 427 | 427 |
*/ |
| 428 | 428 |
public List<String> getsAttributs() |
| 429 | 429 |
{
|
| 430 |
println balisesfound |
|
| 431 | 430 |
def sAttributs = []; |
| 432 | 431 |
for (String balise : this.balisesfound.keySet()) {
|
| 433 | 432 |
List<String> sAtt = this.balisesfound.get(balise); |
| ... | ... | |
| 435 | 434 |
for (String attr : sAtt) {
|
| 436 | 435 |
attributes+="+"+attr; |
| 437 | 436 |
} |
| 438 |
|
|
| 437 |
|
|
| 439 | 438 |
if (sAtt.size() > 0) |
| 440 | 439 |
sAttributs.add(balise +":"+attributes); |
| 441 | 440 |
else |
| ... | ... | |
| 443 | 442 |
} |
| 444 | 443 |
return sAttributs; |
| 445 | 444 |
} |
| 446 |
|
|
| 445 |
|
|
| 447 | 446 |
/** |
| 448 | 447 |
* Sets the balises to keep. |
| 449 | 448 |
* |
| ... | ... | |
| 456 | 455 |
else |
| 457 | 456 |
println("Warning: the list of elements to keep is null")
|
| 458 | 457 |
} |
| 459 |
|
|
| 458 |
|
|
| 460 | 459 |
/** |
| 461 | 460 |
* Sets the default reference pattern
| 462 | 461 |
* TODO: not implemented |
| ... | ... | |
| 470 | 469 |
defaultReferencePattern = pattern; |
| 471 | 470 |
} |
| 472 | 471 |
} |
| 473 |
|
|
| 474 |
|
|
| 472 |
|
|
| 475 | 473 |
/** |
| 476 | 474 |
* Sets the send to p attributes. |
| 477 | 475 |
* |
| ... | ... | |
| 484 | 482 |
else |
| 485 | 483 |
println("Warning: the pAttributes to inject is null")
|
| 486 | 484 |
} |
| 487 |
|
|
| 488 |
|
|
| 485 |
|
|
| 486 |
|
|
| 489 | 487 |
/** |
| 490 | 488 |
* The main method. |
| 491 | 489 |
* |
| 492 | 490 |
* @param args the arguments |
| 493 | 491 |
*/ |
| 494 | 492 |
public static void main(String[] args) {
|
| 495 |
|
|
| 493 |
|
|
| 496 | 494 |
String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER"; |
| 497 |
|
|
| 495 |
|
|
| 498 | 496 |
File srcfile = new File(rootDir,"CORNEILLEP_AGESILAS_1666.xml"); |
| 499 | 497 |
println srcfile.exists() |
| 500 | 498 |
File cqpfile = new File(rootDir, "out/CORNEILLEP_AGESILAS_1666.cqp"); |
| 501 | 499 |
new File(rootDir,"out").deleteDir() |
| 502 | 500 |
new File(rootDir,"out").mkdir() |
| 503 |
|
|
| 501 |
|
|
| 504 | 502 |
System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile);
|
| 505 | 503 |
def builder = new XMLTXM2CQP(srcfile.toURL()); |
| 506 | 504 |
def balises = ["text", "s"]; |
| 507 | 505 |
builder.setBalisesToKeep(balises); |
| 508 | 506 |
builder.transformFile(cqpfile); |
| 509 |
|
|
| 507 |
|
|
| 510 | 508 |
println("SATTRIBUTS: "+builder.getsAttributs());
|
| 511 | 509 |
println("PATTRIBUTS: "+builder.getpAttributs());
|
| 512 | 510 |
return; |
Formats disponibles : Unified diff