root / tmp / org.txm.core / src / java / org / txm / scripts / importer / XMLTXM2WTC.groovy @ 2473
History | View | Annotate | Download (13 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
|
25 |
// $LastChangedRevision: 3426 $
|
26 |
// $LastChangedBy: mdecorde $
|
27 |
//
|
28 |
package org.txm.scripts.importer
|
29 |
|
30 |
import java.text.DateFormat; |
31 |
import java.util.Date; |
32 |
import java.util.ArrayList; |
33 |
import java.util.HashMap; |
34 |
import java.util.LinkedHashMap; |
35 |
import javax.xml.stream.*; |
36 |
import java.net.URL; |
37 |
import org.txm.importer.filters.*; |
38 |
// TODO: Auto-generated Javadoc
|
39 |
|
40 |
/**
|
41 |
* The Class XMLTXM2CQP.
|
42 |
*
|
43 |
* @author mdecorde
|
44 |
 * Simple transformation of an XML-TEI-TXM file into a CQP file.
|
45 |
*/
|
46 |
|
47 |
class XMLTXM2CQP |
48 |
{ |
49 |
|
50 |
/** The url. */
|
51 |
private def url; |
52 |
|
53 |
/** The input data. */
|
54 |
private def inputData; |
55 |
|
56 |
/** The factory. */
|
57 |
private def factory; |
58 |
|
59 |
/** The parser. */
|
60 |
private XMLStreamReader parser;
|
61 |
|
62 |
/** The output. */
|
63 |
private def output; |
64 |
|
65 |
/** The hashmap of txm:form and txm:ana values and the attributes hash*/
|
66 |
LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>(); |
67 |
LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>(); |
68 |
LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>(); |
69 |
|
70 |
/** The balisesfound. */
|
71 |
HashMap<String, List<String>> balisesfound;// = new HashMap<String, List<String>>(); |
72 |
|
73 |
/** The balises to keep. */
|
74 |
List<String> balisesToKeep; |
75 |
|
76 |
/** The send to p attributes. */
|
77 |
HashMap <String, List<String>> sendToPAttributes;// = new HashMap<String, List<String>>(); |
78 |
|
79 |
/** The injected p attributes. */
|
80 |
List<String> injectedPAttributes = new ArrayList<String>(); |
81 |
|
82 |
/** The default reference : a pattern + the properties to use */
|
83 |
List<String> defaultReferences = new ArrayList<String>(); |
84 |
String defaultReferencePattern;
|
85 |
|
86 |
/** The injected p attributes values. */
|
87 |
HashMap <String, String> injectedPAttributesValues;// = new ArrayList<String>(); |
88 |
|
89 |
/** The addinfos. */
|
90 |
boolean addinfos = false; |
91 |
|
92 |
/** The txtname. */
|
93 |
String txtname;
|
94 |
|
95 |
/** The base. */
|
96 |
String base;
|
97 |
|
98 |
/** The project. */
|
99 |
String project;
|
100 |
|
101 |
/** The lang. */
|
102 |
public String lang= "fr"; |
103 |
public String currentForm; |
104 |
public String currentAna; |
105 |
|
106 |
/**
 * Records the language code of the corpus being converted.
 *
 * @param lang the language code to store (the field defaults to "fr")
 * @return the language code that was just stored (the method is untyped,
 *         so Groovy hands back the value of the assignment)
 */
public setLang(String lang) {
	this.lang = lang
	// Same value the bare assignment expression used to yield implicitly.
	return this.lang
}
116 |
|
117 |
/**
 * Opens the XML-TXM source and prepares a StAX reader over it.
 *
 * Failures are reported on standard output rather than thrown; in that
 * case {@code parser} stays null and a later {@code transformFile} call
 * fails and returns false.
 *
 * @param url location of the XML-TXM file to convert
 */
public XMLTXM2CQP(URL url) {
	this.url = url;
	try {
		inputData = url.openStream();
		factory = XMLInputFactory.newInstance();
		parser = factory.createXMLStreamReader(inputData);
	} catch (XMLStreamException ex) {
		System.out.println("XMLStreamException while parsing " + url + ": " + ex);
		// The stream is useless without a reader: release it instead of leaking it.
		if (inputData != null) {
			try {
				inputData.close();
			} catch (IOException ignored) {
				// best-effort cleanup, the original failure was already reported
			}
			inputData = null;
		}
	} catch (IOException ex) {
		// Name the source and the cause; the former message stopped after "parsing ".
		System.out.println("IOException while parsing " + url + ": " + ex);
	}
}
137 |
|
138 |
/**
 * Supplies the text metadata that {@code transformFile} adds to a
 * {@code text} element when the source does not carry the matching
 * {@code id}, {@code base} or {@code project} attribute, and switches
 * that injection on.
 *
 * @param name the text name, written as the {@code id} attribute
 * @param base the value of the {@code base} attribute
 * @param project the value of the {@code project} attribute
 */
public void setTextInfo(String name, String base, String project) {
	// Store the three values first, then enable their injection.
	this.project = project
	this.base = base
	this.txtname = name
	this.addinfos = true
}
152 |
|
153 |
/**
 * Opens the UTF-8 writer used for the CQP output.
 *
 * When the target file already exists it is opened in append mode,
 * otherwise it is created.
 *
 * @param outfile the file to write to
 * @return true if the writer could be opened, false otherwise (the
 *         exception is printed on standard error)
 */
private boolean createOutput(File outfile) {
	boolean opened = false
	try {
		// Append to an existing file instead of truncating it.
		boolean append = outfile.exists()
		def stream = new FileOutputStream(outfile, append)
		output = new BufferedWriter(new OutputStreamWriter(stream, "UTF-8"))
		opened = true
	} catch (Exception e) {
		System.err.println(e)
	}
	return opened
}
169 |
|
170 |
/** The haspb. */
|
171 |
boolean haspb = false; |
172 |
|
173 |
/** The haslb. */
|
174 |
boolean haslb = false; |
175 |
|
176 |
/**
 * Converts the XML-TXM source opened by the constructor into a CQP file.
 *
 * Elements listed in {@code balisesToKeep} are written as tags (one per
 * line) with their attributes. Each {@code w} element becomes one
 * tab-separated line made of: the default form, the other forms, the word
 * id, the structure attributes selected with {@code setSendToPAttributes}
 * ("N/A" while the structure is not open) and the {@code ana} values.
 * The elements and attributes actually met are recorded for
 * {@code getsAttributs()} and {@code getpAttributs()}.
 *
 * A word without a default form makes the conversion fail (false is
 * returned) rather than silently producing an empty token.
 *
 * @param outfile the CQP file to write (appended to if it already exists)
 * @return true if the whole document was converted, false if no element
 *         list was set, the output could not be opened or an error
 *         occurred while parsing or writing
 */
public boolean transformFile(File outfile) {
	if (balisesToKeep == null) {
		println "no element has been defined to be keeped"
		return false;
	}

	haspb = false;
	haslb = false;

	boolean flagAna = false;
	boolean flagForm = false;
	String vWord = "";
	String vForm = "";
	String vAna = "";

	wordattributes = [:];
	balisesfound = new HashMap<String, List<String>>();

	if (!createOutput(outfile))
		return false;

	if (sendToPAttributes != null) {
		// Rebuild the list from scratch so that a second call does not duplicate columns.
		injectedPAttributes.clear();
		injectedPAttributesValues = [:];
		for (String tag : sendToPAttributes.keySet()) {
			for (String attr : sendToPAttributes.get(tag)) {
				injectedPAttributes.add(tag + attr);
				// Words met before the structure opens get the same marker as after it closes.
				injectedPAttributesValues.put(tag + attr, "N/A");
			}
		}
	}

	//output.write("<txmcorpus lang=\""+lang+"\">\n");
	balisesfound.put("txmcorpus", ["lang"]);

	boolean success = false;
	try {
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					String localname = parser.getLocalName().toLowerCase();

					// we will only declare found tags in cwb registry
					if (balisesToKeep.contains(localname)) {
						if (!balisesfound.containsKey(localname)) {
							balisesfound.put(localname, []);
						}

						List<String> attrlist = balisesfound.get(localname);
						for (int i = 0; i < parser.getAttributeCount(); i++) {
							if (!attrlist.contains(parser.getAttributeLocalName(i)))
								attrlist.add(parser.getAttributeLocalName(i));
						}
					}

					switch (localname) {
						case "w": // get word id !!
							wordattributes.put("id", parser.getAttributeValue(null, "id"));
							break;

						case "form":
							flagForm = true;
							currentForm = parser.getAttributeValue(null, "type");
							if (currentForm == null)
								currentForm = "default";
							vForm = "";
							break;

						case "ana":
							vAna = "";
							currentAna = parser.getAttributeValue(null, "type");
							// Strip the leading '#' only when it is really there.
							if (currentAna != null && currentAna.startsWith("#"))
								currentAna = currentAna.substring(1);
							// An annotation without a usable type cannot be named: ignore it.
							flagAna = (currentAna != null && currentAna.length() > 0);
							break;

						default:
							if (sendToPAttributes != null && sendToPAttributes.containsKey(localname)) {
								List<String> attrs = sendToPAttributes.get(localname);
								for (int i = 0; i < parser.getAttributeCount(); i++) {
									String name = parser.getAttributeLocalName(i);
									if (attrs.contains(name)) {
										// Same key as in injectedPAttributes (tag + attribute as configured).
										injectedPAttributesValues.put(localname + name, parser.getAttributeValue(i));
									}
								}
							}

							if (balisesToKeep.contains(localname)) {
								output.write("<" + localname);

								// write attributes
								boolean idwritten = false;
								boolean basewritten = false;
								boolean projectwritten = false;
								for (int i = 0; i < parser.getAttributeCount(); i++) {
									String attrname = parser.getAttributeLocalName(i).toLowerCase();
									if (attrname == "id")
										idwritten = true;
									if (attrname == "base")
										basewritten = true;
									if (attrname == "project")
										projectwritten = true;

									output.write(" " + attrname + "=\"" + parser.getAttributeValue(i).replace("&", "&amp;").replace("\"", "&quot;") + "\"");
								}

								if (localname.equals("text") && addinfos) {
									List<String> attrlist = balisesfound.get(localname);

									// Declare each injected attribute once, even with several text elements.
									if (!idwritten) {
										output.write(" id=\"" + txtname + "\"");
										if (!attrlist.contains("id")) attrlist.add("id");
									}
									if (!basewritten) {
										output.write(" base=\"" + base + "\"");
										if (!attrlist.contains("base")) attrlist.add("base");
									}
									if (!projectwritten) {
										output.write(" project=\"" + project + "\"");
										if (!attrlist.contains("project")) attrlist.add("project");
									}
								}

								// finalize tag
								output.write(">\n");
							}
					}
					break;

				case XMLStreamConstants.END_ELEMENT:
					String localname = parser.getLocalName().toLowerCase();
					switch (localname) {
						case "form":
							if (flagForm)
								formhash.put(currentForm, vForm);
							flagForm = false;
							break;

						case "ana":
							if (flagAna)
								anahash.put(currentAna, vAna);
							flagAna = false;
							break;

						case "w":
							// default form first, escaped so that it cannot be read as markup
							vWord = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
							for (String form : formhash.keySet()) // and the others
								if (form != "default")
									vWord += "\t" + formhash.get(form);

							for (String type : wordattributes.keySet()) // only word id ?
								vWord += "\t" + wordattributes.get(type);

							if (sendToPAttributes != null) { // word attributes from structure properties
								for (String pattr : injectedPAttributes)
									vWord += "\t" + injectedPAttributesValues.get(pattr);
							}

							for (String type : anahash.keySet()) // word annotations in txm:ana
								vWord += "\t" + anahash.get(type);

							output.write(vWord + "\n");
							vWord = "";
							break;

						default:
							// reset structure properties, under the key the word lines read
							if (sendToPAttributes != null && sendToPAttributes.containsKey(localname)) {
								for (String attr : sendToPAttributes.get(localname)) {
									injectedPAttributesValues.put(localname + attr, "N/A");
								}
							}

							if (balisesToKeep.contains(localname)) {
								output.write("</" + localname + ">\n");
							}
					}
					break;

				case XMLStreamConstants.CHARACTERS:
					if (flagForm) {
						vForm += parser.getText().trim();
					}
					if (flagAna) {
						vAna += parser.getText().trim();
					}
					break;
			}
		}
		//output.write("</txmcorpus>\n");
		// Closing inside the try keeps a failed flush reported as a failed conversion.
		output.close();
		if (parser != null) parser.close();
		if (inputData != null) inputData.close();
		success = true;
	} catch (Exception ex) {
		println "Error while parsing $url : "+ex
		ex.printStackTrace();
	} finally {
		if (!success) {
			// Release everything, including the writer, which used to stay open on errors.
			closeSilently(output);
			closeSilently(parser);
			closeSilently(inputData);
		}
	}
	return success;
}

/** Closes a resource if it is set, ignoring any error raised while closing. */
private void closeSilently(def resource) {
	if (resource == null) return;
	try {
		resource.close();
	} catch (Exception ignored) {
		// best-effort cleanup: the conversion failure has already been reported
	}
}
398 |
|
399 |
/**
 * Lists the names of the word-level columns, in this order: the word
 * attributes, the injected structure attributes (only when some were
 * configured with {@code setSendToPAttributes}) and the annotation types
 * collected from the {@code ana} elements.
 *
 * @return a new list holding the positional attribute names
 */
public List<String> getpAttributs() {
	List<String> names = new ArrayList<String>(wordattributes.keySet())
	if (sendToPAttributes != null) {
		names.addAll(this.injectedPAttributes)
	}
	names.addAll(anahash.keySet())
	return names
}
422 |
|
423 |
/**
 * Builds one declaration per structural element recorded by
 * {@code transformFile}: the element name alone when it has no attribute,
 * otherwise the name followed by ":" and each attribute prefixed with "+"
 * (for instance {@code text:+id+base}). The recorded map is also printed
 * on standard output.
 *
 * @return a new list holding the structural attribute declarations
 */
public List<String> getsAttributs() {
	println balisesfound
	List<String> declarations = new ArrayList<String>()
	for (Map.Entry<String, List<String>> found : this.balisesfound.entrySet()) {
		String element = found.getKey()
		List<String> attrs = found.getValue()
		if (attrs.size() == 0) {
			declarations.add(element)
			continue
		}
		StringBuilder suffix = new StringBuilder()
		for (String attr : attrs) {
			suffix.append("+").append(attr)
		}
		declarations.add(element + ":" + suffix.toString())
	}
	return declarations
}
446 |
|
447 |
/**
 * Defines which elements are kept as structures in the output.
 * A null list is rejected with a warning and the previous value is kept.
 *
 * @param balisesToKeep the names of the elements to keep
 */
public void setBalisesToKeep(List<String> balisesToKeep) {
	if (balisesToKeep == null) {
		println("Warning: the list of elements to keep is null")
		return
	}
	this.balisesToKeep = balisesToKeep
}
459 |
|
460 |
/**
 * Sets the default reference: a pattern plus the structure properties it
 * uses. The values are only stored; nothing in this class reads them yet.
 * A null pattern leaves the current settings untouched.
 *
 * @param pattern the reference pattern
 * @param strucProperties the structure properties used by the pattern;
 *        null is stored as an empty list
 */
public void setDefaultReference(String pattern, List<String> strucProperties) {
	// Test the argument, not the field: the field starts out null, so the
	// former check could never pass.
	if (pattern != null) {
		this.defaultReferencePattern = pattern;
		// Store the caller's list (the former code assigned the field to itself).
		this.defaultReferences = (strucProperties != null) ? strucProperties : new ArrayList<String>();
	}
}
473 |
|
474 |
|
475 |
/**
 * Selects the structure attributes whose values are copied onto every
 * word line. A null map is rejected with a warning and the previous value
 * is kept.
 *
 * @param sendus element name mapped to the attribute names to inject
 */
public void setSendToPAttributes(HashMap<String, List<String>> sendus) {
	if (sendus == null) {
		println("Warning: the pAttributes to inject is null")
		return
	}
	this.sendToPAttributes = sendus
}
487 |
|
488 |
|
489 |
/**
 * Manual test entry point: converts one XML-TXM file into
 * {@code out/<name>.cqp} under the root directory (the {@code out}
 * directory is deleted and recreated first) and prints the declared
 * structural and positional attributes.
 *
 * @param args optional: {@code args[0]} is the root directory and
 *        {@code args[1]} the source file name; the historical hard-coded
 *        values are used when they are missing
 */
public static void main(String[] args) {
	String rootDir = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER";
	String srcname = (args != null && args.length > 1) ? args[1] : "CORNEILLEP_AGESILAS_1666.xml";

	// Output name = source name with its extension replaced by ".cqp".
	int dot = srcname.lastIndexOf('.');
	String stem = (dot > 0) ? srcname.substring(0, dot) : srcname;

	File srcfile = new File(rootDir, srcname);
	println srcfile.exists()
	File cqpfile = new File(rootDir, "out/" + stem + ".cqp");
	new File(rootDir, "out").deleteDir()
	new File(rootDir, "out").mkdir()

	System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile);
	// File.toURL() is deprecated because it does not escape special characters; go through the URI.
	def builder = new XMLTXM2CQP(srcfile.toURI().toURL());
	def balises = ["text", "s"];
	builder.setBalisesToKeep(balises);
	builder.transformFile(cqpfile);

	println("SATTRIBUTS: "+builder.getsAttributs());
	println("PATTRIBUTS: "+builder.getpAttributs());
}
514 |
} |
515 |
|