Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / XMLTXM2WTC.groovy @ 2553

History | View | Annotate | Download (13.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer
29

    
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import java.util.HashMap;
34
import java.util.LinkedHashMap;
35
import javax.xml.stream.*;
36
import java.net.URL;
37
import org.txm.importer.filters.*;
38
// TODO: Auto-generated Javadoc
39

    
40
/**
41
 * The Class XMLTXM2CQP.
42
 *
43
 * @author mdecorde
44
 * Simple transformation of an XML-TEI-TXM file into a CQP file.
45
 */
46

    
47
class XMLTXM2CQP
48
{
49
        
50
        /** The url. */
51
        private def url;
52
        
53
        /** The input data. */
54
        private def inputData;
55
        
56
        /** The factory. */
57
        private def factory;
58
        
59
        /** The parser. */
60
        private XMLStreamReader parser;
61
        
62
        /** The output. */
63
        private def output;
64
        
65
        /** The hashmap of txm:form and txm:ana values and the attributes hash*/
66
        LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>();
67
        LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>();
68
        LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>();
69
        
70
        /** The balisesfound. */
71
        HashMap<String, List<String>> balisesfound;// = new HashMap<String, List<String>>();
72
        
73
        /** The balises to keep. */
74
        List<String> balisesToKeep;
75
        
76
        /** The send to p attributes. */
77
        HashMap <String, List<String>> sendToPAttributes;// = new HashMap<String, List<String>>();
78
        
79
        /** The injected p attributes. */
80
        List<String> injectedPAttributes = new ArrayList<String>();
81
        
82
        /** The default reference : a pattern + the properties to use */
83
        List<String> defaultReferences = new ArrayList<String>();
84
        String defaultReferencePattern;
85
        
86
        /** The injected p attributes values. */
87
        HashMap <String, String> injectedPAttributesValues;// = new ArrayList<String>();
88
        
89
        /** The addinfos. */
90
        boolean addinfos = false;
91
        
92
        /** The txtname. */
93
        String txtname;
94
        
95
        /** The base. */
96
        String base;
97
        
98
        /** The project. */
99
        String project;
100
        
101
        /** The lang. */
102
        public String lang= "fr";
103
        public String currentForm;
104
        public String currentAna;
105
        
106
        /**
         * Stores the given language code in the {@code lang} field.
         *
         * @param lang the language code to store
         * @return the value that has just been assigned
         */
        public setLang(String lang)
        {
                // the assignment is the method result, as with Groovy's implicit return
                return (this.lang = lang);
        }
116
        
117
        /**
         * Opens the given URL and creates a StAX stream reader on its content.
         *
         * Failures are reported on standard output and are not propagated: when
         * the stream cannot be opened or parsed, {@code parser} is left unset.
         *
         * @param url the location of the XML document to read
         */
        public XMLTXM2CQP(URL url){
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // the stream was opened but no reader could be built on it:
                        // release it instead of leaking the handle
                        try {
                                inputData.close();
                        } catch (IOException ignored) {
                                // nothing more can be done for a stream that is being abandoned
                        }
                        inputData = null;
                } catch (IOException ex) {
                        System.out.println("IOException while parsing "+url+" : "+ex);
                }
        }
137
        
138
        /**
         * Registers the identification values of the text and switches on
         * {@code addinfos}.
         *
         * @param name the text name, stored in {@code txtname}
         * @param base the base name
         * @param project the project name
         */
        public void setTextInfo(String name, String base, String project)
        {
                this.txtname = name;
                this.base = base;
                this.project = project;
                // the values above are only used once this flag is raised
                this.addinfos = true;
        }
152
        
153
        /**
         * Opens the UTF-8 writer stored in {@code output}.
         *
         * The file is opened in append mode when it already exists.
         *
         * @param outfile the file to write to
         * @return true if the writer has been created, false if any exception occurred
         */
        private boolean createOutput(File outfile)
        {
                boolean created = false;
                try {
                        boolean append = outfile.exists();
                        def stream = new FileOutputStream(outfile, append);
                        output = new BufferedWriter(new OutputStreamWriter(stream, "UTF-8"));
                        created = true;
                } catch (Exception e) {
                        System.err.println(e);
                }
                return created;
        }
169
        
170
        /** The haspb. */
171
        boolean haspb = false;
172
        
173
        /** The haslb. */
174
        boolean haslb = false;
175
        
176
        /**
         * Transforms the XML document opened by the constructor into a CQP
         * source file: one tab-separated line per word and one line per opening
         * or closing tag of the elements listed in {@code balisesToKeep}.
         *
         * Columns of a word line, in order: the "default" form, the other forms,
         * the word attributes (the word id), the structure attribute values
         * injected through {@code sendToPAttributes}, then the {@code ana} values.
         *
         * Once the output has been created, the output, the parser and the
         * input stream are always released before returning, so an instance can
         * process its document only once.
         *
         * NOTE(review): {@code formhash} and {@code anahash} are not cleared
         * between words, so a word lacking a form/ana type keeps the value of a
         * previous word - confirm this is intended (it keeps the column count
         * constant).
         *
         * @param outfile the file to write to
         * @return true if the whole document has been written, false otherwise
         */
        public boolean transformFile(File outfile)
        {
                if (balisesToKeep == null) {
                        println "no element has been defined to be keeped"
                        return false;
                }

                haspb = false;
                haslb = false;

                boolean flagAna = false;  // true while inside an <ana> that has a usable type
                boolean flagForm = false; // true while inside a <form>
                String vForm = "";
                String vAna = "";

                wordattributes = [:];
                balisesfound = new HashMap<String, List<String>>();

                if (!createOutput(outfile))
                        return false;

                if (sendToPAttributes != null) {
                        injectedPAttributesValues = [:];
                        for (String tag : sendToPAttributes.keySet()) {
                                for (String attr : sendToPAttributes.get(tag)) {
                                        String key = tag+attr;
                                        // the list is a field: do not declare a column twice
                                        if (!injectedPAttributes.contains(key))
                                                injectedPAttributes.add(key);
                                        // same placeholder as the one set when the structure closes,
                                        // so that words met before the structure do not get "null"
                                        injectedPAttributesValues.put(key, "N/A");
                                }
                        }
                }

                //output.write("<txmcorpus lang=\""+lang+"\">\n");
                balisesfound.put("txmcorpus", ["lang"]);
                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event == XMLStreamConstants.START_ELEMENT) {
                                        String localname = parser.getLocalName().toLowerCase();
                                        registerFoundElement(localname);

                                        switch (localname) {
                                                case "w": // get word id !!
                                                        wordattributes.put("id", parser.getAttributeValue(null, "id"));
                                                        break;

                                                case "form":
                                                        flagForm = true;
                                                        currentForm = parser.getAttributeValue(null, "type");
                                                        if (currentForm == null)
                                                                currentForm = "default";
                                                        vForm = "";
                                                        break;

                                                case "ana":
                                                        vAna = "";
                                                        currentAna = parser.getAttributeValue(null, "type");
                                                        // remove the leading # only when there is one
                                                        if (currentAna != null && currentAna.startsWith("#"))
                                                                currentAna = currentAna.substring(1);
                                                        // an <ana> without a usable type is ignored
                                                        flagAna = (currentAna != null && currentAna.length() > 0);
                                                        break;

                                                default:
                                                        storeInjectedValues(localname);
                                                        if (balisesToKeep.contains(localname))
                                                                writeStartTag(localname);
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) {
                                        String localname = parser.getLocalName().toLowerCase();
                                        switch (localname) {
                                                case "form":
                                                        if (flagForm)
                                                                formhash.put(currentForm, vForm);
                                                        flagForm = false;
                                                        break;

                                                case "ana":
                                                        if (flagAna)
                                                                anahash.put(currentAna, vAna);
                                                        flagAna = false;
                                                        break;

                                                case "w":
                                                        output.write(buildWordLine()+"\n");
                                                        break;

                                                default:
                                                        resetInjectedValues(localname);
                                                        if (balisesToKeep.contains(localname))
                                                                output.write("</"+localname+">\n");
                                        }
                                } else if (event == XMLStreamConstants.CHARACTERS) {
                                        if (flagForm)
                                                vForm += parser.getText().trim();
                                        if (flagAna)
                                                vAna += parser.getText().trim();
                                }
                        }
                        //output.write("</txmcorpus>\n");
                        // closed here (and not only in finally) so that a failing flush is reported
                        output.close();
                        return true;
                } catch (Exception ex) {
                        println "Error while parsing $url : "+ex
                        ex.printStackTrace();
                        return false;
                } finally {
                        closeQuietly(output);
                        closeQuietly(parser);
                        closeQuietly(inputData);
                }
        }

        /**
         * Records the current start element and its attribute names in
         * {@code balisesfound} when the element is one of the elements to keep.
         *
         * Attribute names are recorded in lower case because that is how
         * they are written in the output tags.
         *
         * @param localname the lower-cased local name of the current element
         */
        private void registerFoundElement(String localname)
        {
                if (!balisesToKeep.contains(localname))
                        return;

                List<String> attrlist = balisesfound.get(localname);
                if (attrlist == null) {
                        attrlist = [];
                        balisesfound.put(localname, attrlist);
                }
                for (int i = 0; i < parser.getAttributeCount(); i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        if (!attrlist.contains(attrname))
                                attrlist.add(attrname);
                }
        }

        /**
         * Stores the attribute values of the current start element that must be
         * injected in the word lines.
         *
         * Values are keyed by element name + attribute name, the key used by
         * {@code injectedPAttributes} to read them back.
         *
         * @param localname the lower-cased local name of the current element
         */
        private void storeInjectedValues(String localname)
        {
                if (sendToPAttributes == null || !sendToPAttributes.containsKey(localname))
                        return;

                List<String> attrs = sendToPAttributes.get(localname);
                for (int i = 0; i < parser.getAttributeCount(); i++) {
                        String attrname = parser.getAttributeLocalName(i);
                        if (attrs.contains(attrname))
                                injectedPAttributesValues.put(localname+attrname, parser.getAttributeValue(i));
                }
        }

        /**
         * Resets to "N/A" the injected values of the element being closed, under
         * the same element name + attribute name keys they were stored with.
         *
         * @param localname the lower-cased local name of the closed element
         */
        private void resetInjectedValues(String localname)
        {
                if (sendToPAttributes == null || !sendToPAttributes.containsKey(localname))
                        return;

                for (String attr : sendToPAttributes.get(localname))
                        injectedPAttributesValues.put(localname+attr, "N/A");
        }

        /**
         * Writes the opening tag of the current element with its attributes
         * (names in lower case, {@code &} and {@code "} escaped in values).
         *
         * When {@code addinfos} is set, a {@code text} element is completed with
         * the id, base and project attributes it does not already have, and
         * these attributes are declared in {@code balisesfound}.
         *
         * @param localname the lower-cased local name of the current element
         */
        private void writeStartTag(String localname)
        {
                output.write("<"+localname);

                List<String> written = [];
                for (int i = 0; i < parser.getAttributeCount(); i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        written.add(attrname);
                        output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("&", "&amp;").replace("\"", "&quot;")+"\"");
                }

                if (localname.equals("text") && addinfos) {
                        List<String> attrlist = balisesfound.get(localname);
                        Map<String, String> infos = ["id": txtname, "base": base, "project": project];
                        for (String name : infos.keySet()) {
                                if (written.contains(name))
                                        continue;
                                output.write(" "+name+"=\""+infos.get(name)+"\"");
                                // several <text> elements must not declare the attribute twice
                                if (!attrlist.contains(name))
                                        attrlist.add(name);
                        }
                }

                // finalize tag
                output.write(">\n");
        }

        /**
         * Builds the tab-separated line of the word that has just been closed.
         *
         * @return the word line, without the final line break
         * @throws IllegalStateException if no "default" form is available
         */
        private String buildWordLine()
        {
                String defaultForm = formhash.get("default");
                if (defaultForm == null)
                        throw new IllegalStateException("no default form found for word "+wordattributes.get("id"));

                List<String> columns = [];
                columns.add(defaultForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;"));

                for (String form : formhash.keySet()) // the other forms
                        if (form != "default")
                                columns.add(formhash.get(form));

                for (String type : wordattributes.keySet()) // only word id ?
                        columns.add(wordattributes.get(type));

                if (sendToPAttributes != null) // word attributes from structure properties
                        for (String pattr : injectedPAttributes)
                                columns.add(injectedPAttributesValues.get(pattr));

                for (String type : anahash.keySet()) // word annotations in txm:ana
                        columns.add(anahash.get(type));

                return columns.join("\t");
        }

        /**
         * Closes a resource, reporting a failure on standard error instead of
         * propagating it.
         *
         * @param resource an object with a close() method, may be null
         */
        private void closeQuietly(def resource)
        {
                if (resource == null)
                        return;
                try {
                        resource.close();
                } catch (Exception e) {
                        System.err.println("Error while closing a resource of $url : "+e);
                }
        }
398
        
399
        /**
         * Lists the names of the word properties: the keys of
         * {@code wordattributes}, then the injected structure properties (only
         * when {@code sendToPAttributes} is set), then the keys of {@code anahash}.
         *
         * @return a new list of property names
         */
        public List<String> getpAttributs()
        {
                List<String> names = new ArrayList<String>(wordattributes.keySet());
                if (sendToPAttributes != null)
                        names.addAll(this.injectedPAttributes);
                names.addAll(anahash.keySet());
                return names;
        }
422
        
423
        /**
         * Lists the elements recorded in {@code balisesfound}, one entry per
         * element: {@code name:+attr1+attr2} when attributes have been recorded
         * for it, the bare element name otherwise.
         *
         * @return a new list of structure declarations
         */
        public List<String> getsAttributs()
        {
                return this.balisesfound.keySet().collect { String balise ->
                        List<String> attrs = this.balisesfound.get(balise);
                        // every attribute name is prefixed with "+"
                        String suffix = attrs.collect { "+"+it }.join("");
                        (attrs.size() > 0) ? (balise+":"+suffix) : balise
                };
        }
445
        
446
        /**
         * Defines the elements to keep. A null list is refused: a warning is
         * printed and the current value is left untouched.
         *
         * @param balisesToKeep the names of the elements to keep
         */
        public void setBalisesToKeep(List<String> balisesToKeep)
        {
                if (balisesToKeep == null) {
                        println("Warning: the list of elements to keep is null")
                        return;
                }
                this.balisesToKeep = balisesToKeep;
        }
458
        
459
        /**
         * Sets the default reference: a pattern and the structure properties it
         * uses. A null pattern is ignored and leaves the current values untouched.
         *
         * NOTE(review): nothing in this class reads these two fields yet.
         *
         * @param pattern the reference pattern
         * @param strucProperties the structure properties used by the pattern;
         *        null is stored as an empty list
         */
        public void setDefaultReference(String pattern, List<String> strucProperties)
        {
                // test the argument, not the field: the field is null until it is set here
                if (pattern == null)
                        return;

                this.defaultReferencePattern = pattern;
                this.defaultReferences = (strucProperties != null) ? strucProperties : new ArrayList<String>();
        }
472
        
473
        /**
         * Defines, per element name, the attributes whose values are injected in
         * the word lines. A null map is refused: a warning is printed and the
         * current value is left untouched.
         *
         * @param sendus the attribute names to inject, keyed by element name
         */
        public void setSendToPAttributes(HashMap<String, List<String>> sendus)
        {
                if (sendus == null) {
                        println("Warning: the pAttributes to inject is null")
                        return;
                }
                this.sendToPAttributes = sendus;
        }
485
        
486
        
487
        /**
         * Manual test entry point: transforms one hard-coded XML file into a CQP
         * file written in a freshly recreated "out" directory, then prints the
         * structure and word properties that were found.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args) {

                String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER";

                File srcfile = new File(rootDir,"CORNEILLEP_AGESILAS_1666.xml");
                println srcfile.exists()

                // start from an empty output directory
                File outdir = new File(rootDir, "out");
                File cqpfile = new File(outdir, "CORNEILLEP_AGESILAS_1666.cqp");
                outdir.deleteDir()
                outdir.mkdir()

                System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile);
                // File.toURL() is deprecated and does not escape special characters
                def builder = new XMLTXM2CQP(srcfile.toURI().toURL());
                def balises = ["text", "s"];
                builder.setBalisesToKeep(balises);
                if (!builder.transformFile(cqpfile)) {
                        println("Error: the transformation of "+srcfile+" failed")
                        return;
                }

                println("SATTRIBUTS: "+builder.getsAttributs());
                println("PATTRIBUTS: "+builder.getpAttributs());
                return;
        }
512
}
513