Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / XMLTXM2WTC.groovy @ 479

History | View | Annotate | Download (12.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer
29

    
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import java.util.HashMap;
34
import java.util.LinkedHashMap;
35
import javax.xml.stream.*;
36
import java.net.URL;
37
import org.txm.importer.filters.*;
38
// TODO: Auto-generated Javadoc
39

    
40
/**
41
 * The Class XMLTXM2WTC.
42
 *
43
 * @author mdecorde
44
 * Simple transformation of an XML-TEI-TXM file into a WTC file.
45
 */
46

    
47
class XMLTXM2WTC
48
{
49
        
50
        /** Source location handed to the constructor. */
        private def url;

        /** Stream opened on {@code url} by the constructor. */
        private def inputData;

        /** StAX input factory used by the constructor to create the parser. */
        private def factory;

        /** StAX reader positioned on the input stream. */
        private XMLStreamReader parser;

        /** Writer created by createOutput; receives the WTC content. */
        private def output;

        /** Text of the ana elements, keyed by their type attribute (leading '#' dropped). */
        LinkedHashMap<String, String> anahash = [:];

        /** Text of the form elements, keyed by their type attribute ("default" when absent). */
        LinkedHashMap<String, String> formhash = [:];

        /** Attributes read on the w element itself; transformFile stores the word id under "id". */
        LinkedHashMap<String, String> wordattributes = [:];

        /** Kept element name -> attribute names seen on it; rebuilt by each transformFile call. */
        HashMap<String, List<String>> balisesfound;

        /** Names of the elements that transformFile copies to the output; null until set. */
        List<String> balisesToKeep;

        /** Element name -> attribute names whose values are appended to every word line; null until set. */
        HashMap<String, List<String>> sendToPAttributes;

        /** Concatenated "element name + attribute name" entries built from sendToPAttributes. */
        List<String> injectedPAttributes = [];

        /** Structure properties of the default reference; only stored by this class. */
        List<String> defaultReferences = [];

        /** Pattern of the default reference; only stored by this class. */
        String defaultReferencePattern;

        /** Current value of each injected attribute; created by transformFile when sendToPAttributes is set. */
        HashMap<String, String> injectedPAttributesValues;

        /** True once setTextInfo has been called. */
        boolean addinfos = false;

        /** Text name given to setTextInfo. */
        String txtname;

        /** Base name given to setTextInfo. */
        String base;

        /** Project name given to setTextInfo. */
        String project;

        /** Language code, "fr" unless changed with setLang. */
        public String lang = "fr";

        /** Type of the form element being read by transformFile. */
        public String currentForm;

        /** Type of the ana element being read by transformFile. */
        public String currentAna;
105
        
106
        /**
         * Changes the language code held in the {@code lang} field.
         *
         * @param lang the new language code
         * @return the assigned value (the method has no declared return type, so
         *         Groovy returns the result of its last expression)
         */
        public setLang(String lang) {
                this.lang = lang;
        }
116
        
117
        /**
         * Opens {@code url} and creates a StAX reader on its content.
         *
         * I/O and XML stream errors are reported on standard output and are not
         * rethrown; when one occurs the {@code parser} field stays null.
         *
         * @param url location of the XML-TXM file to read
         */
        public XMLTXM2WTC(URL url) {
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while opening "+url+": "+ex);
                        // the stream is open but no reader will ever use it: release it now
                        try {
                                inputData.close();
                        } catch (IOException ignored) {
                                // the stream is being discarded, nothing more can be done
                        }
                } catch (IOException ex) {
                        // name the source and the cause so the failure can be diagnosed
                        System.out.println("IOException while opening "+url+": "+ex);
                }
        }
137
        
138
        /**
         * Records the text identification and switches {@code addinfos} on.
         *
         * With {@code addinfos} set, transformFile writes the id, base and project
         * attributes on "text" elements that do not already carry them.
         *
         * @param name the text name, used as the id attribute
         * @param base the base name
         * @param project the project name
         */
        public void setTextInfo(String name, String base, String project) {
                this.txtname = name;
                this.base = base;
                this.project = project;
                this.addinfos = true;
        }
152
        
153
        /**
         * Opens the UTF-8 writer stored in {@code output}.
         *
         * The file is opened in append mode when it already exists. Any exception
         * is printed on standard error and turned into a {@code false} result.
         *
         * @param outfile the file to write to
         * @return true when the writer was created, false otherwise
         */
        private boolean createOutput(File outfile) {
                boolean created;
                try {
                        boolean append = outfile.exists();
                        def stream = new FileOutputStream(outfile, append);
                        output = new BufferedWriter(new OutputStreamWriter(stream, "UTF-8"));
                        created = true;
                } catch (Exception e) {
                        System.err.println(e);
                        created = false;
                }
                return created;
        }
169
        
170
        /** Page-break flag; transformFile resets it to false and nothing else in this class uses it. */
        boolean haspb = false;

        /** Line-break flag; transformFile resets it to false and nothing else in this class uses it. */
        boolean haslb = false;

        /**
         * Streams the XML-TXM input and writes its WTC form to {@code outfile}.
         *
         * Every "w" element produces one tab-separated line (see buildWordLine).
         * Every element named in {@code balisesToKeep} is copied as a start and an
         * end tag, each on its own line. The attributes listed in
         * {@code sendToPAttributes} are remembered while their element is open and
         * appended to each word line; their value is "N/A" outside of the element.
         *
         * The writer, the parser and the input stream are released even when
         * parsing fails; parsing and write errors still propagate to the caller.
         *
         * NOTE(review): formhash and anahash are never cleared, so a word that
         * lacks a given form/ana type repeats the value of the previous word.
         *
         * @param outfile the WTC file to write
         * @return false when no element list was set, when the parser could not be
         *         created by the constructor or when the output cannot be opened;
         *         true otherwise
         */
        public boolean transformFile(File outfile)
        {
                if (balisesToKeep == null) {
                        println "no element has been defined to be keeped"
                        return false;
                }

                // the constructor swallows its errors: without a parser there is nothing to transform
                if (parser == null) {
                        println "the XML parser could not be created for "+url
                        return false;
                }

                haspb = false;
                haslb = false;

                boolean flagAna = false;
                boolean flagForm = false;
                String vForm = "";
                String vAna = "";

                wordattributes = [:];
                balisesfound = new HashMap<String, List<String>>();

                if (!createOutput(outfile))
                        return false;

                if (sendToPAttributes != null) {
                        injectedPAttributesValues = [:];
                        for (String tag : sendToPAttributes.keySet()) {
                                for (String attr : sendToPAttributes.get(tag)) {
                                        injectedPAttributes.add(tag+attr);
                                        // words met before the element is opened get the same marker
                                        // as words met after it is closed
                                        injectedPAttributesValues.put(tag+attr, "N/A");
                                }
                        }
                }

                //output.write("<txmcorpus lang=\""+lang+"\">\n");
                balisesfound.put("txmcorpus",["lang"]);

                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event == XMLStreamConstants.START_ELEMENT) {
                                        String localname = parser.getLocalName().toLowerCase();

                                        // we will only declare found tags in cwb registry
                                        recordFoundAttributes(localname);

                                        switch (localname) {
                                                case "w": // get word id !!
                                                        wordattributes.put("id", parser.getAttributeValue(null, "id"));
                                                        break;

                                                case "form":
                                                        flagForm = true;
                                                        currentForm = parser.getAttributeValue(null, "type");
                                                        if (currentForm == null)
                                                                currentForm = "default";
                                                        vForm = "";
                                                        break;

                                                case "ana":
                                                        vAna = "";
                                                        currentAna = parser.getAttributeValue(null, "type");
                                                        if (currentAna == null) {
                                                                // an ana without type cannot be named: ignore its content
                                                                flagAna = false;
                                                        } else {
                                                                flagAna = true;
                                                                // remove the leading # of the reference, when there is one
                                                                if (currentAna.startsWith("#"))
                                                                        currentAna = currentAna.substring(1);
                                                        }
                                                        break;

                                                default:
                                                        storeInjectedValues(localname);
                                                        if (balisesToKeep.contains(localname))
                                                                writeStructureStart(localname);
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) {
                                        String localname = parser.getLocalName().toLowerCase();
                                        switch (localname) {
                                                case "form":
                                                        if (flagForm)
                                                                formhash.put(currentForm, vForm);
                                                        flagForm = false;
                                                        break;

                                                case "ana":
                                                        if (flagAna)
                                                                anahash.put(currentAna, vAna);
                                                        flagAna = false;
                                                        break;

                                                case "w":
                                                        output.write(buildWordLine()+"\n");
                                                        break;

                                                default:
                                                        // reset structure properties, under the same tag+attr key
                                                        // that is used to store and to read them
                                                        if (sendToPAttributes != null && sendToPAttributes.containsKey(localname)) {
                                                                for (String attr : sendToPAttributes.get(localname))
                                                                        injectedPAttributesValues.put(localname+attr, "N/A");
                                                        }

                                                        if (balisesToKeep.contains(localname))
                                                                output.write("</"+localname+">\n");
                                        }
                                } else if (event == XMLStreamConstants.CHARACTERS) {
                                        if (flagForm)
                                                vForm += parser.getText().trim();
                                        if (flagAna)
                                                vAna += parser.getText().trim();
                                }
                        }
                        //output.write("</txmcorpus>\n");

                        // closed here so that a failing flush is reported to the caller
                        output.close();
                } finally {
                        // closing an already closed writer has no effect
                        try {
                                output.close();
                        } catch (IOException e) {
                                System.err.println(e);
                        }
                        try {
                                parser.close();
                        } catch (XMLStreamException e) {
                                System.err.println(e);
                        }
                        // XMLStreamReader.close() does not close the underlying stream
                        try {
                                if (inputData != null)
                                        inputData.close();
                        } catch (IOException e) {
                                System.err.println(e);
                        }
                }

                return true;
        }

        /**
         * Declares the attributes of the current start element in {@code balisesfound}
         * when the element is one of the kept ones. Names are lowercased, as they
         * are when the start tag is written.
         */
        private void recordFoundAttributes(String localname)
        {
                if (!balisesToKeep.contains(localname))
                        return;

                List<String> attrlist = balisesfound.get(localname);
                if (attrlist == null) {
                        attrlist = [];
                        balisesfound.put(localname, attrlist);
                }

                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        if (!attrlist.contains(attrname))
                                attrlist.add(attrname);
                }
        }

        /**
         * Remembers the values of the attributes of the current start element that
         * must be injected in the word lines, under the key "element name + attribute name".
         */
        private void storeInjectedValues(String localname)
        {
                if (sendToPAttributes == null || !sendToPAttributes.containsKey(localname))
                        return;

                List<String> attrs = sendToPAttributes.get(localname);
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrname = parser.getAttributeLocalName(i);
                        if (attrs.contains(attrname))
                                injectedPAttributesValues.put(localname+attrname, parser.getAttributeValue(i));
                }
        }

        /**
         * Writes the start tag of a kept element with lowercased attribute names;
         * double quotes inside values are replaced by single quotes. A "text"
         * element also receives the id, base and project given to setTextInfo
         * when it does not carry them.
         */
        private void writeStructureStart(String localname)
        {
                output.write("<"+localname);

                boolean idwritten = false;
                boolean basewritten = false;
                boolean projectwritten = false;
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        if (attrname == "id")
                                idwritten = true;
                        if (attrname == "base")
                                basewritten = true;
                        if (attrname == "project")
                                projectwritten = true;
                        output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("\"", "'")+"\"" );
                }

                if (localname.equals("text") && addinfos) {
                        // filled by recordFoundAttributes for every kept element
                        List<String> attrlist = balisesfound.get(localname);

                        if (!idwritten) {
                                output.write(" id=\""+txtname+"\"")
                                if (!attrlist.contains("id"))
                                        attrlist.add("id");
                        }
                        if (!basewritten) {
                                output.write(" base=\""+base+"\"");
                                if (!attrlist.contains("base"))
                                        attrlist.add("base");
                        }
                        if (!projectwritten) {
                                output.write(" project=\""+project+"\"");
                                if (!attrlist.contains("project"))
                                        attrlist.add("project");
                        }
                }

                // finalize tag
                output.write(">\n");
        }

        /**
         * Builds the line of the word that just ended: default form (with "&" and
         * "<" escaped), the other forms, the word attributes, the injected structure
         * attributes (when sendToPAttributes is set) and the ana values, separated
         * by tabulations.
         */
        private String buildWordLine()
        {
                String vWord = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;"); // get default form
                for (String form : formhash.keySet()) // and the others
                        if (form != "default")
                                vWord += "\t"+formhash.get(form);

                for (String type : wordattributes.keySet()) // only word id ?
                        vWord += "\t"+wordattributes.get(type)

                if (sendToPAttributes != null) // word attributes from structure properties
                        for (String pattr : injectedPAttributes)
                                vWord += "\t"+injectedPAttributesValues.get(pattr);

                for (String type : anahash.keySet()) // word annotations in txm:ana
                        vWord += "\t"+anahash.get(type)

                return vWord;
        }
391
        
392
        /**
         * Lists the positional attribute names, in this order: the word attribute
         * keys, the injected structure attributes (only when sendToPAttributes is
         * set) and the ana types collected so far.
         *
         * NOTE(review): transformFile also writes the non-default form values
         * between the default form and the word attributes, and they are not listed
         * here - confirm that callers account for them.
         *
         * @return a new list of attribute names
         */
        public List<String> getpAttributs() {
                def names = [];

                wordattributes.keySet().each { String key -> names.add(key) }

                if (sendToPAttributes != null) {
                        this.injectedPAttributes.each { String injected -> names.add(injected) }
                }

                anahash.keySet().each { String key -> names.add(key) }

                return names;
        }
415
        
416
        /**
         * Lists the structural attributes found by transformFile.
         *
         * Each entry is the element name, followed by ":" and its attribute names
         * each prefixed with "+" when the element has attributes (for instance
         * "text:+id+base"). The map of found elements is also printed on standard
         * output.
         *
         * @return a new list of declarations; empty when transformFile has not
         *         been run yet
         */
        public List<String> getsAttributs() {
                println balisesfound
                def sAttributs = [];

                // nothing has been collected before the first transformFile call
                if (balisesfound == null)
                        return sAttributs;

                for (String balise : balisesfound.keySet()) {
                        List<String> sAtt = balisesfound.get(balise);
                        if (sAtt == null || sAtt.isEmpty())
                                sAttributs.add(balise);
                        else
                                sAttributs.add(balise+":+"+sAtt.join("+"));
                }
                return sAttributs;
        }
439
        
440
        /**
         * Defines the elements that transformFile copies to the output.
         *
         * A null list is refused: a warning is printed and the current list is kept.
         *
         * @param balisesToKeep the names of the elements to keep
         */
        public void setBalisesToKeep(List<String> balisesToKeep) {
                if (balisesToKeep == null) {
                        println("Warning: the list of elements to keep is null")
                        return;
                }
                this.balisesToKeep = balisesToKeep;
        }
452
        
453
        /**
         * Sets the default reference: a pattern and the structure properties it uses.
         *
         * The values are only stored; nothing else in this class reads them yet.
         * A null pattern is refused: a warning is printed and the current settings
         * are kept.
         *
         * @param pattern the reference pattern
         * @param strucProperties the structure properties used by the pattern;
         *        null is stored as an empty list
         */
        public void setDefaultReference(String pattern, List<String> strucProperties)
        {
                // test the argument, not the field: the field is null until this method succeeds
                if (pattern == null) {
                        println("Warning: the default reference pattern is null")
                        return;
                }
                this.defaultReferencePattern = pattern;
                this.defaultReferences = (strucProperties != null) ? strucProperties : new ArrayList<String>();
        }
466
        
467
        
468
        /**
         * Defines the structure attributes whose values are appended to every word line.
         *
         * A null map is refused: a warning is printed and the current setting is kept.
         *
         * @param sendus element name -> names of the attributes to inject
         */
        public void setSendToPAttributes(HashMap<String, List<String>> sendus) {
                if (sendus == null) {
                        println("Warning: the pAttributes to inject is null")
                        return;
                }
                this.sendToPAttributes = sendus;
        }
480
        
481
        
482
        /**
         * Developer entry point: converts one hard-coded sample file and prints the
         * structural and positional attributes that were found. The "out"
         * directory next to the source file is deleted and recreated first.
         *
         * @param args not used
         */
        public static void main(String[] args) {

                String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER";

                File srcfile = new File(rootDir,"CORNEILLEP_AGESILAS_1666.xml");
                println srcfile.exists()
                File outdir = new File(rootDir,"out");
                File wtcfile = new File(outdir, "CORNEILLEP_AGESILAS_1666.wtc");
                outdir.deleteDir()
                outdir.mkdir()

                System.out.println("XMLTXM2WTC : "+srcfile+" >> "+wtcfile);
                // File.toURL() is deprecated because it does not escape special characters
                def builder = new XMLTXM2WTC(srcfile.toURI().toURL());
                def balises = ["text", "s"];
                builder.setBalisesToKeep(balises);
                if (!builder.transformFile(wtcfile)) {
                        println("XMLTXM2WTC : transformation failed");
                        return;
                }

                println("SATTRIBUTS: "+builder.getsAttributs());
                println("PATTRIBUTS: "+builder.getpAttributs());
        }
507
}
508