Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / XMLTXM2WTC.groovy @ 187

History | View | Annotate | Download (12.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (Tue, 29 Mar 2016) $
25
// $LastChangedRevision: 3185 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer
29

    
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import java.util.HashMap;
34
import java.util.LinkedHashMap;
35
import javax.xml.stream.*;
36
import java.net.URL;
37
import org.txm.importer.filters.*;
38
// TODO: Auto-generated Javadoc
39

    
40
/**
 * The Class XMLTXM2WTC.
 *
 * Simple transformation of an XML-TEI-TXM file into a WTC file.
 *
 * @author mdecorde
 */
46

    
47
class XMLTXM2WTC
48
{
49
        
50
        /** Location of the XML-TXM source document, as given to the constructor. */
        private def url;

        /** Stream opened on the source URL by the constructor. */
        private def inputData;

        /** StAX input factory used to create the parser. */
        private def factory;

        /** Pull parser reading the source document. */
        private XMLStreamReader parser;

        /** Writer on the WTC file, created by createOutput(). */
        private def output;

        /** Annotation values read from "ana" elements, keyed by annotation type (leading '#' removed). */
        LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>();

        /** Form values read from "form" elements, keyed by form type ("default" when no type is given). */
        LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>();

        /** Attributes of the current word; the visible code only stores "id". */
        LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>();

        /** Kept elements met during the transformation, each with the attribute names seen on it. */
        HashMap<String, List<String>> balisesfound;

        /** Names of the elements that are copied to the output. */
        List<String> balisesToKeep;

        /** Element name to attribute names whose values are copied onto every word line. */
        HashMap <String, List<String>> sendToPAttributes;

        /** Column names (element name + attribute name) of the values injected on word lines. */
        List<String> injectedPAttributes = new ArrayList<String>();

        /** Structure properties of the default reference; stored only, not read by this class. */
        List<String> defaultReferences = new ArrayList<String>();

        /** Pattern of the default reference; stored only, not read by this class. */
        String defaultReferencePattern;

        /** Current value of each injected column, keyed like injectedPAttributes. */
        HashMap <String, String> injectedPAttributesValues;

        /** True once setTextInfo() has been called: missing id/base/project are then added to "text" elements. */
        boolean addinfos = false;

        /** Text name written as the "id" of "text" elements that have none. */
        String txtname;

        /** Base name written as the "base" of "text" elements that have none. */
        String base;

        /** Project name written as the "project" of "text" elements that have none. */
        String project;

        /** Language code, "fr" by default; the line that would write it to the output is commented out. */
        public String lang= "fr";

        /** Type of the "form" element read most recently. */
        public String currentForm;

        /** Type of the "ana" element read most recently. */
        public String currentAna;
105
        
106
        /**
         * Stores the language code in the {@code lang} field.
         *
         * @param lang the language code to store
         * @return the value that was assigned (the method declares no return type)
         */
        public setLang(String lang)
        {
                return (this.lang = lang);
        }
116
        
117
        /**
         * Opens the document at the given URL and creates a StAX parser on it.
         *
         * Errors are reported on standard output instead of being thrown. After an
         * error the parser field stays null, and the input stream, if it had been
         * opened, is closed again.
         *
         * @param url the location of the XML-TXM file to read
         */
        public XMLTXM2WTC(URL url){
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while parsing "+url+": "+ex);
                        // the stream is open but no parser will ever consume it: release it
                        try {
                                inputData?.close();
                        } catch (IOException ignored) {
                                // nothing more can be done for a stream that refuses to close
                        }
                        inputData = null;
                } catch (IOException ex) {
                        System.out.println("IOException while parsing "+url+": "+ex);
                }
        }
137
        
138
        /**
         * Registers the text name, base and project, and switches on their
         * injection: "text" elements lacking an id, base or project attribute
         * then receive these values when they are written.
         *
         * @param name the text name, used as the "id" value
         * @param base the base name, used as the "base" value
         * @param project the project name, used as the "project" value
         */
        public void setTextInfo(String name, String base, String project)
        {
                txtname = name;
                this.base = base;
                this.project = project;
                addinfos = true;
        }
152
        
153
        /**
         * Opens a buffered UTF-8 writer on the given file and stores it in the
         * output field. An existing file is opened in append mode, so its
         * content is kept; otherwise the file is created.
         *
         * @param outfile the file to write to
         * @return true if the writer was opened, false if an exception occurred
         *         (the exception is printed on standard error)
         */
        private boolean createOutput(File outfile)
        {
                boolean opened;
                try {
                        boolean append = outfile.exists();
                        def stream = new FileOutputStream(outfile, append);
                        output = new BufferedWriter(new OutputStreamWriter(stream , "UTF-8"));
                        opened = true;
                } catch (Exception e) {
                        System.err.println(e);
                        opened = false;
                }
                return opened;
        }
169
        
170
        /** Page-break flag; transformFile() resets it to false and the visible code never sets it to true. */
        boolean haspb = false;

        /** Line-break flag; transformFile() resets it to false and the visible code never sets it to true. */
        boolean haslb = false;
175
        
176
        /**
         * Reads the whole source document and writes its WTC form to the given file.
         *
         * Output format:
         * <ul>
         * <li>every element listed in balisesToKeep (other than "w", "form" and
         * "ana") is written as a start tag and an end tag, each on its own line,
         * with lowercased attribute names and double quotes in values replaced by
         * single quotes;</li>
         * <li>every "w" element is written, when it ends, as one tab-separated
         * line: default form (with &amp; and &lt; escaped), other forms, word
         * attributes (the id), injected structure attributes, annotations.</li>
         * </ul>
         *
         * Values of formhash and anahash are kept from one word to the next, so a
         * word lacking a form or annotation type seen earlier repeats the previous
         * value. An injected structure attribute is "N/A" outside its structure.
         *
         * The writer, the parser and the input stream are closed when the method
         * ends, whether it succeeds or throws.
         *
         * @param outfile the WTC file to write; appended to if it already exists
         * @return true if the document was transformed; false if no element to keep
         *         was defined, if no parser is available, or if the output could
         *         not be opened
         * @throws IllegalStateException if a word ends before any default form was read
         */
        public boolean transformFile(File outfile)
        {
                if (balisesToKeep == null) {
                        println "no element has been defined to be keeped"
                        return false;
                }
                if (parser == null) { // the constructor could not open or parse the source
                        println "no XML parser is available for "+url
                        return false;
                }

                haspb = false;
                haslb = false;

                boolean flagForm = false; // true while inside a "form" element
                boolean flagAna = false;  // true while inside an "ana" element with a usable type
                String vForm = "";
                String vAna = "";

                wordattributes = [:];
                balisesfound = new HashMap<String, List<String>>();

                if (!createOutput(outfile))
                        return false;

                try {
                        if (sendToPAttributes != null) {
                                injectedPAttributesValues = [:];
                                for (String tag : sendToPAttributes.keySet()) {
                                        for (String attr : sendToPAttributes.get(tag)) {
                                                injectedPAttributes.add(tag+attr);
                                                // words met before the structure opens get the same value as words met after it closes
                                                injectedPAttributesValues.put(tag+attr, "N/A");
                                        }
                                }
                        }

                        //output.write("<txmcorpus lang=\""+lang+"\">\n");
                        balisesfound.put("txmcorpus",["lang"]);

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event == XMLStreamConstants.START_ELEMENT) {
                                        String localname = parser.getLocalName().toLowerCase();
                                        recordFoundStructure(localname);

                                        switch (localname) {
                                                case "w": // get word id !!
                                                        wordattributes.put("id", parser.getAttributeValue(null, "id"));
                                                        break;

                                                case "form":
                                                        flagForm = true;
                                                        currentForm = parser.getAttributeValue(null, "type");
                                                        if (currentForm == null)
                                                                currentForm = "default";
                                                        vForm = "";
                                                        break;

                                                case "ana":
                                                        vAna = "";
                                                        currentAna = parser.getAttributeValue(null, "type");
                                                        // remove the leading # only when there is one
                                                        if (currentAna != null && currentAna.startsWith("#"))
                                                                currentAna = currentAna.substring(1);
                                                        // an annotation without a type name cannot become a column
                                                        flagAna = (currentAna != null && currentAna.length() > 0);
                                                        break;

                                                default:
                                                        storeInjectedValues(localname);
                                                        if (balisesToKeep.contains(localname))
                                                                writeStructureStart(localname);
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) {
                                        String localname = parser.getLocalName().toLowerCase();

                                        switch (localname) {
                                                case "form":
                                                        if (flagForm)
                                                                formhash.put(currentForm, vForm.trim());
                                                        flagForm = false;
                                                        break;

                                                case "ana":
                                                        if (flagAna)
                                                                anahash.put(currentAna, vAna.trim());
                                                        flagAna = false;
                                                        break;

                                                case "w":
                                                        output.write(buildWordLine()+"\n");
                                                        break;

                                                default:
                                                        resetInjectedValues(localname);
                                                        if (balisesToKeep.contains(localname))
                                                                output.write("</"+localname+">\n");
                                        }
                                } else if (event == XMLStreamConstants.CHARACTERS) {
                                        // the parser may deliver one text in several chunks: keep them
                                        // whole and trim only once the element ends
                                        if (flagForm)
                                                vForm += parser.getText();
                                        if (flagAna)
                                                vAna += parser.getText();
                                }
                        }
                        //output.write("</txmcorpus>\n");
                } finally {
                        try {
                                output.close();
                        } finally {
                                try {
                                        parser.close();
                                } finally {
                                        // closing the parser does not close the stream it reads from
                                        if (inputData != null)
                                                inputData.close();
                                }
                        }
                }

                return true;
        }

        /**
         * Registers the current start element and its attribute names in
         * balisesfound when the element is one of the elements to keep. Names are
         * lowercased, as they are when the tag is written.
         */
        private void recordFoundStructure(String localname)
        {
                if (!balisesToKeep.contains(localname))
                        return;

                List<String> attrlist = balisesfound.get(localname);
                if (attrlist == null) {
                        attrlist = [];
                        balisesfound.put(localname, attrlist);
                }
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        if (!attrlist.contains(attrname))
                                attrlist.add(attrname);
                }
        }

        /**
         * Remembers, for the current start element, the values of the attributes
         * that must be injected on word lines. Keys are built like the column
         * names of injectedPAttributes: element name + attribute name.
         */
        private void storeInjectedValues(String localname)
        {
                if (sendToPAttributes == null || !sendToPAttributes.containsKey(localname))
                        return;

                List<String> attrs = sendToPAttributes.get(localname);
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrname = parser.getAttributeLocalName(i);
                        if (attrs.contains(attrname))
                                injectedPAttributesValues.put(localname+attrname, parser.getAttributeValue(i));
                }
        }

        /** Sets the injected values of a structure back to "N/A" when the structure ends. */
        private void resetInjectedValues(String localname)
        {
                if (sendToPAttributes == null || !sendToPAttributes.containsKey(localname))
                        return;

                for (String attr : sendToPAttributes.get(localname))
                        injectedPAttributesValues.put(localname+attr, "N/A");
        }

        /**
         * Writes the start tag of the current element, followed by a line break.
         * When text information was given with setTextInfo(), a "text" element
         * also receives the id, base and project attributes it lacks.
         */
        private void writeStructureStart(String localname)
        {
                output.write("<"+localname);

                boolean idwritten = false;
                boolean basewritten = false;
                boolean projectwritten = false;
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        if (attrname == "id")
                                idwritten = true;
                        if (attrname == "base")
                                basewritten = true;
                        if (attrname == "project")
                                projectwritten = true;
                        output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("\"", "'")+"\"" );
                }

                if (localname.equals("text") && addinfos) {
                        if (!idwritten)
                                writeTextInfoAttribute("id", txtname);
                        if (!basewritten)
                                writeTextInfoAttribute("base", base);
                        if (!projectwritten)
                                writeTextInfoAttribute("project", project);
                }

                // finalize tag
                output.write(">\n");
        }

        /** Writes one added attribute of a "text" element and declares it once in balisesfound. */
        private void writeTextInfoAttribute(String name, String value)
        {
                output.write(" "+name+"=\""+value+"\"");
                List<String> attrlist = balisesfound.get("text");
                if (!attrlist.contains(name))
                        attrlist.add(name);
        }

        /**
         * Builds the tab-separated line of the word that just ended: default form,
         * other forms, word attributes, injected structure attributes, annotations.
         */
        private String buildWordLine()
        {
                String defaultForm = formhash.get("default");
                if (defaultForm == null)
                        throw new IllegalStateException("word without default form near line "+parser.getLocation().getLineNumber()+" of "+url);

                String vWord = defaultForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;"); // get default form
                for (String form : formhash.keySet()) // and the others
                        if (form != "default")
                                vWord += "\t"+formhash.get(form);

                for (String type : wordattributes.keySet()) // only word id ?
                        vWord += "\t"+wordattributes.get(type);

                if (sendToPAttributes != null) // word attributes from structure properties
                        for (String pattr : injectedPAttributes)
                                vWord += "\t"+injectedPAttributesValues.get(pattr);

                for (String type : anahash.keySet()) // word annotations in txm:ana
                        vWord += "\t"+anahash.get(type);

                return vWord;
        }
394
        
395
        /**
         * Lists the names of the word properties in this order: word attributes,
         * injected structure attributes (only when attributes to inject were
         * defined), then annotation types.
         *
         * @return a new list of property names
         */
        public List<String> getpAttributs()
        {
                List<String> names = new ArrayList<String>(wordattributes.keySet());

                if (sendToPAttributes != null)
                        names.addAll(injectedPAttributes);

                names.addAll(anahash.keySet());
                return names;
        }
418
        
419
        /**
         * Lists the structures recorded in balisesfound. A structure with
         * attributes is rendered as its name, a colon, then each attribute name
         * preceded by "+" (for instance "text:+id+base"); a structure without
         * attributes is rendered as its bare name.
         *
         * @return a new list with one entry per recorded structure
         */
        public List<String> getsAttributs()
        {
                List<String> declarations = new ArrayList<String>();
                for (Map.Entry<String, List<String>> found : balisesfound.entrySet()) {
                        List<String> attrs = found.getValue();
                        String declaration = found.getKey();
                        if (attrs.size() > 0) {
                                declaration += ":";
                                for (String attr : attrs)
                                        declaration += "+"+attr;
                        }
                        declarations.add(declaration);
                }
                return declarations;
        }
441
        
442
        /**
         * Defines the elements that are copied to the output. A null list is
         * refused with a warning and leaves the current list untouched.
         *
         * @param balisesToKeep the names of the elements to keep
         */
        public void setBalisesToKeep(List<String> balisesToKeep)
        {
                if (balisesToKeep == null) {
                        println("Warning: the list of elements to keep is null")
                        return;
                }
                this.balisesToKeep = balisesToKeep;
        }
454
        
455
        /**
         * Sets the default reference: a pattern and the structure properties it
         * uses. The values are only stored; nothing in this class reads them.
         * A null pattern is refused with a warning and leaves the current
         * reference untouched.
         *
         * @param pattern the pattern of the default reference
         * @param strucProperties the structure properties used by the pattern;
         *        null is stored as an empty list
         */
        public void setDefaultReference(String pattern, List<String> strucProperties)
        {
                if (pattern == null) {
                        println("Warning: the default reference pattern is null")
                        return;
                }
                this.defaultReferences = (strucProperties != null) ? strucProperties : new ArrayList<String>();
                this.defaultReferencePattern = pattern;
        }
468
        
469
        
470
        /**
         * Defines the structure attributes whose values are copied onto word
         * lines. A null map is refused with a warning and leaves the current
         * definition untouched.
         *
         * @param sendus element name mapped to the names of its attributes to inject
         */
        public void setSendToPAttributes(HashMap<String, List<String>> sendus)
        {
                if (sendus == null) {
                        println("Warning: the pAttributes to inject is null")
                        return;
                }
                this.sendToPAttributes = sendus;
        }
482
        
483
        
484
        /**
         * Command-line entry point: transforms "quick.xml" of a root directory
         * into "wtc/quick.wtc" of the same directory, keeping the "text", "lb"
         * and "pb" elements, then prints the structures and word properties found.
         * The "wtc" directory is deleted and created again first, because the
         * output file is appended to when it already exists.
         *
         * @param args optional; args[0] is the root directory, which defaults to
         *        "C:/Documents and Settings/H/xml/quick/txm"
         */
        public static void main(String[] args) {

                String rootDir = (args != null && args.length > 0) ? args[0] : "C:/Documents and Settings/H/xml/quick/txm";

                File srcfile = new File(rootDir,"quick.xml");
                File wtcdir = new File(rootDir,"wtc");
                File wtcfile = new File(wtcdir,"quick.wtc");
                wtcdir.deleteDir()
                wtcdir.mkdir()

                System.out.println("XMLTXM2WTC : "+srcfile+" >> "+wtcfile);
                // File.toURL() is deprecated and does not escape characters such as the
                // spaces of the default path; going through a URI does
                def builder = new XMLTXM2WTC(srcfile.toURI().toURL());
                def balises = ["text","lb","pb"];
                builder.setBalisesToKeep(balises);
                if (!builder.transformFile(wtcfile)) {
                        System.err.println("XMLTXM2WTC : the transformation of "+srcfile+" failed");
                        return;
                }

                println(builder.getsAttributs());
                println(builder.getpAttributs());
        }
508
}
509