Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / XMLTXM2WTC.groovy @ 1094

History | View | Annotate | Download (12.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer
29

    
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import java.util.HashMap;
34
import java.util.LinkedHashMap;
35
import javax.xml.stream.*;
36
import java.net.URL;
37
import org.txm.importer.filters.*;
38
// TODO: Auto-generated Javadoc
39

    
40
/**
41
 * The Class XMLTXM2CQP.
42
 *
43
 * @author mdecorde
44
 * Simple transformation of an XML-TEI-TXM file into a CQP file.
45
 */
46

    
47
class XMLTXM2CQP
48
{
49

    
50
        /** Location of the source document, as given to the constructor. */
        private def url;

        /** Stream opened on {@link #url}; it feeds {@link #parser}. */
        private def inputData;

        /** StAX input factory used to build {@link #parser}. */
        private def factory;

        /** Pull parser that reads the source document. */
        private XMLStreamReader parser;

        /** Writer on the output file; created by createOutput. */
        private def output;

        /** Annotation values of the current word, keyed by the "type" of the ana element without its leading '#'. */
        LinkedHashMap<String, String> anahash = [:];

        /** Form values of the current word, keyed by the "type" of the form element ("default" when absent). */
        LinkedHashMap<String, String> formhash = [:];

        /** Attributes taken from the w element itself; the visible code only stores "id". */
        LinkedHashMap<String, String> wordattributes = [:];

        /** For every kept element met while parsing, the attribute names seen on it. */
        HashMap<String, List<String>> balisesfound;

        /** Lower-case names of the elements that are copied to the output. */
        List<String> balisesToKeep;

        /** Element name to attribute names whose values are repeated on every word line. */
        HashMap <String, List<String>> sendToPAttributes;

        /** Column names (element name + attribute name) derived from {@link #sendToPAttributes}. */
        List<String> injectedPAttributes = [];

        /** Structure properties stored by setDefaultReference. */
        List<String> defaultReferences = [];

        /** Pattern stored by setDefaultReference. */
        String defaultReferencePattern;

        /** Latest value seen for each entry of {@link #injectedPAttributes}. */
        HashMap <String, String> injectedPAttributesValues;

        /** True once setTextInfo was called: missing id/base/project are then added to text elements. */
        boolean addinfos = false;

        /** Value written as the "id" of a text element that has none. */
        String txtname;

        /** Value written as the "base" of a text element that has none. */
        String base;

        /** Value written as the "project" of a text element that has none. */
        String project;

        /** Language code; defaults to "fr". */
        public String lang= "fr";

        /** Type of the form element currently being read. */
        public String currentForm;

        /** Type of the ana element currently being read. */
        public String currentAna;
105

    
106
        /**
         * Replaces the language code held in {@link #lang}.
         *
         * @param lang the new language code
         * @return the value that was just assigned; the method declares no
         *         return type, so the result of its last expression is returned
         */
        public setLang(String lang)
        {
                this.lang = lang
        }
116

    
117
        /**
         * Opens the source document and prepares a StAX parser on it.
         *
         * Failures are reported on standard output and do not propagate: the
         * instance is then left without a parser.
         *
         * @param url location of the XML-TXM document to convert
         */
        public XMLTXM2CQP(URL url){
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // the stream was opened but no parser owns it: release it now
                        try {
                                inputData.close();
                        } catch (IOException ignored) {
                                // nothing more can be done for a stream that failed to close
                        }
                        inputData = null;
                } catch (IOException ex) {
                        // name the document and the cause, otherwise the failure cannot be diagnosed
                        System.out.println("IOException while parsing " + url + " : " + ex);
                }
        }
137

    
138
        /**
         * Records the identification written on text elements that lack an
         * "id", "base" or "project" attribute, and switches that completion on.
         *
         * @param name value used for the missing "id" attribute
         * @param base value used for the missing "base" attribute
         * @param project value used for the missing "project" attribute
         */
        public void setTextInfo(String name, String base, String project)
        {
                this.txtname = name
                this.base = base
                this.project = project
                this.addinfos = true
        }
152

    
153
        /**
         * Opens the UTF-8 writer stored in {@link #output}.
         *
         * When the file already exists it is opened in append mode, so earlier
         * content is kept.
         *
         * @param outfile the file to write to
         * @return true when the writer was opened, false when any exception
         *         occurred (the exception is printed on standard error)
         */
        private boolean createOutput(File outfile)
        {
                try {
                        boolean append = outfile.exists();
                        def stream = new FileOutputStream(outfile, append);
                        output = new BufferedWriter(new OutputStreamWriter(stream, "UTF-8"));
                } catch (Exception e) {
                        System.err.println(e);
                        return false;
                }
                return true;
        }
169

    
170
        /** Reset to false at the start of each transformFile call; not read anywhere else in this class. */
        boolean haspb = false;

        /** Reset to false at the start of each transformFile call; not read anywhere else in this class. */
        boolean haslb = false;

        /**
         * Reads the source document and writes its CQP form to outfile.
         *
         * Every w element produces one tab-separated line: the default form,
         * the other forms, the word attributes, the injected structure
         * attributes (when setSendToPAttributes was called) and the
         * annotations. Elements listed in balisesToKeep are copied as opening
         * and closing tags with their attributes; their names and attribute
         * names are collected in balisesfound.
         *
         * The writer, the parser and the input stream are released whether
         * parsing succeeds or fails, so the instance can only be used once.
         *
         * @param outfile the file to write to (appended to when it exists)
         * @return true when the whole document was converted and the
         *         resources closed cleanly; false when no element to keep was
         *         defined, the output could not be opened or an error occurred
         */
        public boolean transformFile(File outfile)
        {
                if (balisesToKeep == null) {
                        println "no element has been defined to be keeped"
                        return false;
                }

                haspb = false;
                haslb = false;

                boolean flagAna = false;   // true while inside a typed ana element
                boolean flagForm = false;  // true while inside a form element
                String vForm = "";
                String vAna = "";

                wordattributes = [:];
                balisesfound = new HashMap<String, List<String>>();

                if (!createOutput(outfile))
                        return false;

                if (sendToPAttributes != null) {
                        injectedPAttributesValues = [:];
                        for (String tag : sendToPAttributes.keySet()) {
                                for (String attr : sendToPAttributes.get(tag)) {
                                        String key = tag + attr;
                                        // a repeated call must not declare the same column twice
                                        if (!injectedPAttributes.contains(key))
                                                injectedPAttributes.add(key);
                                        // words met before the structure opens get the same
                                        // placeholder as words met after it closes
                                        injectedPAttributesValues.put(key, "N/A");
                                }
                        }
                }

                // the enclosing txmcorpus element is not written here, only declared
                balisesfound.put("txmcorpus",["lang"]);

                boolean ok = false;
                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                String opened = parser.getLocalName().toLowerCase();

                                                // only the elements actually met are declared later on
                                                // NOTE(review): names are recorded with their original case
                                                // while the tag below is written with lower-cased attribute
                                                // names -- confirm which one the registry expects
                                                if (balisesToKeep.contains(opened)) {
                                                        if (!balisesfound.containsKey(opened)) {
                                                                balisesfound.put(opened, []);
                                                        }

                                                        List<String> known = balisesfound.get(opened);
                                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                String seen = parser.getAttributeLocalName(i);
                                                                if (!known.contains(seen))
                                                                        known.add(seen);
                                                        }
                                                }

                                                switch (opened) {
                                                        case "w": // keep the word id
                                                                wordattributes.put("id", parser.getAttributeValue(null, "id"));
                                                                break;

                                                        case "form":
                                                                flagForm = true;
                                                                currentForm = parser.getAttributeValue(null, "type");
                                                                if (currentForm == null)
                                                                        currentForm = "default";
                                                                vForm = "";
                                                                break;

                                                        case "ana":
                                                                flagAna = true;
                                                                vAna = "";

                                                                currentAna = parser.getAttributeValue(null, "type");
                                                                if (currentAna != null)
                                                                        currentAna = currentAna.substring(1) // remove the #
                                                                else
                                                                        flagAna = false; // an untyped annotation is ignored
                                                                break;

                                                        default:
                                                                if (sendToPAttributes != null && sendToPAttributes.keySet().contains(opened)) {
                                                                        List<String> wanted = sendToPAttributes.get(opened);
                                                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                                String attrName = parser.getAttributeLocalName(i);
                                                                                if (wanted.contains(attrName)) {
                                                                                        // same key as the one built from sendToPAttributes,
                                                                                        // so the value is found again when a word is written
                                                                                        injectedPAttributesValues.put(opened + attrName, parser.getAttributeValue(i))
                                                                                }
                                                                        }
                                                                }

                                                                if (balisesToKeep.contains(opened)) {
                                                                        output.write("<"+opened);

                                                                        boolean idwritten = false;
                                                                        boolean basewritten = false;
                                                                        boolean projectwritten = false;
                                                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                                String attrname = parser.getAttributeLocalName(i).toLowerCase();
                                                                                if (attrname == "id")
                                                                                        idwritten = true;
                                                                                if (attrname == "base")
                                                                                        basewritten = true;
                                                                                if (attrname == "project")
                                                                                        projectwritten = true;
                                                                                // double quotes would end the attribute value early
                                                                                output.write(" "+attrname+"=\""+parser.getAttributeValue(i).replace("\"", "'")+"\"" );
                                                                        }

                                                                        if (opened.equals("text") && addinfos) {
                                                                                List<String> declared = balisesfound.get(opened);

                                                                                // each added name is declared once, even when the
                                                                                // document holds several text elements
                                                                                if (!idwritten) {
                                                                                        output.write(" id=\""+txtname+"\"")
                                                                                        if (!declared.contains("id"))
                                                                                                declared.add("id");
                                                                                }
                                                                                if (!basewritten) {
                                                                                        output.write(" base=\""+base+"\"");
                                                                                        if (!declared.contains("base"))
                                                                                                declared.add("base");
                                                                                }
                                                                                if (!projectwritten) {
                                                                                        output.write(" project=\""+project+"\"");
                                                                                        if (!declared.contains("project"))
                                                                                                declared.add("project");
                                                                                }
                                                                        }

                                                                        output.write(">\n");
                                                                }
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                String closed = parser.getLocalName().toLowerCase();
                                                switch (closed) {
                                                        case "form":
                                                                if (flagForm)
                                                                        formhash.put(currentForm, vForm);
                                                                flagForm = false;
                                                                break;

                                                        case "ana":
                                                                if (flagAna)
                                                                        anahash.put(currentAna, vAna);
                                                                flagAna = false;
                                                                break;

                                                        case "w":
                                                                // NOTE(review): a word without a default form raises a
                                                                // NullPointerException here and aborts the whole file
                                                                String line = formhash.get("default").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
                                                                for (String form : formhash.keySet()) // the other forms
                                                                        if (form != "default")
                                                                                line += "\t"+formhash.get(form);

                                                                for (String type : wordattributes.keySet())
                                                                        line += "\t"+wordattributes.get(type)

                                                                if (sendToPAttributes != null) { // values copied from the enclosing structures
                                                                        for (String pattr : injectedPAttributes)
                                                                                line += "\t"+injectedPAttributesValues.get(pattr);
                                                                }

                                                                for (String type : anahash.keySet()) // annotations
                                                                        line += "\t"+anahash.get(type)

                                                                output.write(line+"\n");
                                                                break;

                                                        default:
                                                                // the structure is closed: following words no longer carry its values
                                                                if (sendToPAttributes != null && sendToPAttributes.keySet().contains(closed)) {
                                                                        for (String attr : sendToPAttributes.get(closed)) {
                                                                                injectedPAttributesValues.put(closed + attr, "N/A")
                                                                        }
                                                                }

                                                                if (balisesToKeep.contains(closed)) {
                                                                        output.write("</"+closed+">\n");
                                                                }
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (flagForm) {
                                                        vForm += parser.getText().trim();
                                                }
                                                if (flagAna) {
                                                        vAna += parser.getText().trim();
                                                }
                                                break;
                                }
                        }
                        ok = true;
                } catch (Exception ex) {
                        println "Error while parsing $url : "+ex
                        ex.printStackTrace();
                } finally {
                        // always release the handles, also when parsing failed half-way
                        ok = closeResources() && ok;
                }
                return ok;
        }

        /**
         * Closes the writer, the parser and the input stream, each one
         * independently of the others.
         *
         * @return false when the writer or the parser could not be closed
         */
        private boolean closeResources()
        {
                boolean closed = true;
                try {
                        if (output != null)
                                output.close();
                } catch (IOException ex) {
                        println "Error while parsing $url : "+ex
                        closed = false;
                }
                try {
                        if (parser != null)
                                parser.close();
                } catch (XMLStreamException ex) {
                        println "Error while parsing $url : "+ex
                        closed = false;
                }
                try {
                        // closing the parser does not close the stream it reads from
                        if (inputData != null)
                                inputData.close();
                } catch (IOException ex) {
                        println "Error while parsing $url : "+ex
                }
                return closed;
        }
394

    
395
        /**
         * Lists the names of the word-level columns, in the order they are
         * written on each word line after the forms: word attributes, then
         * the injected structure attributes (only when some were configured),
         * then the annotation types.
         *
         * @return a new list of column names
         */
        public List<String> getpAttributs()
        {
                def names = [];

                wordattributes.keySet().each { names.add(it) }

                if (sendToPAttributes != null)
                        this.injectedPAttributes.each { names.add(it) }

                anahash.keySet().each { names.add(it) }

                return names;
        }
418

    
419
        /**
         * Lists the structures met during the last transformFile call.
         *
         * A structure without attribute is rendered as its name; otherwise as
         * the name, a colon, then every attribute name preceded by '+', for
         * instance "text:+id+base".
         *
         * @return a new list of declarations; empty when transformFile has
         *         not run yet
         */
        public List<String> getsAttributs()
        {
                println balisesfound // debug trace kept from the original
                def declarations = [];

                // balisesfound is only created by transformFile
                if (this.balisesfound == null)
                        return declarations;

                for (String balise : this.balisesfound.keySet()) {
                        List<String> attrs = this.balisesfound.get(balise);
                        if (attrs.size() > 0)
                                declarations.add(balise + ":+" + attrs.join("+"));
                        else
                                declarations.add(balise);
                }
                return declarations;
        }
442

    
443
        /**
         * Defines which elements are copied to the output. A null list is
         * refused with a warning and the previous list is kept.
         *
         * @param balisesToKeep the names of the elements to keep
         */
        public void setBalisesToKeep(List<String> balisesToKeep)
        {
                if (balisesToKeep == null) {
                        println("Warning: the list of elements to keep is null")
                        return
                }
                this.balisesToKeep = balisesToKeep
        }
455

    
456
        /**
         * Stores the default reference: a pattern and the structure
         * properties it uses. The stored values are not read anywhere else in
         * this class.
         *
         * A null pattern is refused with a warning and the previous values
         * are kept.
         *
         * @param pattern the reference pattern
         * @param strucProperties the structure properties used by the
         *        pattern; null is stored as an empty list
         */
        public void setDefaultReference(String pattern, List<String> strucProperties)
        {
                // test the argument, not the field: the field starts out null
                if (pattern == null) {
                        println("Warning: the default reference pattern is null")
                        return
                }
                this.defaultReferences = (strucProperties != null) ? strucProperties : new ArrayList<String>();
                this.defaultReferencePattern = pattern;
        }
469

    
470

    
471
        /**
         * Defines, per element name, the attributes whose values are repeated
         * on every word line. A null map is refused with a warning and the
         * previous setting is kept.
         *
         * @param sendus element name to attribute names
         */
        public void setSendToPAttributes(HashMap<String, List<String>> sendus)
        {
                if (sendus == null) {
                        println("Warning: the pAttributes to inject is null")
                        return
                }
                this.sendToPAttributes = sendus
        }
483

    
484

    
485
        /**
         * Manual test: converts CORNEILLEP_AGESILAS_1666.xml into
         * out/CORNEILLEP_AGESILAS_1666.cqp and prints the structures and
         * word columns found. The "out" directory is deleted and recreated.
         *
         * @param args optional: args[0] is the directory holding the source
         *        file; the original developer path is used when absent
         */
        public static void main(String[] args) {

                String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER";
                if (args != null && args.length > 0)
                        rootDir = args[0];

                File srcfile = new File(rootDir,"CORNEILLEP_AGESILAS_1666.xml");
                println srcfile.exists()
                File cqpfile = new File(rootDir, "out/CORNEILLEP_AGESILAS_1666.cqp");
                new File(rootDir,"out").deleteDir()
                new File(rootDir,"out").mkdir()

                System.out.println("XMLTXM2CQP : "+srcfile+" >> "+cqpfile);
                // File.toURL() is deprecated: it does not escape characters that are illegal in URLs
                def builder = new XMLTXM2CQP(srcfile.toURI().toURL());
                def balises = ["text", "s"];
                builder.setBalisesToKeep(balises);
                builder.transformFile(cqpfile);

                println("SATTRIBUTS: "+builder.getsAttributs());
                println("PATTRIBUTS: "+builder.getpAttributs());
                return;
        }
510
}
511