Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxm / compiler.groovy @ 1000

History | View | Annotate | Download (12.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $
27
//
28

    
29

    
30
package org.txm.scripts.importer.xmltxm;
31

    
32
import java.util.ArrayList;;
33

    
34
import org.txm.*;
35
import org.txm.core.engines.*;
36
import org.txm.importer.cwb.BuildCwbEncodeArgs;
37
import org.txm.importer.cwb.CwbEncode
38
import org.txm.importer.cwb.CwbMakeAll
39
import org.txm.scripts.importer.*;
40
import org.txm.scripts.*;
41
import org.txm.importer.scripts.xmltxm.*;
42
import org.txm.utils.treetagger.TreeTagger;
43

    
44
import javax.xml.stream.*;
45
import java.net.URL;
46
import java.io.File;
47
import java.util.HashMap;
48
import java.util.List;
49

    
50
// TODO: Auto-generated Javadoc
51
/**
52
 * Compiles XML-TXM files into a CQP source file and builds the CWB corpus indexes from it.
53
 */
54
class compiler
55
{
56
        String sortMetadata;
57
        /** Debug flag; run() forwards it to the CwbEncode and CwbMakeAll wrappers. */
58
        private boolean debug= false;
59

    
60
        /** Input stream opened on the XML source. */
61
        private def inputData;
62

    
63
        /** StAX input factory used to create the XML reader. */
64
        private def factory;
65

    
66
        /** StAX reader over the XML source. */
67
        private XMLStreamReader parser;
68

    
69
        /** NOTE(review): not referenced by any code visible in this class - confirm before removing. */
70
        private def dir;
71

    
72
        /** Writer on the CQP file, (re)opened by createOutput(). */
73
        private Writer output;
74

    
75
        /** URL of the XML source given to the constructor. */
76
        private def url;
77

    
78
        /** Text identifier: written as the id attribute of the text structure and used to build missing word ids. */
79
        String text="";
80

    
81
        /** Value written as the base attribute of the text structure. */
82
        String base="";
83

    
84
        /** Value written as the project attribute of the text structure. */
85
        String project="";
86

    
87
        /** Language code written as the lang attribute of the txmcorpus structure. */
88
        private String lang ="fr";
89

    
90
        /** Structure attributes found so far: lower-cased tag name to its attribute names. Static, reset by run(). */
91
        private static HashMap<String, List<String>> sAttribs;
92

    
93
        /** Annotation types found so far (one CQP column each) and, below, the type-to-value map of the word being read. Both static, reset by run(). */
94
        private static anatypes = []
95
        private static anavalues = [:]
96
        
97
        /**
         * Creates a compiler that is not bound to any XML source.
         */
        public compiler()
        {
        }

        /**
         * Creates a compiler bound to one XML document.
         *
         * A stream is opened on the URL and a StAX reader is created over it.
         * Any failure is printed on standard output and swallowed, so the
         * reader may be left unset afterwards.
         *
         * @param url location of the XML document to read
         * @param text text identifier (id attribute of the text structure, prefix of generated word ids)
         * @param base value of the base attribute of the text structure
         * @param project value of the project attribute of the text structure
         */
        public compiler(URL url, String text, String base, String project)
        {
                this.text = text;
                this.base = base;
                this.project = project;
                try {
                        this.url = url;
                        this.inputData = url.openStream();

                        this.factory = XMLInputFactory.newInstance();
                        this.parser = this.factory.createXMLStreamReader(this.inputData);
                }
                catch (Exception ex) {
                        System.out.println("Error while creating indexes: $ex");
                        ex.printStackTrace();
                }
        }
127

    
128
        /**
         * Stores the sort metadata value.
         * NOTE(review): the stored value is not read by any code visible in this class.
         *
         * @param sortMetadata the value to store
         */
        public void setSortMetadata(String sortMetadata) {
                this.sortMetadata = sortMetadata
        }
132

    
133
        /**
         * Sets the language of the corpus, written as the lang attribute of
         * the txmcorpus structure.
         *
         * @param lang the language code
         * @return the value that was assigned (the method is declared without a return type)
         */
        public setLang(String lang) {
                return (this.lang = lang);
        }
143

    
144
        /**
         * Opens the UTF-8 writer used to produce the CQP file.
         * The file is opened in append mode when it already exists.
         *
         * @param f the CQP file to write to
         * @return true if the writer was opened, false if an error occurred (it is printed)
         */
        private boolean createOutput(File f)
        {
                try {
                        boolean append = f.exists();
                        OutputStream sink = new BufferedOutputStream(new FileOutputStream(f, append));
                        output = new OutputStreamWriter(sink, "UTF-8");
                }
                catch (Exception e) {
                        println "Error while create CQP otput file: "+e
                        e.printStackTrace();
                        return false;
                }
                return true;
        }
163

    
164
        /**
         * Advances the XML reader just past the end tag of the TEI header.
         *
         * @return true once a closing teiHeader (or teiheader) tag has been read,
         *         false if the end of the document is reached first
         */
        private boolean GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.END_ELEMENT) {
                                String name = parser.getLocalName();
                                if (name.equals("teiHeader") || name.equals("teiheader"))
                                        return true;
                        }
                        event = parser.next();
                }
                return false;
        }
177

    
178
        /**
         * Appends the CQP representation of the current XML-TXM document to a file.
         *
         * Everything up to the end of the teiHeader element is skipped. After that,
         * each w element produces one line made of its default form, its id and one
         * tab-separated column per registered annotation type; any other element met
         * outside a word is copied as a lower-cased structure tag.
         * The writer, the XML reader and the input stream are released before
         * returning, whatever the outcome.
         *
         * @param cqpFile the CQP file to write to (appended to when it already exists)
         * @return true if the document was written; false if the reader or the writer
         *         could not be set up, if no teiHeader end tag was found, or if an
         *         error occurred while writing
         */
        public boolean transfomFileCqp(File cqpFile)
        {
                String vAna = "";
                String vForm = "";
                String wordid = "";
                String anatype = null;
                String anavalue = null;

                boolean flagForm = false;
                boolean flagAna = false;
                boolean inW = false;
                int wcounter = 1;

                try {
                        if (parser == null) {
                                // the constructor swallows its errors and may leave the reader unset
                                println "Error: no XML reader available for text '"+this.text+"' (the source could not be opened)"
                                return false;
                        }
                        if (!createOutput(cqpFile)) {
                                return false; // createOutput has already reported the cause
                        }
                        if (!GoToText()) {
                                println "Error: no teiHeader tag found in text '"+this.text+"' (please check file format or content)"
                                return false;
                        }

                        try {
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        switch (event) {
                                                case XMLStreamConstants.START_ELEMENT:
                                                        switch (parser.getLocalName()) {
                                                                case "w":
                                                                        inW = true;
                                                                        anavalues = [:]
                                                                        wordid = parser.getAttributeValue(null, "id")
                                                                        if (wordid == null)
                                                                                wordid = "w_"+text+"_"+(wcounter++)
                                                                        vAna = "";
                                                                        break;

                                                                case "form":
                                                                        String formType = parser.getAttributeValue(null, "type");
                                                                        if (formType == null || formType.equals("default")) {
                                                                                flagForm = true;
                                                                                vForm = "";
                                                                        } else {
                                                                                flagAna = true;
                                                                                vAna += "\t";
                                                                                if (!anatypes.contains(formType))
                                                                                        anatypes << formType;
                                                                        }
                                                                        break;

                                                                case "ana":
                                                                        flagAna = true;
                                                                        anavalue = "";
                                                                        anatype = parser.getAttributeValue(null, "type");
                                                                        // the type is a reference: drop its leading '#', if any
                                                                        if (anatype != null && anatype.startsWith("#"))
                                                                                anatype = anatype.substring(1)
                                                                        break;

                                                                default:
                                                                        if (!inW)
                                                                                writeStructureStartTag();
                                                        }
                                                        break;

                                                case XMLStreamConstants.END_ELEMENT:
                                                        switch (parser.getLocalName()) {
                                                                case "TEI":
                                                                        break;

                                                                case "w":
                                                                        // one column per known annotation type, empty when the word has no value for it
                                                                        for (String t : anatypes) {
                                                                                def v = anavalues.get(t);
                                                                                if (v != null) vAna += "\t"+v;
                                                                                else vAna += "\t";
                                                                        }
                                                                        output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;")+"\t"+wordid+vAna+"\n");
                                                                        vAna = "";
                                                                        vForm = "";
                                                                        inW = false;
                                                                        break;

                                                                case "form":
                                                                        flagForm = false;
                                                                        flagAna = false;
                                                                        break;

                                                                case "ana":
                                                                        anavalues.put(anatype, anavalue)
                                                                        flagAna = false;
                                                                        break;

                                                                default:
                                                                        if (!inW)
                                                                                output.write("</"+parser.getLocalName().toLowerCase()+">\n");
                                                        }
                                                        break;

                                                case XMLStreamConstants.CHARACTERS:
                                                        if (inW) {
                                                                if (flagForm) {
                                                                        vForm += parser.getText().trim();
                                                                } else if (flagAna) {
                                                                        anavalue += parser.getText().trim();
                                                                }
                                                        }
                                                        break;
                                        }
                                }

                                // closed here so that a failing flush is reported as an error
                                output.close();
                                parser.close();
                        }
                        catch (Exception ex) {
                                System.out.println("Error while writing CQP file $ex");
                                ex.printStackTrace();
                                return false;
                        }
                        return true;
                }
                finally {
                        closeResources();
                }
        }

        /**
         * Writes the start tag of the structure the reader is positioned on and
         * records its attribute names in sAttribs under the lower-cased tag name.
         * For a "text" element the source id attribute is not copied; id, base and
         * project are written from this compiler's own values instead.
         */
        private void writeStructureStartTag()
        {
                String localName = parser.getLocalName();
                String tagName = localName.toLowerCase();

                output.write("<"+tagName);
                // look the tag up with the same lower-cased key it is stored under,
                // otherwise the attribute list of a mixed-case tag is reset each time
                if (!sAttribs.containsKey(tagName))
                        sAttribs.put(tagName, []);
                List<String> knownAttributes = sAttribs.get(tagName);

                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        String attrvalue = parser.getAttributeValue(i);
                        if (!(localName == "text" && attrname == "id"))
                                output.write(" "+attrname+"=\""+attrvalue.replace("\"", "'")+"\"");

                        if (!knownAttributes.contains(attrname))
                                knownAttributes.add(attrname)
                }

                if (localName == "text") {
                        output.write(" id=\""+text+"\" base=\""+base+"\" project=\""+project+"\"");
                }
                output.write(">\n");
        }

        /**
         * Closes the writer, the XML reader and the input stream.
         * Failures are ignored: this runs as cleanup, after the outcome of the
         * transformation has already been decided.
         */
        private void closeResources()
        {
                try {
                        if (output != null) output.close();
                } catch (Exception ignored) {
                        // cleanup only; a write error has already been reported by the caller
                }
                try {
                        if (parser != null) parser.close();
                } catch (Exception ignored) {
                        // cleanup only
                }
                try {
                        // closing the StAX reader does not close the stream it reads from
                        if (inputData != null) inputData.close();
                } catch (Exception ignored) {
                        // cleanup only
                }
        }
        }
333

    
334

    
335

    
336
        /**
         * Builds a CWB corpus from a list of XML-TXM files.
         *
         * The shared annotation and structure registries are reset, the annotation
         * types of all files are collected, every file is appended to one CQP file
         * wrapped in a txmcorpus element, then CwbEncode and CwbMakeAll are run on it.
         * The cqp/ and data/&lt;corpusname&gt; directories under binDir are deleted and
         * re-created.
         *
         * @param binDir directory receiving the cqp/, data/ and registry/ directories
         * @param txmDir not used by this method
         * @param basename not used by this method
         * @param corpusname the corpus name; lower-cased for the CQP and registry file names
         * @param files the XML-TXM files to compile, in corpus order
         * @return true, if successful
         */
        public boolean run(File binDir, File txmDir, String basename, String corpusname, List<File> files)
        {
                anatypes = new ArrayList<String>();// init only 1 time
                anavalues = [:]
                sAttribs = new HashMap<String, List<String>>();// init only 1 time
                String rootDir = binDir.getAbsolutePath();

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }
                if (!new File(rootDir).exists()) {
                        println ("binary directory does not exists: "+rootDir)
                        return false;
                }

                File cqpFile = new File(binDir,"cqp/"+corpusname.toLowerCase()+".cqp");
                new File(rootDir,"/cqp/").deleteDir();
                new File(rootDir,"/cqp/").mkdir();
                new File(rootDir,"data/"+corpusname).deleteDir();
                // mkdirs: the parent data/ directory may not exist yet
                new File(rootDir,"data/"+corpusname).mkdirs();
                new File(rootDir,"registry/").mkdir();

                // get all anatypes
                for (File f : files) {
                        getAnaTypes(f)
                }

                //0 set Lang
                if (!appendToCqp(cqpFile, "<txmcorpus lang=\""+lang+"\">\n"))
                        return false;

                //1- Transform into CQP file
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                                continue;
                        }
                        String txtname = f.getName().substring(0,f.getName().length()-4);
                        compiler builder = new compiler(f.toURI().toURL(), txtname, corpusname.toLowerCase(), "default");
                        builder.setLang(lang);
                        if (!builder.transfomFileCqp(cqpFile))
                                return false;
                }

                //end corpus
                if (!appendToCqp(cqpFile, "</txmcorpus>\n"))
                        return false;

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                List<String> pargs = ["id"];
                pargs.addAll(anatypes);

                List<String> sargs = [];
                if (sAttribs.containsKey("text")) {
                        // these three are always written on text structures by transfomFileCqp
                        for (String required : ["id", "base", "project"])
                                if (!sAttribs.get("text").contains(required))
                                        sAttribs.get("text").add(required);
                } else {
                        sargs.add("text:0+id+base+project")
                }

                if (sAttribs.containsKey("txmcorpus")) {
                        if (!sAttribs.get("txmcorpus").contains("lang"))
                                sAttribs.get("txmcorpus").add("lang");
                } else {
                        sargs.add("txmcorpus:0+lang")
                }

                for (String tag : this.sAttribs.keySet()) {
                        String sAttr = tag;
                        if (sAttribs.get(tag).size() > 0)
                                sAttr += ":";
                        for (String attr : sAttribs.get(tag))
                                sAttr += "+"+attr;
                        sargs.add(sAttr)
                }

                String[] sAttributes = sargs;
                String[] pAttributes = pargs;
                println "sAttributes : "+sAttributes;
                println "pAttributes : "+pAttributes;
                try {
                        String regPath = rootDir + "/registry/"+corpusname.toLowerCase()
                        cwbEn.run(
                                rootDir + "/data/$corpusname",
                                cqpFile.getAbsolutePath(),
                                regPath, pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname, rootDir + "/registry");

                } catch (Exception ex) {
                        System.out.println("Error while creating indexes with CQP tools: $ex");
                        ex.printStackTrace();
                        return false;
                }

                return true;
        }

        /**
         * Appends one piece of text to the CQP file and closes the writer again.
         *
         * @param cqpFile the CQP file
         * @param content the text to append
         * @return false if the file could not be opened (the cause is printed by createOutput)
         */
        private boolean appendToCqp(File cqpFile, String content)
        {
                if (!createOutput(cqpFile))
                        return false;
                try {
                        output.write(content);
                } finally {
                        // always release the file, even when the write fails
                        output.close();
                }
                return true;
        }
467

    
468
        /**
         * Turns debug mode on. run() forwards the flag to the CwbEncode and
         * CwbMakeAll wrappers.
         */
        public void setDebug() {
                debug = true
        }
475

    
476
        /**
         * Manual entry point: compiles the XML-TXM files of one directory.
         *
         * @param args optional arguments:
         *        [0] binary directory (default: xml/geo under the user home),
         *        [1] directory holding the XML-TXM files (default: "txm" under the binary directory),
         *        [2] corpus name (default: "geo")
         */
        public static void main(String[] args)
        {
                int argc = (args == null) ? 0 : args.length;

                // "~" is not expanded by java.io.File: resolve the home directory explicitly
                File dir = (argc > 0) ? new File(args[0]) : new File(System.getProperty("user.home"), "xml/geo");
                // NOTE(review): the default "txm" sub-directory is an assumption about the layout - confirm
                File txmDir = (argc > 1) ? new File(args[1]) : new File(dir, "txm");
                String corpusname = (argc > 2) ? args[2] : "geo";

                File[] listed = txmDir.listFiles();
                if (listed == null) {
                        println "Error: cannot list directory: "+txmDir
                        return;
                }
                List<File> files = new ArrayList<File>(Arrays.asList(listed));
                Collections.sort(files); // listFiles() gives no ordering guarantee

                def c = new compiler();
                c.setDebug();
                c.run(dir, txmDir, corpusname, corpusname, files);
        }
489
        
490
        /**
         * Scans one XML file and registers the type of every ana element in the
         * shared anatypes list, skipping types that are already known.
         *
         * A leading '#' is removed from the type when present, as transfomFileCqp
         * does, so both methods agree on the type names. The file is read through
         * a local reader and stream, both closed before returning; the reader held
         * by this instance is left untouched.
         *
         * @param xmlFile the XML-TXM file to scan
         */
        private void getAnaTypes(File xmlFile) {
                Set<String> types = new HashSet<String>();

                InputStream xmlStream = xmlFile.toURI().toURL().openStream();
                try {
                        XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(xmlStream);
                        try {
                                for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
                                        if (event != XMLStreamConstants.START_ELEMENT || !"ana".equals(reader.getLocalName()))
                                                continue;

                                        for (int i = 0 ; i < reader.getAttributeCount(); i++) { // find @type
                                                if ("type".equals(reader.getAttributeLocalName(i))) {
                                                        String type = reader.getAttributeValue(i);
                                                        // strip the '#' only when it is there: an unconditional
                                                        // substring(1) truncates plain names and fails on ""
                                                        types.add(type.startsWith("#") ? type.substring(1) : type);
                                                        break;
                                                }
                                        }
                                }
                        } finally {
                                reader.close();
                        }
                } finally {
                        // closing the StAX reader does not close the stream it reads from
                        xmlStream.close();
                }

                for (String type : types)
                        if (!anatypes.contains(type))
                                anatypes << type
        }
514
}