Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxm / compiler.groovy @ 1688

History | View | Annotate | Download (12.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $
27
//
28

    
29

    
30
package org.txm.scripts.importer.xmltxm;
31

    
32
import java.util.ArrayList;;
33

    
34
import org.txm.*;
35
import org.txm.core.engines.*;
36
import org.txm.importer.cwb.BuildCwbEncodeArgs;
37
import org.txm.importer.cwb.CwbEncode
38
import org.txm.importer.cwb.CwbMakeAll
39
import org.txm.scripts.importer.*;
40
import org.txm.objects.*;
41
import org.txm.scripts.*;
42
import org.txm.importer.scripts.xmltxm.*;
43
import org.txm.utils.treetagger.TreeTagger;
44
import org.txm.searchengine.cqp.corpus.*
45
import javax.xml.stream.*;
46
import java.net.URL;
47
import java.io.File;
48
import java.util.HashMap;
49
import java.util.List;
50

    
51
/**
52
 * The Class compiler.
53
 */
54
class compiler
55
{
56
        // Value stored by setSortMetadata(); nothing else in this class reads it.
        String sortMetadata;

        // When true, the CWB wrappers created in run() are switched to debug mode.
        private boolean debug = false;

        // Raw byte stream of the XML-TXM file currently being read.
        private def inputData;

        // StAX factory used to build the parser.
        private def factory;

        // Pull parser positioned on the XML-TXM file currently being read.
        private XMLStreamReader parser;

        // Not assigned or read by the code of this class.
        private def dir;

        // Writer on the CQP file; (re)opened by createOutput().
        private Writer output;

        // Location of the XML-TXM file handed to the constructor.
        private def url;

        // Text identifier: written as the "id" of the <text> structure and used in generated word ids.
        String text = "";

        // Value written as the "base" attribute of the <text> structure.
        String base = "";

        // Value written as the "project" attribute of the <text> structure.
        String projectName = "";

        // Value written as the "lang" attribute of the <txmcorpus> structure.
        private String lang = "fr";

        // Structure tag name -> attribute names seen for it; static, so shared by every instance.
        private static HashMap<String, List<String>> sAttribs;

        // Ordered names of the word annotation types; one CQP column per entry. Static as well.
        private static anatypes = []

        // Annotation type -> value for the word being read; reset at each <w>.
        private static anavalues = [:]
96
        
97
        /**
         * Creates a compiler that is not attached to any input file:
         * no stream is opened and no parser is built.
         */
        public compiler() {
        }
102

    
103
        /**
         * Instantiates a new compiler bound to one XML-TXM file and opens a
         * StAX parser on it.
         *
         * Failures are reported on standard output and not rethrown; in that
         * case no parser is available ({@code parser} stays null) and the
         * stream that may already have been opened is released.
         *
         * @param url the location of the XML-TXM file to read
         * @param text the text identifier
         * @param base the value of the "base" attribute written on the text structure
         * @param projectName the value of the "project" attribute written on the text structure
         */
        public compiler(URL url, String text, String base, String projectName)
        {
                this.text = text;
                this.base = base;
                this.projectName = projectName;
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (Exception ex) {
                        System.out.println("Error while creating indexes: $ex");
                        ex.printStackTrace();
                        // Do not leave a half-initialised reader behind: release the
                        // stream when the parser could not be built on top of it.
                        if (inputData != null) {
                                try {
                                        inputData.close();
                                } catch (Exception ignored) {
                                        // nothing more can be done for a stream that fails to close
                                }
                                inputData = null;
                        }
                        parser = null;
                }
        }
127

    
128
        /**
         * Stores the given value in the {@code sortMetadata} property.
         *
         * @param sortMetadata the value to remember
         */
        public void setSortMetadata(String sortMetadata) {
                this.sortMetadata = sortMetadata
        }
132

    
133
        /**
         * Sets the language written in the "lang" attribute of the corpus structure.
         *
         * @param lang the language code
         * @return the value that was just assigned
         */
        public setLang(String lang) {
                // the method is untyped: the assigned value is what callers get back
                return (this.lang = lang)
        }
143

    
144
        /**
         * Opens the UTF-8 writer {@code output} on the given file.
         *
         * The file is opened in append mode when it already exists, so
         * successive calls on the same file keep adding to it.
         *
         * @param f the CQP file to write to
         * @return true if the writer was opened, false if an error occurred
         *         (the error is printed, not rethrown)
         */
        private boolean createOutput(File f) {
                OutputStream rawStream = null;
                try {
                        rawStream = new FileOutputStream(f, f.exists());
                        output = new OutputStreamWriter(new BufferedOutputStream(rawStream), "UTF-8");
                        return true;
                } catch (Exception e) {
                        println "Error while creating CQP output file: "+e
                        e.printStackTrace();
                        // release the file handle if it was opened before the failure
                        if (rawStream != null) {
                                try {
                                        rawStream.close();
                                } catch (Exception ignored) {
                                        // already reporting the original failure
                                }
                        }
                        return false;
                }
        }
163

    
164
        /**
         * Advances the parser until the closing tag of the TEI header
         * ("teiHeader" or "teiheader") has been consumed.
         *
         * @return true if such a closing tag was met, false if the end of the
         *         document was reached first
         */
        private boolean GoToText()
        {
                int evt = parser.next();
                while (evt != XMLStreamConstants.END_DOCUMENT) {
                        if (evt == XMLStreamConstants.END_ELEMENT) {
                                String closedTag = parser.getLocalName();
                                if ("teiHeader".equals(closedTag) || "teiheader".equals(closedTag)) {
                                        return true;
                                }
                        }
                        evt = parser.next();
                }
                return false;
        }
177

    
178
        /**
         * Writes the CQP representation of this text into the given file.
         *
         * The file is opened through createOutput(), i.e. appended to when it
         * already exists. The TEI header is skipped, then:
         * <ul>
         * <li>every element met outside a {@code w} element is written as a
         * lowercased structure tag with its attributes, and recorded in
         * {@code sAttribs}; the {@code text} element gets the id, base and
         * project values of this compiler instead of its own id;</li>
         * <li>every {@code w} element is written as one tab-separated line:
         * form, word id, then one column per entry of {@code anatypes}.</li>
         * </ul>
         * The writer, the parser and the input stream are always released
         * before returning.
         *
         * @param cqpFile the CQP file to append to
         * @return true if the text was written, false if the input or output
         *         could not be opened, no TEI header end was found, or an
         *         error occurred while writing (errors are printed)
         */
        public boolean transfomFileCqp(File cqpFile)
        {
                if (parser == null) {
                        // the constructor failed (and already reported why)
                        println "Error: no XML parser available for text '"+this.text+"'"
                        closeCqpResources();
                        return false;
                }
                if (!createOutput(cqpFile)) {
                        closeCqpResources();
                        return false;
                }

                try {
                        if (!GoToText()) {
                                println "Error: no teiHeader tag found in text '"+this.text+"' (please check file format or content)"
                                return false;
                        }
                        return writeCqpBody();
                } finally {
                        closeCqpResources();
                }
        }

        /**
         * Reads the parser up to the end of the document and writes the
         * matching CQP lines on {@code output}.
         *
         * @return true if everything was written and flushed, false if an
         *         error occurred (the error is printed)
         */
        private boolean writeCqpBody()
        {
                String vAna = "";
                String vForm = "";
                String wordid = "";
                String anatype = null;
                String anavalue = null;

                boolean flagForm = false;
                boolean flagAna = false;
                boolean inW = false;
                int wcounter = 1;

                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                switch (parser.getLocalName()) {
                                                        case "w":
                                                                inW = true;
                                                                anavalues = [:]
                                                                wordid = parser.getAttributeValue(null, "id")
                                                                if (wordid == null) // no id in the source: build one
                                                                        wordid = "w_"+text+"_"+(wcounter++)

                                                                vAna = "";
                                                                break;

                                                        case "form":
                                                                String type2 = parser.getAttributeValue(null, "type");
                                                                if (type2 == null || type2.equals("default")) {
                                                                        flagForm = true;
                                                                        vForm = "";
                                                                } else {
                                                                        flagAna = true;
                                                                        vAna += "\t";
                                                                        if (!anatypes.contains(type2))
                                                                                anatypes << type2;
                                                                }
                                                                break;

                                                        case "ana":
                                                                flagAna = true;
                                                                anavalue = "";
                                                                anatype = parser.getAttributeValue(null, "type");
                                                                if (anatype != null && anatype.startsWith("#"))
                                                                        anatype = anatype.substring(1)
                                                                break;

                                                        default:
                                                                if (!inW) {
                                                                        String localname = parser.getLocalName();
                                                                        // structure names are written and registered lowercased,
                                                                        // so the lookup must use the lowercased name too
                                                                        String tagname = localname.toLowerCase();
                                                                        output.write("<"+tagname);
                                                                        if (!sAttribs.containsKey(tagname))
                                                                                sAttribs.put(tagname, []);
                                                                        List<String> knownAttributes = sAttribs.get(tagname);

                                                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                                String attrname = parser.getAttributeLocalName(i).toLowerCase();
                                                                                String attrvalue = parser.getAttributeValue(i);
                                                                                // the id of <text> is replaced by the one of this compiler below
                                                                                if (!(localname == "text" && attrname == "id"))
                                                                                        output.write(" "+attrname+"=\""+attrvalue.replace("\"", "'")+"\"");

                                                                                if (!knownAttributes.contains(attrname))
                                                                                        knownAttributes.add(attrname)
                                                                        }

                                                                        if (localname == "text") {
                                                                                output.write(" id=\""+text+"\" base=\""+base+"\" project=\""+projectName+"\"");
                                                                        }
                                                                        output.write(">\n");
                                                                }
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                switch (parser.getLocalName()) {
                                                        case "TEI":
                                                                break;

                                                        case "w":
                                                                // one column per known annotation type, empty when the word has none
                                                                for (String t : anatypes) {
                                                                        def v = anavalues.get(t);
                                                                        if (v != null) vAna += "\t"+v;
                                                                        else vAna += "\t";
                                                                }

                                                                output.write( vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+vAna+"\n");
                                                                vAna = "";
                                                                vForm = "";
                                                                inW = false;
                                                                break;

                                                        case "form":
                                                                flagForm = false;
                                                                flagAna = false;
                                                                break;

                                                        case "ana":
                                                                anavalues.put(anatype, anavalue)
                                                                flagAna = false;
                                                                break;

                                                        default:
                                                                if (!inW)
                                                                        output.write("</"+parser.getLocalName().toLowerCase()+">\n");
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (inW) {
                                                        if (flagForm) {
                                                                vForm += parser.getText().trim();
                                                        } else if (flagAna) {
                                                                anavalue += parser.getText().trim();
                                                        }
                                                }
                                                break;
                                }
                        }

                        // closing here (and not only in the caller's cleanup) makes a
                        // failed flush visible as an error
                        output.close();
                } catch (Exception ex) {
                        System.out.println("Error while writing CQP file $ex");
                        ex.printStackTrace();
                        return false;
                }
                return true;
        }

        /**
         * Closes the writer, the parser and the input stream of this
         * compiler, ignoring the ones that are not set and any error raised
         * while closing.
         */
        private void closeCqpResources()
        {
                if (output != null) {
                        try {
                                output.close();
                        } catch (Exception ignored) {
                                // best effort cleanup
                        }
                }
                if (parser != null) {
                        try {
                                parser.close();
                        } catch (Exception ignored) {
                                // best effort cleanup
                        }
                }
                if (inputData != null) {
                        try {
                                inputData.close();
                        } catch (Exception ignored) {
                                // best effort cleanup
                        }
                }
        }
336

    
337

    
338

    
339
        /**
         * Builds the CWB corpus from a list of XML-TXM files.
         *
         * Steps: replace any previous corpus build of the project, collect
         * the word annotation types of every file, write the CQP file
         * ({@code <txmcorpus>} wrapper + one section per file), then call
         * cwb-encode and cwb-makeall through their wrappers.
         *
         * Files that do not exist are reported and skipped.
         *
         * @param project the project the corpus belongs to
         * @param binDir the directory receiving the "cqp", "data" and "registry" directories
         * @param txmDir not used by this method
         * @param basename not used by this method
         * @param corpusname the corpus name, used for the CQP, data and registry file names
         * @param files the XML-TXM files to compile, in order
         * @return true if the corpus was encoded and indexed, false otherwise
         */
        public boolean run(Project project, File binDir, File txmDir, String basename, String corpusname, List<File> files)
        {
                anatypes = new ArrayList<String>();// init only 1 time
                anavalues = [:]
                sAttribs = new HashMap<String, List<String>>();// init only 1 time
                String rootDir = binDir.getAbsolutePath();

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }
                CorpusBuild corpus = project.getCorpusBuild(project.getName());
                if (corpus != null) {
                        corpus.delete(); // remove old files
                }

                // make new one
                corpus = new MainCorpus(project);
                corpus.setID(project.getName());
                corpus.setName(project.getName());
                corpus.setDescription("Built with the XML-TXM import module");

                File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
                cqpFile.delete()
                new File(binDir,"cqp").mkdirs()
                new File(binDir,"data").mkdirs()
                new File(binDir,"registry").mkdirs()

                // get all anatypes; missing files are reported in the loop below
                for (File f : files) {
                        if (f.exists()) {
                                getAnaTypes(f)
                        }
                }

                //0 set Lang
                if (!createOutput(cqpFile)) {
                        return false;
                }
                try {
                        output.write("<txmcorpus lang=\""+lang+"\">\n");
                } finally {
                        output.close();
                }

                //1- Transform into CQP file
                def builder = null;
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                                continue;
                        }
                        // NOTE(review): assumes a 4-character file extension such as ".xml"
                        String txtname = f.getName().substring(0,f.getName().length()-4);
                        builder = new compiler(f.toURI().toURL(), txtname, corpusname.toLowerCase(), "default");
                        builder.setLang(lang);
                        if (!builder.transfomFileCqp(cqpFile))
                                return false;
                }

                //end corpus
                if (!createOutput(cqpFile)) {
                        return false; // the CQP file would miss its closing tag
                }
                try {
                        output.write("</txmcorpus>\n");
                } finally {
                        output.close();
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                // positional attributes: the word id followed by the annotation types
                List<String> pargs = ["id"];
                for (String ana : anatypes)
                        pargs.add(ana);

                // structural attributes: make sure the ones written by this class are declared
                List<String> sargs = [];
                if (sAttribs.containsKey("text")) {
                        if (!sAttribs.get("text").contains("id"))
                                sAttribs.get("text").add("id");
                        if (!sAttribs.get("text").contains("base"))
                                sAttribs.get("text").add("base");
                        if (!sAttribs.get("text").contains("project"))
                                sAttribs.get("text").add("project");
                } else {
                        sargs.add("text:0+id+base+project")
                }

                if (sAttribs.containsKey("txmcorpus")) {
                        if (!sAttribs.get("txmcorpus").contains("lang"))
                                sAttribs.get("txmcorpus").add("lang");
                } else {
                        sargs.add("txmcorpus:0+lang")
                }

                for (String tag : this.sAttribs.keySet()) {
                        String sAttr = tag;
                        if (sAttribs.get(tag).size() > 0)
                                sAttr += ":";
                        for (String attr : sAttribs.get(tag))
                                sAttr += "+"+attr;
                        sargs.add(sAttr)
                }

                String[] sAttributes = sargs;
                String[] pAttributes = pargs;
                println "sAttributes : "+sAttributes;
                println "pAttributes : "+pAttributes;
                try {
                        String regPath = rootDir + "/registry/"+corpusname.toLowerCase()
                        cwbEn.run(
                                rootDir + "/data/$corpusname",
                                cqpFile.getAbsolutePath(),
                                regPath, pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname, rootDir + "/registry");

                } catch (Exception ex) {
                        System.out.println("Error while creating indexes with CQP tools: $ex");
                        ex.printStackTrace();
                        return false;
                }

                return true;
        }
476

    
477
        /**
         * Turns the debug flag on; there is no way to turn it off again
         * through this method.
         */
        public void setDebug() {
                debug = true
        }
484

    
485
        /**
         * Manual test entry point working on hard-coded paths.
         *
         * NOTE(review): setCwbPath(...) and the two-argument run(...) used
         * below are not declared in this class as far as this file shows, so
         * this method presumably predates the current API -- confirm before
         * relying on it.
         *
         * @param args the command line arguments (ignored)
         */
        public static void main(String[] args)
        {
                File sourceDir = new File("~/xml/geo")
                def instance = new compiler()
                instance.setDebug()
                instance.setCwbPath("~/TXM/cwb/bin")
                instance.run(sourceDir, "geo")
        }
498
        
499
        /**
         * Scans an XML-TXM file and registers in {@code anatypes} the type of
         * every {@code ana} element it contains.
         *
         * A leading "#" is removed from the type, exactly as
         * transfomFileCqp() does when it stores the annotation values, so the
         * registered names match the ones used when the words are written.
         * {@code ana} elements without a type attribute are ignored.
         *
         * @param xmlFile the XML-TXM file to scan
         */
        private void getAnaTypes(File xmlFile) {
                HashSet<String> types = new HashSet<String>();

                // local reader: the parser of this instance is left untouched
                InputStream stream = xmlFile.toURI().toURL().openStream();
                XMLStreamReader reader = null;
                try {
                        reader = XMLInputFactory.newInstance().createXMLStreamReader(stream);
                        for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue;
                                if (!"ana".equals(reader.getLocalName())) continue;

                                // a null namespace means "any namespace"
                                String type = reader.getAttributeValue(null, "type");
                                if (type == null) continue;
                                if (type.startsWith("#"))
                                        type = type.substring(1);
                                types.add(type);
                        }
                } finally {
                        // release the file even when the XML is malformed
                        if (reader != null) {
                                try {
                                        reader.close();
                                } catch (Exception ignored) {
                                        // best effort cleanup
                                }
                        }
                        try {
                                stream.close();
                        } catch (Exception ignored) {
                                // best effort cleanup
                        }
                }

                for (String type : types)
                        if (!anatypes.contains(type))
                                anatypes << type
        }
525
}