Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxm / compiler.groovy @ 1804

History | View | Annotate | Download (13 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $
27
//
28

    
29

    
30
package org.txm.scripts.importer.xmltxm;
31

    
32
import java.util.ArrayList;;
33

    
34
import org.txm.*;
35
import org.txm.core.engines.*;
36
import org.txm.importer.cwb.BuildCwbEncodeArgs;
37
import org.txm.importer.cwb.CwbEncode
38
import org.txm.importer.cwb.CwbMakeAll
39
import org.txm.scripts.importer.*;
40
import org.txm.objects.*;
41
import org.txm.scripts.*;
42
import org.txm.importer.scripts.xmltxm.*;
43
import org.txm.utils.treetagger.TreeTagger;
44
import org.txm.searchengine.cqp.corpus.*
45
import javax.xml.stream.*;
46
import java.net.URL;
47
import java.io.File;
48
import java.util.HashMap;
49
import java.util.List;
50

    
51
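/**
 * Compiles XML-TXM files into a single CQP source file and builds the CWB
 * indexes of the corpus with CwbEncode and CwbMakeAll.
 */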
54
class compiler
55
{
56
        /** Value stored by setSortMetadata(); not read anywhere else in this class. */
        String sortMetadata;
57
        /** The debug. */
58
        private boolean debug= false;
59

    
60
        /** The input data. */
61
        private def inputData;
62

    
63
        /** The factory. */
64
        private def factory;
65

    
66
        /** The parser. */
67
        private XMLStreamReader parser;
68

    
69
        /** The dir. */
70
        private def dir;
71

    
72
        /** The output. */
73
        private Writer output;
74

    
75
        /** The url. */
76
        private def url;
77

    
78
        /** The text. */
79
        String text="";
80

    
81
        /** The base. */
82
        String base="";
83

    
84
        /** The project name, written as the project attribute of each text element. */
85
        String projectName="";
86

    
87
        /** The lang. */
88
        private String lang ="fr";
89

    
90
        /** The structural attributes found so far: lowercased element name -> attribute names. */
91
        private static HashMap<String, List<String>> sAttribs;
92

    
93
        /** The annotation types found so far; one column per type is written after the word id. */
94
        private static anatypes = []
95
        /** The annotation values of the current word, keyed by annotation type; reset at each w element. */
        private static anavalues = [:]
96
        
97
        /**
98
         * initialize.
99
         *
100
         */
101
        // Opens nothing: url, inputData, factory and parser stay unset.
        public compiler(){}
102

    
103
        /**
         * Instantiates a new compiler reading one XML-TXM file.
         *
         * The stream is opened and the StAX parser is created right away. A
         * failure is printed, not rethrown: the instance is then left without
         * a usable parser.
         *
         * @param url the location of the XML-TXM file to read
         * @param text the text identifier, written as the id attribute of the text element
         * @param base the base name, written as the base attribute of the text element
         * @param projectName the project name, written as the project attribute of the text element
         */
        public compiler(URL url,String text,String base, String projectName)
        {
                this.text = text;
                this.base = base;
                this.projectName = projectName;
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (Exception ex) {
                        // this step opens the XML stream, it does not create any index
                        System.out.println("Error while opening the XML stream of $url: $ex");
                        ex.printStackTrace();
                        // do not keep a half-opened stream when the parser could not be created
                        if (inputData != null) {
                                try { inputData.close(); } catch (Exception ignored) { }
                                inputData = null;
                        }
                }
        }
127

    
128
        /**
         * Sets the sortMetadata property.
         *
         * @param sortMetadata the value to store
         */
        public void setSortMetadata(String sortMetadata) {
                this.sortMetadata = sortMetadata
        }
132

    
133
        /**
         * Sets the language of the corpus, written as the lang attribute of the
         * txmcorpus element.
         *
         * @param lang the language
         * @return the assigned language (the method declares no return type, so the
         *         value of the assignment is returned)
         */
        public setLang(String lang) {
                return (this.lang = lang)
        }
143

    
144
        /**
         * Opens the UTF-8 writer stored in the output field.
         *
         * The file is opened in append mode when it already exists and is
         * created otherwise.
         *
         * @param f the CQP file to write to
         * @return true if the writer was opened, false if an error occurred
         */
        private boolean createOutput(File f) {
                try {
                        boolean append = f.exists();
                        def fileStream = new FileOutputStream(f, append);
                        def buffered = new BufferedOutputStream(fileStream);
                        output = new OutputStreamWriter(buffered, "UTF-8");
                } catch (Exception e) {
                        println "Error while create CQP otput file: "+e
                        e.printStackTrace();
                        return false;
                }
                return true;
        }
163

    
164
        /**
         * Advances the parser up to the end tag of the teiHeader (or teiheader) element.
         *
         * @return true when that end tag was reached, false if the end of the
         *         document came first
         */
        private boolean GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.END_ELEMENT) {
                                String name = parser.getLocalName();
                                if ("teiHeader".equals(name) || "teiheader".equals(name)) {
                                        return true;
                                }
                        }
                        event = parser.next();
                }
                return false;
        }
177

    
178
        /**
         * Appends the content of the current XML-TXM file to the CQP file.
         *
         * Everything up to the end of the teiHeader is skipped. After that, every
         * element found outside a w element is written as a structure tag
         * (lowercased element and attribute names), and every w element is
         * written as one line: form, word id, then one column per known
         * annotation type.
         *
         * The writer, the parser and the input stream are released on every exit
         * path.
         *
         * @param cqpFile the CQP file to append to
         * @return true if successful; false if the parser or the CQP file could
         *         not be opened, if no teiHeader was found or if an error
         *         occurred while writing
         */
        public boolean transfomFileCqp(File cqpFile)
        {
                String vAna = "";
                String vForm = "";
                String wordid = "";
                String anatype = null;
                String anavalue = null;

                boolean flagForm = false;
                boolean flagAna = false;
                boolean inW = false;
                int wcounter = 1;

                try {
                        if (parser == null) {
                                // the constructor failed (and reported it) or was never given a file
                                println "Error: no XML parser available for text '"+this.text+"'"
                                return false;
                        }
                        if (!createOutput(cqpFile)) {
                                return false; // createOutput already reported the error
                        }
                        if (!GoToText()) {
                                println "Error: no teiHeader tag found in text '"+this.text+"' (please check file format or content)"
                                return false;
                        }

                        try {
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        switch (event) {
                                                case XMLStreamConstants.START_ELEMENT:
                                                        switch (parser.getLocalName()) {
                                                                case "w":
                                                                        inW = true;
                                                                        anavalues = [:]
                                                                        wordid = parser.getAttributeValue(null, "id")
                                                                        if (wordid == null)
                                                                                wordid = "w_"+text+"_"+(wcounter++)

                                                                        vAna = "";
                                                                        break;

                                                                case "form":
                                                                        String type2 = parser.getAttributeValue(null, "type");
                                                                        if (type2 == null || type2.equals("default")) {
                                                                                flagForm = true;
                                                                                vForm = "";
                                                                        } else {
                                                                                flagAna = true;
                                                                                vAna += "\t";
                                                                                if (!anatypes.contains(type2))
                                                                                        anatypes << type2;
                                                                        }
                                                                        break;

                                                                case "ana":
                                                                        flagAna = true;
                                                                        anavalue = "";
                                                                        anatype = parser.getAttributeValue(null, "type");
                                                                        // the type is a reference: drop its leading '#' if any
                                                                        if (anatype != null && anatype.startsWith("#"))
                                                                                anatype = anatype.substring(1)
                                                                        break;

                                                                default:
                                                                        if (!inW) {
                                                                                String localName = parser.getLocalName();
                                                                                // structures are registered under their lowercased name, so
                                                                                // they must be looked up lowercased too: testing the raw name
                                                                                // reset the attribute list of mixed-case elements each time
                                                                                String tag = localName.toLowerCase();
                                                                                output.write("<"+tag);
                                                                                if (!sAttribs.containsKey(tag))
                                                                                        sAttribs.put(tag, []);

                                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                                                                                        String attrvalue = parser.getAttributeValue(i);
                                                                                        // the id of a text element is replaced by the one written below
                                                                                        if (!(localName == "text" && attrname == "id"))
                                                                                                output.write(" "+attrname+"=\""+attrvalue.replace("\"", "'")+"\"");

                                                                                        if (!sAttribs.get(tag).contains(attrname))
                                                                                                sAttribs.get(tag).add(attrname)
                                                                                }

                                                                                if (localName == "text") {
                                                                                        output.write(" id=\""+text+"\" base=\""+base+"\" project=\""+projectName+"\"");
                                                                                }
                                                                                output.write(">\n");
                                                                        }
                                                        }
                                                        break;

                                                case XMLStreamConstants.END_ELEMENT:
                                                        switch (parser.getLocalName()) {
                                                                case "TEI":
                                                                        break;

                                                                case "w":
                                                                        // one column per known annotation type, empty when the word has no value
                                                                        for (String t : anatypes) {
                                                                                def v = anavalues.get(t);
                                                                                if (v != null) vAna +="\t"+v;
                                                                                else vAna +="\t";
                                                                        }

                                                                        output.write( vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+vAna+"\n");
                                                                        vAna = "";
                                                                        vForm = "";
                                                                        inW = false;
                                                                        break;

                                                                case "form":
                                                                        flagForm = false;
                                                                        flagAna = false;
                                                                        break;

                                                                case "ana":
                                                                        anavalues.put(anatype, anavalue)
                                                                        flagAna = false;
                                                                        break;

                                                                default:
                                                                        if (!inW)
                                                                                output.write("</"+parser.getLocalName().toLowerCase()+">\n");
                                                        }
                                                        break;

                                                case XMLStreamConstants.CHARACTERS:
                                                        if (inW) {
                                                                if (flagForm) {
                                                                        vForm += parser.getText().trim();
                                                                } else if (flagAna) {
                                                                        anavalue += parser.getText().trim();
                                                                }
                                                        }
                                                        break;
                                        }
                                }

                                // closed here so that a failure while flushing is reported as an error
                                output.close();
                                if (parser != null) parser.close();
                                if (inputData != null) inputData.close();
                                return true;
                        } catch (Exception ex) {
                                System.out.println("Error while writing CQP file $ex");
                                ex.printStackTrace();
                                return false;
                        }
                } finally {
                        // safety net for the error paths; closing twice is harmless
                        try { if (output != null) output.close(); } catch (Exception ignored) { }
                        try { if (parser != null) parser.close(); } catch (Exception ignored) { }
                        try { if (inputData != null) inputData.close(); } catch (Exception ignored) { }
                }
        }
336

    
337

    
338

    
339
        /**
         * Builds the CWB indexes of a corpus from XML-TXM files.
         *
         * Collects the annotation types of all the files, writes one CQP file
         * (a txmcorpus element wrapping the content of every file), then runs
         * CwbEncode and CwbMakeAll on it.
         *
         * @param project the project owning the corpus
         * @param binDir the directory receiving the cqp, data and registry directories
         * @param txmDir not used by this method
         * @param basename not used by this method
         * @param corpusname the corpus name, used to name the CQP, data and registry files
         * @param files the XML-TXM files to process
         * @return true, if successful
         */
        public boolean run(Project project, File binDir, File txmDir, String basename, String corpusname, List<File> files)
        {
                // static state shared with the per-file compiler instances created below
                anatypes = new ArrayList<String>();
                anavalues = [:]
                sAttribs = new HashMap<String, List<String>>();
                String rootDir = binDir.getAbsolutePath();

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }
                CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
                if (corpus != null) {
                        if (project.getDoUpdate()) {
                                corpus.clean(); // remove old files
                        } else {
                                corpus.delete(); // remove old files and TXMResult children
                        }
                } else {
                        corpus = new MainCorpus(project);
                        corpus.setID(project.getName());
                        corpus.setName(project.getName());
                }
                corpus.setDescription("Built with the XML-TXM import module");

                File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
                cqpFile.delete()
                new File(binDir,"cqp").mkdirs()
                new File(binDir,"data").mkdirs()
                new File(binDir,"registry").mkdirs()

                // get all anatypes
                for (File f : files) {
                        getAnaTypes(f)
                }

                //0 set Lang
                // without its opening tag the CQP file is unusable: stop here
                if (!appendToCqpFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
                        return false;
                }

                //1- Transform into CQP file
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                                continue;
                        }
                        // the text name is the file name without its last 4 characters
                        String txtname = f.getName().substring(0,f.getName().length()-4);
                        def builder = new compiler(f.toURI().toURL(), txtname, corpusname.toLowerCase(), "default");
                        builder.setLang(lang);
                        if (!builder.transfomFileCqp(cqpFile))
                                return false;
                }

                //end corpus
                if (!appendToCqpFile(cqpFile, "</txmcorpus>\n")) {
                        return false;
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                List<String> pargs = ["id"];
                pargs.addAll(anatypes);

                List<String> sargs = [];
                // the text and txmcorpus structures always carry the attributes written by this class
                if (sAttribs.containsKey("text")) {
                        List<String> textAttrs = sAttribs.get("text");
                        for (String required : ["id", "base", "project"]) {
                                if (!textAttrs.contains(required))
                                        textAttrs.add(required);
                        }
                } else {
                        sargs.add("text:0+id+base+project")
                }

                if (sAttribs.containsKey("txmcorpus")) {
                        if (!sAttribs.get("txmcorpus").contains("lang"))
                                sAttribs.get("txmcorpus").add("lang");
                } else {
                        sargs.add("txmcorpus:0+lang")
                }

                for (String tag : sAttribs.keySet()) {
                        String sAttr = tag;
                        if (sAttribs.get(tag).size() > 0)
                                sAttr += ":";
                        for (String attr : sAttribs.get(tag))
                                sAttr +="+"+attr;
                        sargs.add(sAttr)
                }

                String[] sAttributes = sargs;
                String[] pAttributes = pargs;
                println "sAttributes : "+sAttributes;
                println "pAttributes : "+pAttributes;
                try {
                        String regPath = rootDir + "/registry/"+corpusname.toLowerCase()
                        cwbEn.run(
                                rootDir + "/data/$corpusname",
                                cqpFile.getAbsolutePath(),
                                regPath, pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname, rootDir + "/registry");
                } catch (Exception ex) {
                        System.out.println("Error while creating indexes with CQP tools: $ex");
                        ex.printStackTrace();
                        return false;
                }

                return true;
        }

        /**
         * Appends a piece of text to the CQP file, then closes the writer.
         *
         * @param cqpFile the CQP file to append to
         * @param content the text to append
         * @return false if the file could not be opened, true otherwise
         */
        private boolean appendToCqpFile(File cqpFile, String content)
        {
                if (!createOutput(cqpFile)) {
                        return false;
                }
                try {
                        output.write(content);
                } finally {
                        output.close();
                }
                return true;
        }
478

    
479
        /**
         * Turns the debug flag on; run() passes it to CwbEncode and CwbMakeAll.
         */
        public void setDebug() {
                debug = true
        }

    
487
        /**
         * The main method.
         *
         * The compiler cannot be launched standalone: run() needs a TXM Project.
         * The former body called setCwbPath(String) and run(File, String), which
         * this class does not define, so it could only fail with a
         * MissingMethodException; the failure is now explicit.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                throw new UnsupportedOperationException("compiler cannot be run standalone: call run(Project, File, File, String, String, List<File>) from a TXM import script");
        }
500
        
501
        /**
         * Collects the annotation types declared in a XML-TXM file.
         *
         * Reads the type attribute of every ana element and adds each new value,
         * without its leading '#', to the shared anatypes list. The parser and
         * the stream are closed even when parsing fails.
         *
         * @param xmlFile the XML-TXM file to scan
         */
        private void getAnaTypes(File xmlFile) {
                HashSet<String> types = new HashSet<String>();
                try {
                        inputData = xmlFile.toURI().toURL().openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT || !"ana".equals(parser.getLocalName())) {
                                        continue; // only the start of ana elements is of interest
                                }
                                String type = parser.getAttributeValue(null, "type");
                                if (type == null) {
                                        continue;
                                }
                                // strip the '#' only when it is there, exactly as transfomFileCqp() does,
                                // so that column names and value keys always agree
                                if (type.startsWith("#")) {
                                        type = type.substring(1);
                                }
                                types.add(type);
                        }
                } finally {
                        try {
                                if (parser != null) parser.close();
                        } finally {
                                if (inputData != null) inputData.close();
                        }
                }

                for (String type : types)
                        if (!anatypes.contains(type))
                                anatypes << type
        }
527
}