Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xmltxm / compiler.groovy @ 479

History | View | Annotate | Download (12.9 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
// $LastChangedRevision: 3219 $
// $LastChangedBy: mdecorde $
//
28

    
29

    
30
package org.txm.importer.xmltxm;
31

    
32
import java.util.ArrayList;;
33

    
34
import org.txm.*;
35
import org.txm.importer.cwb.BuildCwbEncodeArgs;
36
import org.txm.importer.cwb.CwbEncode
37
import org.txm.importer.cwb.CwbMakeAll
38
import org.txm.importer.*;
39
import org.txm.scripts.*;
40
import org.txm.scripts.teitxm.BuildTTSrc;
41
import org.txm.scripts.teitxm.*;
42
import org.txm.utils.treetagger.TreeTagger;
43

    
44
import javax.xml.stream.*;
45
import java.net.URL;
46
import java.io.File;
47
import java.util.HashMap;
48
import java.util.List;
49

    
50
/**
 * Converts XML-TXM files into one WTC file (one token per line, structures
 * as tags) and indexes that file with the CWB tools cwb-encode and cwb-makeall.
 */
54
class compiler
55
{
56
        /** Value stored by {@link #setSortMetadata(String)}; not read anywhere else in this class. */
        String sortMetadata;

        /** When true, the CWB wrappers created in run() are switched to debug mode. */
        private boolean debug= false;

        /** Input stream opened on the XML-TXM file currently being read. */
        private def inputData;

        /** StAX factory used to create {@link #parser}. */
        private def factory;

        /** StAX reader over {@link #inputData}. */
        private XMLStreamReader parser;

        /** Not used by the code of this class. */
        private def dir;

        /** Writer on the WTC file; (re)opened by createOutput(File). */
        private Writer output;

        /** Location of the XML-TXM file given to the constructor. */
        private def url;

        /** Text identifier, written as the id attribute of the text structure. */
        String text="";

        /** Value written as the base attribute of the text structure. */
        String base="";

        /** Value written as the project attribute of the text structure. */
        String project="";

        /** Directory containing the cwb-encode and cwb-makeall executables. */
        String cwbLoc;

        /** Corpus language, written as the lang attribute of the txmcorpus structure. */
        private String lang ="fr";

        /** Structure name (lower case) to the attribute names seen on it; shared by all instances and reset by run(). */
        private static HashMap<String, List<String>> sAttribs;

        /** Annotation types found so far; their order is the order of the annotation columns of the WTC file. */
        private static anatypes = []

        /** Annotation type to annotation value for the word currently being read. */
        private static anavalues = [:]
99
        
100
        /**
         * Creates a compiler that is not bound to any input file.
         * Such an instance is meant to be configured with the setters and then
         * used through run(), which creates one bound compiler per input file.
         */
        public compiler() {}
105

    
106
        /**
         * Instantiates a compiler bound to one XML-TXM file.
         * The URL is opened and a StAX reader is created on it. A failure is
         * reported on standard output and leaves the reader unset; it is not
         * rethrown.
         *
         * @param url the location of the XML-TXM file to read
         * @param text the text identifier (written as the id attribute of the text structure)
         * @param base the value written as the base attribute of the text structure
         * @param project the value written as the project attribute of the text structure
         */
        public compiler(URL url,String text,String base, String project)
        {
                this.text = text
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (Exception ex) {
                        // this step opens the input; the former message wrongly mentioned index creation
                        System.out.println("Error while opening XML-TXM file $url: $ex");
                        ex.printStackTrace();
                        // the stream may already be open when the reader creation fails: do not leak it
                        if (inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                        // nothing more can be done for a stream that is being discarded
                                }
                                inputData = null;
                        }
                }
        }
130

    
131
        /**
         * Stores the sort metadata value.
         *
         * @param sortMetadata the value to store in the sortMetadata field
         */
        public void setSortMetadata(String sortMetadata)
        {
                this.sortMetadata = sortMetadata;
        }
135

    
136
        /**
         * Sets the language of the corpus.
         *
         * @param lang the language, written by run() as the lang attribute of the txmcorpus structure
         * @return the result of the assignment (the method has no declared return type)
         */
        public setLang(String lang)
        {
                this.lang = lang;
        }
146

    
147
        /**
         * Sets the directory that contains the CWB executables.
         * A warning is printed on standard error when the path does not exist;
         * the path is stored in either case.
         *
         * @param path the new cwb path
         */
        public void setCwbPath(String path)
        {
                if (!new File(path).exists()) {
                        System.err.println("CWB Path : "+path+" does not exists")
                }
                cwbLoc = path;
        }
158

    
159
        /**
         * Opens the UTF-8 writer stored in the output field.
         * The file is opened in append mode when it already exists and is
         * created otherwise. The caller is responsible for closing the writer.
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if opening failed (the error is printed)
         */
        private boolean createOutput(File f) {
                try {
                        // second argument of FileOutputStream: append only if the file is already there
                        boolean append = f.exists();
                        output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f, append)), "UTF-8");
                        return true;
                } catch (Exception e) {
                        println "Error while create WTC otput file: "+e
                        e.printStackTrace();
                        return false;
                }
        }
178

    
179
        /**
         * Advances the parser just past the end tag of the teiHeader element
         * (also accepted in its all-lower-case spelling "teiheader").
         *
         * @return true if the end tag was found, false if the end of the document was reached first
         */
        private boolean GoToText()
        {
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        if (event != XMLStreamConstants.END_ELEMENT) {
                                continue;
                        }
                        String name = parser.getLocalName();
                        if (name.equals("teiHeader") || name.equals("teiheader")) {
                                return true;
                        }
                }
                return false;
        }
192

    
193
        /**
         * Appends the content of the XML-TXM file bound to this compiler to the WTC file.
         * Everything up to the end of the teiHeader is skipped. After that, each
         * element found outside a word is written as a structure tag, and each
         * w element is written as one line: form, word id, then one column per
         * known annotation type.
         * The writer, the parser and the input stream are released in every case.
         *
         * @param wtcFile the WTC file to append to
         * @return true if the file was written; false if the input or the output could not
         *         be opened, if no teiHeader end tag was found, or if an error occurred while writing
         */
        public boolean transfomFileWtc(File wtcFile)
        {
                if (parser == null) {
                        // the constructor already reported why the input could not be opened
                        println "Error: no XML parser available for text '"+this.text+"'"
                        return false;
                }
                if (!createOutput(wtcFile)) {
                        return false;
                }

                try {
                        if (!GoToText()) {
                                println "Error: no teiHeader tag found in text '"+this.text+"' (please check file format or content)"
                                return false;
                        }
                        return writeWtcBody();
                } finally {
                        // Second close is a no-op after a successful run; it matters on the error paths.
                        try { output.close() } catch (Exception ignored) { }
                        try { parser.close() } catch (Exception ignored) { }
                        // XMLStreamReader.close() does not close the underlying stream
                        try { if (inputData != null) inputData.close() } catch (Exception ignored) { }
                }
        }

        /**
         * Reads the parser from its current position to the end of the document
         * and writes the corresponding WTC lines to the output writer.
         *
         * @return true on success, false if an exception occurred (it is printed)
         */
        private boolean writeWtcBody()
        {
                String vAna = "";
                String vForm = "";
                String wordid = "";
                String anatype = null;
                String anavalue = null;

                boolean flagForm = false; // inside the default form element of a word
                boolean flagAna = false;  // inside an annotation of a word
                boolean inW = false;      // inside a w element
                int wcounter = 1;         // used to build ids for words without an id attribute

                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                switch (parser.getLocalName()) {
                                                        case "w":
                                                                inW = true;
                                                                anavalues = [:]
                                                                wordid = parser.getAttributeValue(null, "id")
                                                                if (wordid == null)
                                                                        wordid = "w_"+text+"_"+(wcounter++)
                                                                vAna = "";
                                                                break;

                                                        case "form":
                                                                String type2 = parser.getAttributeValue(null, "type");
                                                                if (type2 == null || type2.equals("default")) {
                                                                        flagForm = true;
                                                                        vForm = "";
                                                                } else {
                                                                        // NOTE(review): typed forms only add a tab and register their type;
                                                                        // their text is not stored under that type — confirm this is intended
                                                                        flagAna = true;
                                                                        vAna += "\t";
                                                                        if (!anatypes.contains(type2))
                                                                                anatypes << type2;
                                                                }
                                                                break;

                                                        case "ana":
                                                                flagAna = true;
                                                                anavalue = "";
                                                                anatype = parser.getAttributeValue(null, "type");
                                                                // the type is a reference: drop its leading '#' if any
                                                                if (anatype != null && anatype.startsWith("#"))
                                                                        anatype = anatype.substring(1)
                                                                break;

                                                        default:
                                                                if (!inW) {
                                                                        String tag = parser.getLocalName();
                                                                        String ltag = tag.toLowerCase();
                                                                        output.write("<"+ltag);
                                                                        // sAttribs is keyed by the lower-cased name: test with the same key,
                                                                        // otherwise a mixed-case tag would get its list reset each time
                                                                        if (!sAttribs.containsKey(ltag))
                                                                                sAttribs.put(ltag, []);
                                                                        List<String> known = sAttribs.get(ltag);

                                                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                                String attrname = parser.getAttributeLocalName(i).toLowerCase();
                                                                                String attrvalue = parser.getAttributeValue(i);
                                                                                // the id of a text is replaced below by the text name
                                                                                if (!(tag == "text" && attrname == "id"))
                                                                                        output.write(" "+attrname+"=\""+attrvalue.replace("\"", "'")+"\"");

                                                                                if (!known.contains(attrname))
                                                                                        known.add(attrname)
                                                                        }

                                                                        if (tag == "text") {
                                                                                output.write(" id=\""+text+"\" base=\""+base+"\" project=\""+project+"\"");
                                                                        }
                                                                        output.write(">\n");
                                                                }
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                switch (parser.getLocalName()) {
                                                        case "TEI":
                                                                // never echoed as a closing structure tag
                                                                break;

                                                        case "w":
                                                                // one column per known annotation type, empty when the word has no value
                                                                for (String t : anatypes) {
                                                                        def v = anavalues.get(t);
                                                                        vAna += "\t" + (v != null ? v : "");
                                                                }
                                                                output.write( vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+vAna+"\n");
                                                                vAna = "";
                                                                vForm = "";
                                                                inW = false;
                                                                break;

                                                        case "form":
                                                                flagForm = false;
                                                                flagAna = false;
                                                                break;

                                                        case "ana":
                                                                anavalues.put(anatype, anavalue)
                                                                flagAna = false;
                                                                break;

                                                        default:
                                                                if (!inW)
                                                                        output.write("</"+parser.getLocalName().toLowerCase()+">\n");
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (inW) {
                                                        if (flagForm) {
                                                                vForm += parser.getText().trim();
                                                        } else if (flagAna) {
                                                                anavalue += parser.getText().trim();
                                                        }
                                                }
                                                break;
                                }
                        }

                        // closing here keeps a failing flush reported as an error
                        output.close();
                        parser.close();
                }
                catch (Exception ex) {
                        System.out.println("Error while writing WTC file $ex");
                        ex.printStackTrace();
                        return false;
                }
                return true;
        }
348

    
349

    
350

    
351
        /**
         * Builds the CWB corpus from a list of XML-TXM files.
         * Steps: reset the shared annotation/structure registries, collect the
         * annotation types of every file, write the WTC file (a txmcorpus
         * structure wrapping the content of each file), then call cwb-encode and
         * cwb-makeall.
         *
         * @param binDir the corpus binary directory; the wtc, data and registry directories are created in it
         * @param txmDir not used by this method
         * @param basename not used by this method
         * @param corpusname the corpus name (lower-cased for the WTC file name and the registry entry)
         * @param files the XML-TXM files to process; missing files are reported and skipped
         * @return true if the corpus was built, false on any reported error
         */
        public boolean run(File binDir, File txmDir, String basename, String corpusname, List<File> files)
        {
                // the registries are static: reset them once per corpus
                anatypes = new ArrayList<String>();
                anavalues = [:]
                sAttribs = new HashMap<String, List<String>>();
                String rootDir = binDir.getAbsolutePath();

                if (cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if (!new File(cwbLoc).exists()) {
                        println ("CWB path error: "+cwbLoc)
                        return false;
                }
                if (!new File(rootDir).exists()) {
                        println ("binary directory does not exists: "+rootDir)
                        return false;
                }

                File wtcFile = new File(binDir,"wtc/"+corpusname.toLowerCase()+".wtc");
                new File(rootDir,"/wtc/").deleteDir();
                new File(rootDir,"/wtc/").mkdir();
                new File(rootDir,"data/"+corpusname).deleteDir();
                // mkdirs: the parent "data" directory may not exist yet
                new File(rootDir,"data/"+corpusname).mkdirs();
                new File(rootDir,"registry/").mkdir();

                // get all anatypes; missing files are reported in the loop below, not here
                for (File f : files) {
                        if (f.exists())
                                getAnaTypes(f)
                }

                //0 set Lang
                if (!createOutput(wtcFile)) {
                        // without its opening txmcorpus tag the WTC file would be unusable
                        return false;
                }
                output.write("<txmcorpus lang=\""+lang+"\">\n");
                output.close();

                //1- Transform into WTC file
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                                continue;
                        }
                        // drop the 4-character extension (such as ".xml"); shorter names are kept as is
                        String filename = f.getName();
                        String txtname = filename.length() > 4 ? filename.substring(0, filename.length()-4) : filename;
                        def builder = new compiler(f.toURI().toURL(), txtname, corpusname.toLowerCase(), "default");
                        builder.setLang(lang);
                        if (!builder.transfomFileWtc(wtcFile))
                                return false;
                }

                //end corpus
                if (createOutput(wtcFile)) {
                        output.write("</txmcorpus>\n");
                        output.close();
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                // positional attributes: the word id followed by the annotation types
                List<String> pargs = ["id"];
                for (String ana : anatypes)
                        pargs.add(ana);

                // structural attributes: text and txmcorpus always carry the attributes written by this class
                List<String> sargs = [];
                if (sAttribs.containsKey("text")) {
                        for (String attr : ["id", "base", "project"]) {
                                if (!sAttribs.get("text").contains(attr))
                                        sAttribs.get("text").add(attr);
                        }
                } else {
                        sargs.add("text:0+id+base+project")
                }

                if (sAttribs.containsKey("txmcorpus")) {
                        if (!sAttribs.get("txmcorpus").contains("lang"))
                                sAttribs.get("txmcorpus").add("lang");
                } else {
                        sargs.add("txmcorpus:0+lang")
                }

                for (String tag : sAttribs.keySet()) {
                        String sAttr = tag;
                        if (sAttribs.get(tag).size() > 0)
                                sAttr += ":";
                        for (String attr : sAttribs.get(tag))
                                sAttr +="+"+attr;
                        sargs.add(sAttr)
                }

                String[] sAttributes = sargs;
                String[] pAttributes = pargs;
                println "sAttributes : "+sAttributes;
                println "pAttributes : "+pAttributes;
                try {
                        println cwbLoc
                        String regPath = rootDir + "/registry/"+corpusname.toLowerCase()
                        cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(),
                                rootDir + "/data/$corpusname",
                                wtcFile.getAbsolutePath(),
                                regPath, pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), corpusname, rootDir + "/registry");
                } catch (Exception ex) {
                        System.out.println("Error while creating indexes with CQP tools: $ex");
                        ex.printStackTrace();
                        return false;
                }

                return true;
        }
486

    
487
        /**
         * Switches debug mode on; run() forwards the flag to the CWB wrappers it creates.
         */
        public void setDebug()
        {
                this.debug = true;
        }
494

    
495
        /**
         * Sample entry point: builds the "geo" corpus from directories located
         * under the user's home directory.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                // java.io.File does not expand "~": resolve the home directory explicitly
                String home = System.getProperty("user.home");
                File dir = new File(home, "xml/geo");
                // NOTE(review): assumes the XML-TXM files are in a "txm" sub-directory — adjust to the local layout
                File txmDir = new File(dir, "txm");
                List<File> files = new ArrayList<File>();
                File[] found = txmDir.listFiles(); // null when the directory does not exist
                if (found != null)
                        files.addAll(Arrays.asList(found));

                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getPath());
                // the former call run(dir, "geo") matched no existing signature
                c.run(dir, txmDir, "geo", "geo", files);
        }
508
        
509
        /**
         * Registers in anatypes the annotation types used by one XML-TXM file.
         * The type of an annotation is the value of the type attribute of an ana
         * element, without its leading '#' when it has one (the same rule as the
         * one applied when the annotation values are read).
         *
         * @param xmlFile the XML-TXM file to scan
         */
        private void getAnaTypes(File xmlFile) {
                inputData = xmlFile.toURI().toURL().openStream();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        try {
                                HashSet<String> types = new HashSet<String>();
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (event != XMLStreamConstants.START_ELEMENT || !"ana".equals(parser.getLocalName()))
                                                continue;

                                        for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
                                                if ("type".equals(parser.getAttributeLocalName(i))) {
                                                        String value = parser.getAttributeValue(i);
                                                        // only strip a real '#': a blind substring(1) would mangle "pos" into "os"
                                                        if (value.startsWith("#"))
                                                                value = value.substring(1);
                                                        types.add(value);
                                                        break;
                                                }
                                        }
                                }

                                for (String type : types)
                                        if (!anatypes.contains(type))
                                                anatypes << type
                        } finally {
                                parser.close()
                        }
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream
                        inputData.close()
                }
        }
533
}