Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / tmx / compiler.groovy @ 1000

History | View | Annotate | Download (17.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.tmx;
29

    
30
import org.txm.importer.cwb.BuildAlignOut;
31
import org.txm.importer.cwb.CwbAlign;
32
import org.txm.importer.cwb.CwbEncode
33
import org.txm.importer.cwb.CwbMakeAll
34
import org.txm.importer.cwb.PatchCwbRegistry;
35
import org.txm.scripts.importer.*;
36
import org.txm.scripts.*;
37
import org.txm.importer.scripts.xmltxm.*;
38
import org.txm.utils.logger.Log;
39
import org.txm.utils.treetagger.TreeTagger;
40

    
41
import javax.xml.stream.*;
42

    
43
import java.net.URL;
44
import java.io.File;
45
import java.io.IOException;
46
import java.util.ArrayList;
47
import java.util.HashMap;
48
import java.util.List;
49

    
50
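/**
 * Compiles the XML-TXM files of a TMX import: the files of each language
 * group are converted to one CQP source file, each CQP file is encoded as a
 * CWB corpus, and, when alignment has been requested with setAlignment(),
 * the resulting corpora are aligned with each other.
 */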
54
class compiler
55
{
56
        /** The debug. */
57
        private boolean debug= false;
58

    
59
        /** The input data. */
60
        private def inputData;
61

    
62
        /** The factory. */
63
        private def factory;
64

    
65
        /** The parser. */
66
        private XMLStreamReader parser;
67

    
68
        /** The dir. */
69
        private def dir;
70

    
71
        /** The output. */
72
        private def output;
73

    
74
        /** The url. */
75
        private def url;
76

    
77
        /** The text. */
78
        String text="";
79

    
80
        /** The base. */
81
        String base="";
82

    
83
        /** The project. */
84
        String project="";
85

    
86
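        /** True until the first word of the file being compiled has been written; analysis types are only recorded while it is set. */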
87
        static boolean firstWord = true;
88
        private def anaTypes = [];
89
        /**
         * Gives access to the analysis types recorded while compiling a file.
         *
         * @return the analysis type names, in the order they were recorded
         */
        public def getAnaTypes()
        {
                return anaTypes;
        }
90
        private HashMap<String,String> anahash = new HashMap<String,String>() ;
91

    
92
        private static SAttributesListener sattrsListener;
93
        private static HashMap<String,ArrayList<String>> structs;
94
        private static HashMap<String, Integer> structsProf;
95
        def headerAttrs; // only for TMX
96

    
97
        /** The tuprops. */
98
        List<String> tuprops;
99

    
100
        /** The corporanames. */
101
        List<String> corporanames = new ArrayList<String>();
102

    
103
        /** The align structure. */
104
        String alignStructure;
105

    
106
        /** The align attribute. */
107
        String alignAttribute;
108

    
109
        /** The do align. */
110
        boolean doAlign = false;
111
        public static HashMap<String, Integer> segs_id = [:]; // static so the id are uniques
112
        public static int seg_id = 0;
113

    
114
        /**
         * Creates a compiler without any input: no stream is opened and every
         * field keeps its declared default value.
         */
        public compiler()
        {
        }
119

    
120
        /**
         * Instantiates a compiler bound to one source file and opens a StAX
         * reader on it.
         *
         * The shared SAttributesListener is created on first use and re-attached
         * to the new parser on later instantiations. Failures are reported on the
         * console and may leave {@code parser} unset; no exception is propagated
         * for XML or IO errors.
         *
         * @param url the location of the XML file to read
         * @param text the value written to the id attribute of the text element
         * @param base the value written to the base attribute of the text element
         * @param project the value written to the project attribute of the text element
         * @param tuprops the names of the tu attributes to export
         */
        public compiler(URL url, String text, String base, String project, List<String> tuprops)
        {
                this.text = text
                this.base = base;
                this.project = project;
                this.tuprops = tuprops;
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        if (sattrsListener == null)
                                sattrsListener = new SAttributesListener(parser);
                        else
                                sattrsListener.start(parser)
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // The reader could not be created: nothing will ever read the
                        // stream, so release it instead of leaking the handle.
                        if (parser == null && inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                        // nothing more to do for a stream that never got used
                                }
                        }
                } catch (IOException ex) {
                        // Name the source and the cause; the bare message gave neither.
                        System.err.println("IOException while parsing " + url + ": " + ex);
                }
        }
151

    
152
        /**
         * Opens {@code output} as a UTF-8 writer on the given file.
         *
         * The file is opened in append mode when it already exists, so successive
         * calls on the same file add to its content.
         *
         * @param dirPathName the directory of the file
         * @param fileName the name of the file
         * @return true when the writer was opened, false when opening failed
         *         (the exception is printed on the error stream)
         */
        private boolean createOutput(String dirPathName, String fileName)
        {
                try {
                        File target = new File(dirPathName, fileName);
                        boolean append = target.exists();
                        FileOutputStream stream = new FileOutputStream(target, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                } catch (Exception e) {
                        System.err.println(e);
                        return false;
                }
                return true;
        }
171

    
172
        /**
         * Advances the parser just past the end tag of the teiHeader element, or
         * to the end of the document when there is none.
         */
        private void GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean closing = (event == XMLStreamConstants.END_ELEMENT);
                        if (closing && parser.getLocalName().equals("teiHeader")) {
                                return;
                        }
                        event = parser.next();
                }
        }
184

    
185
        /**
         * Converts the document read by {@code parser} into CQP lines written to
         * the given file (appended when the file already exists).
         *
         * The teiHeader is skipped first. The text, tu and seg elements are
         * written as structure tags with their attributes; each w element yields
         * one tab-separated line: form, word id, tu reference, then one column per
         * analysis type recorded on the first word of the file.
         *
         * The writer, the parser and the input stream are always released, even
         * when the conversion fails.
         *
         * @param dirPathName the directory of the CQP file
         * @param fileName the name of the CQP file
         * @return true if the whole document was converted; false if the input or
         *         the output could not be opened, or if an XML or IO error
         *         interrupted the conversion
         */
        public boolean transfomFileCqp(String dirPathName, String fileName)
        {
                // The constructor may leave the parser unset when the source cannot be read.
                if (parser == null) {
                        System.err.println("No XML parser available for " + url);
                        return false;
                }
                // createOutput has already printed the cause of a failure.
                if (!createOutput(dirPathName, fileName)) {
                        try {
                                parser.close();
                                if (inputData != null) inputData.close();
                        } catch (Exception ignored) {
                                // best effort: the conversion is abandoned anyway
                        }
                        return false;
                }

                boolean success = false;
                String vAna = "";
                String vForm = "";
                String wordid= "";

                boolean flagForm = false;
                boolean flagAna = false;

                String anaType;
                String tuRef;
                try {
                        GoToText();
                        firstWord = true;
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                sattrsListener.startElement(parser.getLocalName());
                                                switch (parser.getLocalName()) {
                                                        case "header": // capture header attributes
                                                                headerAttrs = [:];
                                                                for ( int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        headerAttrs[parser.getAttributeLocalName(i)] =  parser.getAttributeValue(i);
                                                                }
                                                                break;
                                                        case "text":
                                                                output.write("<text id=\""+text+"\" base=\""+base+"\"" +
                                                                " project=\""+project+"\"");
                                                                def textAttrs = [];
                                                                for ( int i = 0 ; i < parser.getAttributeCount() ; i++)
                                                                {
                                                                        String attrname = parser.getAttributeLocalName(i);
                                                                        String attrvalue = parser.getAttributeValue(i);
                                                                        // the id was already written from the text name
                                                                        if(attrname != "id")
                                                                                output.write(" "+attrname+"=\""+attrvalue+"\"")
                                                                        textAttrs << attrname
                                                                }
                                                                // header attributes complete the text ones without overriding them
                                                                if (headerAttrs != null)
                                                                        for( String key : headerAttrs.keySet()) {
                                                                                if (!textAttrs.contains(key))
                                                                                        output.write(" "+key+"=\""+headerAttrs[key]+"\"")
                                                                        }
                                                                output.write(">\n");
                                                                break;

                                                        case "tu":
                                                                output.write("<tu");
                                                                for( String attrname : tuprops)
                                                                {
                                                                        String attrvalue = parser.getAttributeValue(null, attrname);
                                                                        attrname = attrname.toLowerCase();
                                                                        if(attrvalue == null)
                                                                                output.write(" "+attrname+"=\"N/A\"")
                                                                        else
                                                                                output.write(" "+attrname+"=\""+attrvalue+"\"")
                                                                }
                                                                // words refer to the text name unless the tu carries a ref attribute
                                                                tuRef = text;
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        output.write(" "+parser.getAttributeLocalName(i)+"=\""+parser.getAttributeValue(i)+"\"")
                                                                        if (parser.getAttributeLocalName(i) == "ref")
                                                                                tuRef = parser.getAttributeValue(i);
                                                                }
                                                                output.write(">\n");
                                                                break;
                                                        case "seg":
                                                                output.write("<seg")
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        output.write(" "+parser.getAttributeLocalName(i)+"=\""+parser.getAttributeValue(i)+"\"")
                                                                }
                                                                output.write(">\n")
                                                                break;

                                                        case "w":
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++) {
                                                                        if(parser.getAttributeLocalName(i).equals("id")) {
                                                                                wordid = parser.getAttributeValue(i);
                                                                        }
                                                                }
                                                                break;
                                                        case "form":
                                                                flagForm = true;
                                                                vForm = "";
                                                                anahash.clear();
                                                                break;

                                                        case "ana":
                                                                anaType = parser.getAttributeValue(null, "type")
                                                                if (anaType == null) {
                                                                        // No type attribute: the value cannot be mapped to a
                                                                        // column, so its content is ignored.
                                                                        flagAna = false;
                                                                } else {
                                                                        flagAna = true;
                                                                        // drop the first character of the type value
                                                                        if (anaType.length() > 0) anaType = anaType.substring(1);
                                                                        anahash.put(anaType, "");
                                                                        if (firstWord) {
                                                                                anaTypes << anaType;
                                                                        }
                                                                }
                                                                break;
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                sattrsListener.endElement(parser.getLocalName());
                                                switch (parser.getLocalName())
                                                {
                                                        case "text":
                                                                output.write("</text>\n");
                                                                break;

                                                        case "tu":
                                                                output.write("</tu>\n")
                                                                break;
                                                        case "seg":
                                                                output.write("</seg>\n")
                                                                break;

                                                        case "w":
                                                                firstWord = false;
                                                                output.write( vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;")
                                                                         +"\t"+wordid+"\t"+tuRef);
                                                                // NOTE(review): a word lacking one of the recorded types
                                                                // gets the text "null" in that column.
                                                                for(String type : anaTypes) {
                                                                        output.write("\t"+anahash.get(type));
                                                                }
                                                                output.write("\n")

                                                                vForm = "";
                                                                break;

                                                        case "form":
                                                                flagForm = false;
                                                                break;

                                                        case "ana":
                                                                if (anaType != null) {
                                                                        anahash.put(anaType, vAna);
                                                                }
                                                                vAna = "";
                                                                flagAna = false;
                                                                break;
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if(flagForm)
                                                        vForm += parser.getText().trim();
                                                if (flagAna) {
                                                        vAna += parser.getText().trim()
                                                }
                                                break;
                                }
                        }
                        success = true;
                }
                catch (XMLStreamException ex) {
                        System.out.println(ex);
                }
                catch (IOException ex) {
                        System.out.println("IOException while parsing " + inputData + ": " + ex);
                }
                finally {
                        try {
                                output.close();
                        } catch (IOException ex) {
                                // buffered content may not have reached the file
                                System.out.println("IOException while closing " + fileName + ": " + ex);
                                success = false;
                        }
                        try {
                                parser.close();
                        } catch (XMLStreamException ex) {
                                System.out.println(ex);
                        }
                        // XMLStreamReader.close() does not close the underlying stream.
                        try {
                                if (inputData != null) inputData.close();
                        } catch (IOException ignored) {
                                // the content has already been read
                        }
                }

                return success;
        }
357

    
358
        /**
         * Requests the alignment step of run() and records the structure and
         * attribute names given by the caller.
         *
         * @param structure the structure name, stored in alignStructure
         * @param attribute the attribute name, stored in alignAttribute
         */
        public void setAlignment(String structure, String attribute)
        {
                alignStructure = structure;
                alignAttribute = attribute;
                doAlign = true;
        }
370

    
371
        /**
         * Collects the names of all attributes found on tu elements.
         *
         * Each file is parsed completely; its reader and stream are closed before
         * the next file is opened, including when parsing fails.
         *
         * @param files the XML files to scan
         * @return the distinct attribute names, in no particular order
         */
        public List<String> getTuProps(List<File> files)
        {
                Set<String> tuprops = new HashSet<String>();
                def xmlFactory = XMLInputFactory.newInstance();
                for (File f : files) {
                        def stream = f.toURI().toURL().openStream();
                        def reader = null;
                        try {
                                reader = xmlFactory.createXMLStreamReader(stream);
                                for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next())
                                {
                                        if (event == XMLStreamConstants.START_ELEMENT && "tu".equals(reader.getLocalName())) {
                                                for ( int i = 0 ; i < reader.getAttributeCount() ; i++) {
                                                        tuprops.add(reader.getAttributeLocalName(i));
                                                }
                                        }
                                }
                        } finally {
                                // closing the reader does not close the stream it reads from
                                try {
                                        if (reader != null) reader.close();
                                } finally {
                                        stream.close();
                                }
                        }
                }
                return new ArrayList<String>(tuprops);
        }
400

    
401
        /** Languages shared by all compiler instances; run() looks them up with the first file name of each language group. */
        public static HashMap<String, String> langs;

        /**
         * Stores the language map in the static {@code langs} field.
         *
         * @param langs the languages, keyed by file name
         */
        public void setLangs(HashMap<String, String> langs) {
                this.langs = langs;
        }
406

    
407
        /** Language groups shared by all compiler instances: for each group number, the names of its files in the XML-TXM directory. */
        public static HashMap<Integer, ArrayList<String>> langGroups;

        /**
         * Stores the language groups in the static {@code langGroups} field.
         *
         * @param groups the file names of each group, keyed by group number
         */
        public void setLangGroups(HashMap<Integer, ArrayList<String>> groups) {
                this.langGroups = groups;
        }
412
        
413
        /**
         * Corpus identifier of each language group, shared by all compiler
         * instances. When it is null or empty, run() fills it with the lower-cased
         * concatenation of the group language and the group number.
         * NOTE(review): run() reads and writes the values as plain strings despite
         * the declared ArrayList value type — confirm what callers pass.
         */
        public static HashMap<Integer, ArrayList<String>> corpusIDS;

        /**
         * Stores the corpus identifiers in the static {@code corpusIDS} field.
         *
         * @param corpusIDS the identifiers, keyed by group number
         */
        public void setCorpusIDS(HashMap<Integer, ArrayList<String>> corpusIDS) {
                this.corpusIDS = corpusIDS;
        }
418
        
419
        /**
         * Builds the CWB corpora of a TMX import.
         *
         * Steps: (1) the files of each language group are converted and appended
         * to one CQP file per group, wrapped in a txmcorpus element, and each
         * source file is moved to binDir/txm/CORPUS; (2) every CQP file is encoded
         * with CwbEncode and processed with CwbMakeAll; (3) when alignment was
         * requested, every ordered pair of corpora is aligned on seg/id.
         *
         * The languages and language groups must have been set beforehand.
         *
         * @param binDir the binary directory of the corpus; its cqp and data
         *        sub-directories are deleted and recreated
         * @param txmDir the directory holding the files named in the language groups
         * @param basename the prefix of the CQP file and corpus names
         * @return true if successful, false as soon as one step fails
         */
        public boolean run(File binDir, File txmDir, String basename)
        {
                sattrsListener = null; // reset SAttribute Listener for each new import
                String rootDir = binDir.getAbsolutePath();
                seg_id=1;

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }
                if (!new File(rootDir).exists()) {
                        println ("binary directory does not exists: "+rootDir)
                        return false;
                }
                // Everything below reads these two maps; fail with a message instead of a NullPointerException.
                if (langGroups == null || langs == null) {
                        println ("Error: languages and language groups must be set before running the compiler.")
                        return false;
                }

                new File(binDir,"/cqp/").deleteDir();
                new File(binDir,"/cqp/").mkdir();
                new File(binDir,"/data/").deleteDir();
                new File(binDir,"/data/").mkdir();
                new File(binDir,"registry/").mkdir();

                if (corpusIDS == null || corpusIDS.size() == 0) {
                        // Built locally so that an aborted run does not leave a partial static map behind.
                        def ids = [:];
                        for (int group : langGroups.keySet()) {
                                String lang = langs.get(langGroups.get(group)[0]);
                                if (lang ==null) {
                                        println "ERROR: no lang defined for group $group . Aborting."
                                        return false;
                                }
                                ids[group] = (lang+group).toLowerCase();
                        }
                        corpusIDS = ids;
                }
                println "Using corpus ID: $corpusIDS"

                // open the txmcorpus element of each CQP file and create the per-corpus txm directory
                String cqpName;
                for (int group : langGroups.keySet()) {
                        String lang = langs.get(langGroups.get(group)[0]);
                        segs_id.put(group, 0);
                        cqpName = basename+"_"+corpusIDS.get(group);
                        if (!createOutput(rootDir+"/cqp", "${cqpName}.cqp")) {
                                return false;
                        }
                        output.write("<txmcorpus id=\"${cqpName}\" lang=\"$lang\">\n")
                        output.close();

                        //create txmDirs
                        new File(rootDir, "txm/"+cqpName.toUpperCase()).mkdir();
                }

                //1- Transform into CQP file and build a corpus per file
                def anaTypesPerCqp = [:];
                def builder = null;
                for (int group : langGroups.keySet()) {
                        cqpName = basename+"_"+corpusIDS.get(group);
                        def filenames = langGroups.get(group);
                        filenames.sort()
                        for (String filename : filenames) {
                                File f = new File(txmDir, filename);
                                print "."
                                if (!f.exists()) {
                                        println("COMPILER: file "+f+ " does not exists -> stop")
                                        return false;
                                }

                                // the text name is the file name up to its last underscore
                                String txtname = f.getName();
                                txtname = txtname.substring(0, txtname.lastIndexOf("_"));
                                seg_id = segs_id.get(group);
                                builder = new compiler(f.toURI().toURL(), txtname, basename, "default", tuprops);
                                builder.transfomFileCqp(rootDir+"/cqp",cqpName+".cqp");
                                anaTypesPerCqp[cqpName] = builder.getAnaTypes();
                                segs_id.put(group, seg_id);

                                // move xml-txm file
                                File txmCorpusDir = new File(rootDir, "txm/"+cqpName.toUpperCase())
                                f.renameTo(new File(txmCorpusDir, f.getName()));
                        }
                }
                println ""

                // close the txmcorpus element of each CQP file
                for (int group : langGroups.keySet()) {
                        if (!createOutput(rootDir+"/cqp", basename+"_"+corpusIDS.get(group)+".cqp")) {
                                return false;
                        }
                        output.write("</txmcorpus>")
                        output.close();
                }

                if(builder == null) {
                        System.out.println("No TXM files to process Stop import");
                        return false;
                }
                //2- Import into CWB
                def outDir = rootDir;
                def outDirTxm = rootDir;

                def cqpFiles = [:]
                for (File cqpfile : new File(rootDir, "cqp").listFiles()) {
                        String corpusname = cqpfile.getName();

                        // strip the 4-character extension
                        corpusname = corpusname.substring(0, corpusname.length()-4);
                        def corpusAnaTypes = anaTypesPerCqp[corpusname];

                        CwbEncode cwbEn = new CwbEncode();
                        cwbEn.setDebug(debug);
                        CwbMakeAll cwbMa = new CwbMakeAll();
                        cwbMa.setDebug(debug);

                        def pAttrs = ["id", "ref"];
                        for(String type : corpusAnaTypes) {
                                // Suffix the original name (type2, type3, ...) until it is unique;
                                // appending to the already suffixed name produced type23.
                                String candidate = type;
                                int i = 2;
                                while (pAttrs.contains(candidate))
                                        candidate = type+(i++);
                                pAttrs.add(candidate);
                        }

                        structs = sattrsListener.getStructs();
                        structsProf = sattrsListener.getProfs();
                        // add structures+properties found in sources
                        List<String> sargs = new ArrayList<String>();

                        for (String name : structs.keySet()) {
                                if (name == "header") continue;
                                String concat = name+":"+structsProf.get(name); // append the depth
                                for (String value : structs.get(name)) // append the attributes
                                        concat += "+"+value;
                                // the text structure always declares id, base and project
                                if ((name == "text") &&
                                !(concat.endsWith("+id") || concat.contains("+id+")))
                                        concat += "+id"
                                if ((name == "text") &&
                                !(concat.endsWith("+base") || concat.contains("+base+")))
                                        concat += "+base"
                                if ((name == "text") &&
                                !(concat.endsWith("+project") || concat.contains("+project+")))
                                        concat += "+project"
                                sargs.add(concat);
                        }

                        sargs.add("txmcorpus:0+id+lang")
                        sargs.sort();

                        String[] sAttributes = sargs;
                        String[] pAttributes = pAttrs;
                        println "P-attributes: "+pAttrs
                        println "S-attributes: "+sargs

                        try {
                                cqpFiles[corpusname.toLowerCase()] = outDir + "/cqp/"+cqpfile.getName();
                                String regPath = outDirTxm + "/registry/"+corpusname.toLowerCase()
                                cwbEn.run(outDirTxm + "/data/"+corpusname.toUpperCase()+"/", outDir + "/cqp/"+cqpfile.getName(), regPath, pAttributes, sAttributes);
                                if (!new File(regPath).exists()) {
                                        println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                        return false;
                                }
                                cwbMa.run(corpusname.toUpperCase(), outDirTxm + "/registry");
                        } catch (Exception ex) {System.out.println(ex); return false;}
                }

                // 3- do alignement
                if (doAlign) {
                        File registryDirectory = new File(rootDir,"registry");
                        def registryFiles = registryDirectory.listFiles();
                        for (File corpusName : registryFiles) {
                                for (File targetName : registryFiles) {
                                        if (!corpusName.equals(targetName)) {
                                                // The registry directory is not emptied by this method: skip
                                                // entries that do not belong to a CQP file encoded above.
                                                def cqpPath1 = cqpFiles[corpusName.getName()];
                                                def cqpPath2 = cqpFiles[targetName.getName()];
                                                if (cqpPath1 == null || cqpPath2 == null) continue;
                                                try {
                                                        PatchCwbRegistry.patchAlignment(new File(registryDirectory, corpusName.getName()), targetName.getName());

                                                        File cqpFile1 = new File(cqpPath1);
                                                        File cqpFile2 = new File(cqpPath2);
                                                        File alignOutFile = new File(outDir, "align.out");

                                                        BuildAlignOut bao = new BuildAlignOut(cqpFile1, cqpFile2);
                                                        if (!bao.process(alignOutFile, "seg", "id")) {
                                                                println "Error while creating alignement file of $corpusName. Aborting."
                                                                return false;
                                                        }

                                                        CwbAlign tt = new CwbAlign();
                                                        // ./cwb-align-encode -D -r ~/TXM/corpora/tmxtest/registry/ -v out.align
                                                        tt.setD();
                                                        tt.setv();
                                                        tt.setr(new File(rootDir,"registry").getAbsolutePath());
                                                        tt.cwbalignencode(alignOutFile.getAbsolutePath());
                                                } catch (IOException e) {
                                                        // this step runs the CWB alignment tools, not TreeTagger
                                                        Log.severe("Error while aligning corpora: "+e.getLocalizedMessage());
                                                        return false;
                                                }
                                        }
                                }
                        }
                }

                return true;
        }
634

    
635
        /**
         * Returns the corpora name list held by this compiler.
         *
         * @return the live {@code corporanames} list, not a copy
         */
        public List<String> getCorporaNames() {
                return corporanames;
        }
644

    
645
        /**
         * Switches the debug flag on; run() forwards it to the CwbEncode and
         * CwbMakeAll helpers. The flag cannot be switched off again through this
         * method.
         */
        public void setDebug() {
                debug = true;
        }
652

    
653
        /**
654
         * The main method.
655
         *
656
         * @param args the arguments
657
         */
658
        // Manual entry point: builds a compiler in debug mode and runs it on a hard-coded directory.
        // The command-line arguments are not read.
        public static void main(String[] args)
659
        {
660
                // NOTE(review): java.io.File does not expand "~"; this path is taken literally.
                File dir = new File("~/xml/geo");
661
                def c = new compiler();
662
                c.setDebug();
663
                // NOTE(review): no setCwbPath method is declared in this class; this call looks stale — confirm before use.
                c.setCwbPath("~/TXM/cwb/bin");
664
                // NOTE(review): run is declared in this class as run(File binDir, File txmDir, String basename);
                // this two-argument call does not match that signature — confirm and update.
                c.run(dir,"geo");
665
        }
666
}