Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / compiler.groovy @ 1000

History | View | Annotate | Download (17.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$
27
//
28
package org.txm.scripts.importer.transcriber
29

    
30
import java.io.File;
31

    
32
import javax.xml.stream.*;
33

    
34
import java.io.OutputStreamWriter;
35
import java.util.LinkedHashMap;
36

    
37
import org.txm.importer.cwb.CwbEncode
38
import org.txm.importer.cwb.CwbMakeAll
39
import org.txm.importer.cwb.CwbProcess;
40
import org.txm.importer.cwb.PatchCwbRegistry;
41
import org.txm.utils.Pair;
42

    
43
// TODO: Auto-generated Javadoc
44
/**
45
 * The Class compiler.
46
 */
47
class compiler {
48

    
49
        /** When true, process() prefixes the "ref" value of every word with the text identifier. */
        boolean ADD_TEXTID_TO_REF = true

        /** Input stream of the XML file currently being read. */
        private def inputData;

        /** StAX factory used to create {@link #parser}. */
        private def factory;

        /** StAX reader positioned in the XML file currently being read. */
        private XMLStreamReader parser;

        /** Writer on the corpus .cqp file; (re)opened by createOutput(). */
        OutputStreamWriter output;

        /** Corpus name; its lower-cased form names the .cqp file and the registry file. */
        String corpusname;

        /** Project name, written as the "project" attribute of every text. */
        String projectname

        /** Path of the binary directory given to run(); the cqp directory is resolved against it. */
        String outdir;

        /** Debug flag forwarded to the CwbEncode and CwbMakeAll instances. */
        boolean debug = false;

        /** Annotation success flag (not read by the code visible in this part of the file). */
        boolean annotationSucces = false;

        /** When true, words whose speaker is listed in {@link #interviewers} are not written. */
        boolean removeinterviewers = false;

        /** Attributes of the text element of the last parsed file, stored under the key "trans". */
        HashMap<String, ArrayList<Pair<String, String>>> trans;

        /** Speaker metadata; re-created for each file by process(). */
        HashMap<String, ArrayList<Pair<String, String>>> speakers;

        /** Speaker names (not used by the code visible in this part of the file). */
        HashMap<String, String> speakersname = new HashMap<String, String>();

        /** Topic metadata keyed by the id of each Topic element. */
        HashMap<String, ArrayList<Pair<String, String>>> topics;

        /** Values of the text attributes whose name starts with "enq". */
        ArrayList<String> interviewers = [];

        /** Normalised names of every attribute seen on a div element; declared on the "div" structure by run(). */
        static HashSet<String> sectionAttrs;

        /** Types of the ana elements found in the source files; each one becomes a word property. */
        private static anatypes = []

        /** Value of each ana type for the word currently being read. */
        private static anavalues = [:]
101

    
102
        /**
103
         * Removes the interviewers.
104
         *
105
         * @param value the value
106
         * @return the java.lang. object
107
         */
108
        public removeInterviewers(boolean value) {
            // Switches the filtering of interviewer words on or off.
            // The assignment is the last expression, so its value is what the method returns.
            removeinterviewers = value
        }
111

    
112
        /**
113
         * Run.
114
         *
115
         * @param xmlfiles the xmlfiles
116
         * @param basename the basename
117
         * @param projectname the projectname
118
         * @param outdir the outdir
119
         * @return true, if successful
120
         */
121
        public boolean run(List<File> xmlfiles, String corpusname, String projectname, File binDir)
        {
            // Builds the corpus: writes every XML file into binDir/cqp/<corpusname>.cqp
            // wrapped in a <txmcorpus> element, then runs CwbEncode and CwbMakeAll on it.
            //
            // xmlfiles:    source files; the list is sorted in place
            // corpusname:  corpus name (lower-cased for the .cqp and registry file names)
            // projectname: value of the "project" attribute of every text
            // binDir:      directory receiving the cqp, registry and data directories
            // Returns false when a directory or the .cqp file cannot be prepared, when the
            // CWB executables are not available, when the registry file is not produced or
            // when a CWB step throws; true otherwise.
            Collections.sort(xmlfiles);
            this.outdir = binDir;
            this.corpusname = corpusname;
            this.projectname = projectname;

            anatypes = ["event"] // reset
            anavalues = [:] // reset
            sectionAttrs = new HashSet<String>() // reset the set of div attribute names

            File cqpDir = new File(binDir, "cqp")
            cqpDir.delete(); // File.delete() only removes an empty directory
            cqpDir.mkdir();
            if (!cqpDir.exists()) {
                println "Can't create cqp directory"
                return false;
            }

            File cqpFile = new File(cqpDir, corpusname.toLowerCase()+".cqp")
            // createOutput() appends to an existing file: a .cqp left by a previous run
            // must be removed, otherwise the new corpus is appended to the old one.
            if (cqpFile.exists() && !cqpFile.delete()) {
                println "Can't delete previous cqp file: "+cqpFile
                return false;
            }

            // get all anatypes (missing files are skipped, as in the compile loop below)
            for (File f : xmlfiles) {
                if (f.exists()) getAnaTypes(f)
            }

            if (!writeCorpusTag(cqpFile, "<txmcorpus lang=\"fr\">\n")) return false;

            println("Compiling "+xmlfiles.size()+" files")
            for (File f : xmlfiles) {
                if (f.exists() && !process(f)) {
                    println("Failed to compile "+f)
                }
            }

            if (!writeCorpusTag(cqpFile, "</txmcorpus>\n")) return false;

            //2- Import into CWB
            File registryDir = new File(binDir, "registry")
            File registryFile = new File(binDir, "registry/"+corpusname.toLowerCase())
            File dataDir = new File(binDir, "data/$corpusname")

            registryDir.mkdir();
            if (!registryDir.exists()) {
                println "Can't create registry directory"
                return false;
            }

            if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                println ("Error: CWB executables not well set.")
                return false;
            }
            CwbEncode cwbEn = new CwbEncode();
            CwbMakeAll cwbMa = new CwbMakeAll();
            cwbEn.setDebug(debug);
            cwbMa.setDebug(debug);

            String uAttr = "u:0+spkid+spk+scope+accent+s+time+check+dialect+type";
            String textAttr = buildTextAttributeDeclaration()

            String sectionAttr = "div:0" // "div:0+id+topic+endtime+starttime+type"
            for (String attr : sectionAttrs) {
                sectionAttr += "+"+attr
            }

            // Same order as the columns written by process() after the word form.
            List<String> pargs = ["spk", "ref", "id", "entitytype", "entityid"]
            for (String ana : anatypes) pargs.add(ana)

            String[] pAttributes = pargs

            String[] sAttributes = ["txmcorpus:0+lang", uAttr , textAttr, "event:0+id+desc+type+extent", sectionAttr, "sp:0+id+speaker+endtime+starttime+overlap+time"];

            println "pAttributes: $pAttributes"
            println "sAttributes: $sAttributes"

            try {
                cwbEn.run(dataDir.getAbsolutePath(),
                        cqpFile.getAbsolutePath(),
                        registryFile.getAbsolutePath(), pAttributes, sAttributes);
                if (!registryFile.exists()) {
                    // the message used to interpolate an undefined variable, which threw
                    // instead of reporting the missing registry file
                    println "Error: The registry file was not created: ${registryFile.getAbsolutePath()}. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                    return false;
                }
                cwbMa.run(corpusname, registryFile.getParent());
            } catch (Exception ex) {
                System.out.println(ex);
                return false;
            }

            return true;
        }

        /** Appends one tag line to the .cqp file and closes the writer even if the write fails. */
        private boolean writeCorpusTag(File cqpFile, String tag) {
            if (!createOutput(cqpFile)) return false;
            try {
                output.write(tag)
            } finally {
                output.close();
            }
            return true;
        }

        /** Builds the "text" structure declaration from the attributes recorded for the last parsed text. */
        private String buildTextAttributeDeclaration() {
            String textAttr = "text:0+base+project"
            if (trans == null) return textAttr;

            // The recorded names have "_" removed and are lower-cased, so the
            // normalised spellings are the ones that must be matched here.
            List<String> transcriberMetadata = ["scribe", "audio_filename", "audiofilename", "version", "version_date", "versiondate"]
            for (String key : trans.keySet()) {
                for (Pair p : trans.get(key)) {
                    String meta = p.getFirst();
                    if (ignoreTranscriberMetadata && transcriberMetadata.contains(meta)) continue;
                    textAttr += "+"+meta
                }
                break; // only the first entry is used
            }
            return textAttr
        }
233

    
234
        /**
235
         * Creates the output.
236
         *
237
         * @param dirPathName the dir path name
238
         * @param fileName the file name
239
         * @return true, if successful
240
         */
241
        private boolean createOutput(File f){
            // Opens `output` as a buffered UTF-8 writer on f, appending when f already
            // exists. Any failure is printed and reported by returning false.
            boolean opened = false
            try {
                boolean append = f.exists()
                FileOutputStream rawStream = new FileOutputStream(f, append)
                output = new OutputStreamWriter(new BufferedOutputStream(rawStream) , "UTF-8");
                opened = true
            } catch (Exception e) {
                System.out.println(e.getLocalizedMessage());
            }
            return opened;
        }
251

    
252
        /** Name of the file being processed, without its last four characters. */
        String text_id

        /** Utterance-open flag (not used by the code visible in this part of the file). */
        boolean uOpened = false;

        /** Turn counter, reset to 1 at the start of each process() call. */
        int idturn = 1;

        /** Section counter, reset to 1 at the start of each process() call. */
        int idsection = 1;

        /** Utterance counter, reset to 1 at the start of each process() call. */
        int idu = 1;

        /** Event counter (not used by the code visible in this part of the file). */
        int idevent = 1;

        /** Descriptions of the "pronounce" events currently open. */
        List<String> events = [];

        /** Identifier of the entity currently open; 0 when there is none. */
        static int vEntityId = 0;

        /** Identifier that will be given to the next opened entity. */
        static int vEntityIdCount = 1;
274

    
275
        /**
276
         * Process.
277
         *
278
         * @param xmlfile the xmlfile
279
         * @return true, if successful
280
         */
281
        private boolean process(File xmlfile) {
            // Appends the content of one XML file to the corpus .cqp file: structure tags
            // (text, div, sp, u, event) are copied with normalised attribute names and each
            // w element becomes one tab-separated line:
            //   form, spk, ref, word id, entity type, entity id, then one column per ana type.
            // Returns false when the .cqp file cannot be opened; parsing and I/O errors
            // are propagated to the caller after the streams have been closed.
            String filename = xmlfile.getName()
            // the last four characters are dropped (a ".xml" suffix is expected)
            String textid = filename.substring(0, filename.length() - 4);
            text_id = textid;

            idturn = 1;
            idsection = 1;
            idu = 1;
            // vEvents and vEntityType below start afresh for every file; the fields backing
            // them must too, or an event left open in a previous file leaks into this one.
            events.clear();
            vEntityId = 0;

            boolean flagAna = false;
            boolean flagForm = false;
            String vForm = "";
            String vAna = "";
            String vEvents = "N/A";
            String vEntityType = "N/A"
            String wordid = "";
            String anatype = "";
            String anavalue = "";
            String formatedTime = null;

            inputData = xmlfile.toURI().toURL().openStream();
            try {
                factory = XMLInputFactory.newInstance();
                parser = factory.createXMLStreamReader(inputData);
                try {
                    if (!createOutput(new File(outdir, "cqp/"+corpusname.toLowerCase()+".cqp"))) {
                        return false;
                    }
                    try {
                        //get all metadatas declared before Episode tag
                        speakers = new HashMap<String, ArrayList<Pair<String, String>>>();
                        trans = new HashMap<String, ArrayList<Pair<String, String>>>();
                        topics = new HashMap<String, ArrayList<Pair<String, String>>>();
                        parseInfos();

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                            switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                    switch (parser.getLocalName()) {
                                        case "div":
                                            output.write("<div");
                                            for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                String name = parser.getAttributeLocalName(i).replace("_","").toLowerCase()
                                                output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
                                                sectionAttrs << name // lets run() declare every div attribute
                                            }
                                            output.write(">\n")
                                            break;
                                        case "sp":
                                            output.write("<sp");
                                            writeAttributes();
                                            output.write(">\n")
                                            break;
                                        case "u":
                                            output.write("<u");
                                            for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                String name = parser.getAttributeLocalName(i).replace("_","").toLowerCase()
                                                output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
                                                // kept for the spk and ref columns of the following words
                                                if (name == "time") formatedTime = parser.getAttributeValue(i)
                                                else if (name == "spk") u_name = parser.getAttributeValue(i)
                                            }
                                            output.write(">\n")
                                            break;
                                        case "event":
                                            output.write("<event");
                                            writeAttributes();
                                            output.write(">\n")

                                            String eventType = parser.getAttributeValue(null, "type")
                                            String eventExtent = parser.getAttributeValue(null, "extent")
                                            String eventDesc = parser.getAttributeValue(null, "desc")
                                            if (eventType == "entities") {
                                                if (eventExtent == "begin") {
                                                    vEntityType = eventDesc;
                                                    vEntityId = vEntityIdCount++;
                                                } else {
                                                    vEntityType = "N/A";
                                                    vEntityId = 0;
                                                }
                                            } else if (eventType == "pronounce") {
                                                if (eventExtent == "begin")
                                                    events.add(eventDesc)
                                                else if (eventExtent == "end")
                                                    events.remove(eventDesc)
                                                // open pronounce events, '#'-separated ("" when none)
                                                vEvents = events.join("#");
                                            }
                                            break;
                                        case "w":
                                            for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                                if (parser.getAttributeLocalName(i).equals("id")) {
                                                    wordid = parser.getAttributeValue(i);
                                                    break;
                                                }
                                            }
                                            anavalues = [:];
                                            break;
                                        case "form":
                                            flagForm = true;
                                            vForm = "";
                                            vAna = "";
                                            break;
                                        case "ana":
                                            flagAna = true;
                                            anavalue = "";
                                            for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                                if (parser.getAttributeLocalName(i).equals("type")) {
                                                    anatype = parser.getAttributeValue(i).substring(1);//remove the #
                                                    break;
                                                }
                                            }
                                            break;
                                    }
                                    break;
                                case XMLStreamConstants.END_ELEMENT:
                                    switch (parser.getLocalName()) {
                                        case "text":
                                            output.write("</text>\n")
                                            break;
                                        case "div":
                                            output.write("</div>\n")
                                            break;
                                        case "sp":
                                            output.write("</sp>\n")
                                            break;
                                        case "u":
                                            output.write("</u>\n")
                                            break;
                                        case "event":
                                            output.write("</event>\n")
                                            break;
                                        case "form":
                                            flagForm = false;
                                            break;
                                        case "ana":
                                            anavalues.put(anatype, anavalue)
                                            flagAna = false;
                                            break;
                                        case "w":
                                            // concat spk id and ref; interviewers are marked with '*'
                                            String isEnq = (interviewers.contains(u_name)) ? "*" : "";
                                            String ref = (u_name+", "+formatedTime+""+isEnq)
                                            if (ADD_TEXTID_TO_REF) ref = textid+", "+ref
                                            vForm += "\t"+u_name+"\t"+ref

                                            // concat entity and entity ID
                                            vAna += "\t"+vEntityType+"\t"+vEntityId;

                                            // concat ana values, in the order run() declares them
                                            for (String type : anatypes) {
                                                def v = anavalues.get(type);
                                                if (v == null) v = "";

                                                if ("event" == type) {
                                                    if (v.length() > 0)
                                                        vAna += "\t#"+v;
                                                    else
                                                        vAna += "\t";

                                                    //concat <Event> values
                                                    if (vEvents != null && vEvents.length() > 0 && vEvents != "N/A")
                                                        vAna += "#"+vEvents;
                                                } else {
                                                    vAna += "\t"+v;
                                                }
                                            }

                                            vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");

                                            if (!removeinterviewers || !interviewers.contains(u_name)) {
                                                output.write(vForm+"\t"+wordid+vAna+"\n");
                                            }

                                            vAna = "";
                                            vForm = "";
                                            break;
                                    }
                                    break;
                                case XMLStreamConstants.CHARACTERS:
                                    if (flagForm)
                                        vForm += parser.getText().trim();
                                    if (flagAna)
                                        anavalue += parser.getText().trim();
                                    break;
                            }
                        }
                    } finally {
                        output.close();
                    }
                } finally {
                    parser.close();
                }
            } finally {
                // XMLStreamReader.close() does not close the underlying stream
                inputData.close();
            }
            return true;
        }
515

    
516
        /** The u_name. */
517
        // Value of the spk attribute of the u element most recently opened by process();
        // written in the spk and ref columns of each word and checked against interviewers.
        String u_name;
518

    
519
        /**
520
         * Write start tag.
521
         */
522
        private void writeStartTag() {
            // Copies the element the parser is positioned on as an opening tag,
            // followed by its attributes and a line break.
            String tagName = parser.getLocalName()
            output.write("<"+tagName);
            writeAttributes();
            output.write(">\n")
        }
527

    
528
        /**
529
         * Write attributes.
530
         */
531
        private void writeAttributes() {
            // Writes every attribute of the current element as name="value", with "_"
            // removed from the name and the name lower-cased. Values are written as is.
            int attributeCount = parser.getAttributeCount()
            for (int idx = 0 ; idx < attributeCount ; idx++) {
                String attrName = parser.getAttributeLocalName(idx).replace("_","").toLowerCase()
                String attrValue = parser.getAttributeValue(idx)
                output.write(" "+attrName+"=\""+attrValue+"\"");
            }
        }
536

    
537
        private void getAnaTypes(File xmlFile) {
            // Scans one XML file and adds to `anatypes` every ana/@type value (minus its
            // leading character, expected to be '#') that is not registered yet.
            // New types are appended in document order so that the resulting column
            // order does not depend on hash iteration.
            Set<String> types = new LinkedHashSet<String>();

            inputData = xmlFile.toURI().toURL().openStream();
            try {
                factory = XMLInputFactory.newInstance();
                parser = factory.createXMLStreamReader(inputData);
                try {
                    for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        if (event != XMLStreamConstants.START_ELEMENT) continue;
                        if (!"ana".equals(parser.getLocalName())) continue;

                        for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
                            if ("type".equals(parser.getAttributeLocalName(i))) {
                                types.add(parser.getAttributeValue(i).substring(1)); //remove the #
                                break;
                            }
                        }
                    }
                } finally {
                    parser.close()
                }
            } finally {
                // closing the reader does not close the stream it reads from
                inputData.close()
            }

            for (String type : types) {
                if (!anatypes.contains(type)) {
                    anatypes << type
                }
            }
        }
561

    
562
        /**
563
         * Write start tag.
564
         *
565
         * @param id the id
566
         */
567
        private void writeStartTag(int id) {
            // Same as the no-argument variant, except that the tag name is lower-cased
            // and an id attribute is written before the element's own attributes.
            String tagName = parser.getLocalName().toLowerCase()
            output.write("<"+tagName);
            String idAttribute = " id=\""+id+"\""
            output.write(idAttribute);
            writeAttributes();
            output.write(">\n")
        }
573

    
574
        /**
575
         * Write end tag.
576
         */
577
        private void writeEndTag() {
            // Closes the current element: lower-cased name, followed by a line break.
            String tagName = parser.getLocalName().toLowerCase()
            output.write("</"+tagName+">\n");
        }
580

    
581
        /** The ignore transcriber metadata. */
582
        // When true, parseInfos() does not copy the scribe, audio_filename, version and
        // version_date attributes to the text tag, and run() filters the text attribute list.
        boolean ignoreTranscriberMetadata = false;
583

    
584
        /**
585
         * Sets the ignore transcriber metadata.
586
         *
587
         * @param state the new ignore transcriber metadata
588
         */
589
        public void setIgnoreTranscriberMetadata(boolean state) {
            // Stores the flag that controls whether the Transcriber-specific
            // text attributes are left out.
            ignoreTranscriberMetadata = state
        }
592

    
593
        /**
594
         * Parses the infos.
595
         */
596
        private void parseInfos() {
                // Consumes parser events until the "text" start element has been handled
                // (or the end of the document is reached). On the way:
                //  - each Topic with an id gets an empty entry in `topics`;
                //  - each Speaker gets its name stored in `speakersname` and the list of
                //    its attributes stored in `speakers`;
                //  - the "text" element is written to `output` as a start tag and its
                //    attributes are stored in `trans` under the key "trans".
                // NOTE(review): a Speaker without id or name aborts the whole scan (return),
                // whereas a Topic without id is only reported - confirm this is intended.

                // Attributes left out of the text start tag when ignoreTranscriberMetadata is set.
                List<String> transcriberAttributes = ["scribe", "audio_filename", "version", "version_date"];

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        if (event == XMLStreamConstants.START_ELEMENT) {
                                String localname = parser.getLocalName();
                                switch (localname) {
                                        case "text":
                                                int attributeCount = parser.getAttributeCount();

                                                // start tag: corpus identification first, then the source attributes
                                                output.write("<text project=\""+projectname+"\" base=\""+corpusname+"\"")
                                                for (int i = 0 ; i < attributeCount ; i++) {
                                                        String rawName = parser.getAttributeLocalName(i);
                                                        boolean skipped = ignoreTranscriberMetadata && transcriberAttributes.contains(rawName);
                                                        if (!skipped) {
                                                                output.write(" "+rawName.replace("_","").toLowerCase()+"=\""+parser.getAttributeValue(i)+"\"");
                                                        }
                                                }
                                                output.write(">\n")

                                                // every attribute, skipped above or not, is kept in the "trans" list
                                                ArrayList list = new ArrayList<Pair<String, String>>()
                                                trans.put("trans", list);
                                                for (int i = 0 ; i < attributeCount ; i++) {
                                                        String rawName = parser.getAttributeLocalName(i);
                                                        String value = parser.getAttributeValue(i);
                                                        list.add(new Pair(rawName.replace("_","").toLowerCase(), value));
                                                        // attributes whose name starts with "enq" carry interviewer values
                                                        if (rawName.startsWith("enq")) {
                                                                interviewers.add(value);
                                                        }
                                                }
                                                return
                                        case "Topic":
                                                String id = parser.getAttributeValue(null, "id");
                                                if (id != null) {
                                                        topics.put(id, new ArrayList<Pair<String, String>>());
                                                } else {
                                                        println "found tag $localname with no id"
                                                }
                                                break;
                                        case "Speaker":
                                                String id = parser.getAttributeValue(null, "id");
                                                String name = parser.getAttributeValue(null, "name");
                                                if (id == null || name == null) {
                                                        println "found tag $localname with no id ($id)or name ($name)"
                                                        return;
                                                }
                                                speakersname.put(id, name);

                                                // id is known to be non-null here, so no second check is needed
                                                ArrayList list = new ArrayList<Pair<String, String>>()
                                                speakers.put(id, list);
                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                        list.add(new Pair(parser.getAttributeLocalName(i), parser.getAttributeValue(i)));
                                                }
                                                break;
                                }
                        }
                }
        }
663

    
664
        /**
665
         * Sets the debug flag to true.
666
         */
667
        public void setDebug() {
                // one-way switch: there is no argument, the flag is always set to true
                this.debug = true
        }
670

    
671
        /**
672
         * Sets the annotation success.
673
         *
674
         * @param val the new annotation success
675
         */
676
        public void setAnnotationSuccess(boolean val) {
                // stores the given value in the annotationSucces field (spelled that way where it is used here)
                this.annotationSucces = val;
        }
679
}