Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / transcriber / compiler.groovy @ 187

History | View | Annotate | Download (17.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$
27
//
28
package org.txm.importer.transcriber
29

    
30
import java.io.File;
31

    
32
import javax.xml.stream.*;
33

    
34
import java.io.OutputStreamWriter;
35
import java.util.LinkedHashMap;
36

    
37
import org.txm.importer.cwb.CwbEncode
38
import org.txm.importer.cwb.CwbMakeAll
39
import org.txm.importer.cwb.CwbProcess;
40
import org.txm.importer.cwb.PatchCwbRegistry;
41
import org.txm.utils.Pair;
42

    
43
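/**
 * Compiles XML transcription files into a CWB corpus: the words and
 * structures of every file are written to a single ".wtc" source file,
 * which is then indexed with cwb-encode and cwb-makeall.
 */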
47
class compiler {
48

    
49
        /** When true, the text id is prepended to the "ref" value of each word. */
        boolean ADD_TEXTID_TO_REF = true

        /** Input stream of the XML file currently being read. */
        private def inputData

        /** StAX factory used to create the parser. */
        private def factory

        /** StAX parser of the XML file currently being read. */
        private XMLStreamReader parser

        /** Writer on the .wtc file. */
        OutputStreamWriter output

        /** Name of the corpus; also written as the "base" attribute of each text. */
        String corpusname

        /** Name of the project; written as the "project" attribute of each text. */
        String projectname

        /** Path of the output (binary) directory. */
        String outdir

        /** Debug flag forwarded to the CWB tools. */
        boolean debug

        /** Annotation success flag. */
        boolean annotationSucces

        /** When true, the words uttered by interviewers are not written. */
        boolean removeinterviewers

        /** Metadata read on the text element, stored under the "trans" key. */
        HashMap<String, ArrayList<Pair<String, String>>> trans

        /** The speakers. */
        HashMap<String, ArrayList<Pair<String, String>>> speakers

        /** The speakers names. */
        HashMap<String, String> speakersname = new HashMap<String, String>()

        /** The topics, indexed by id. */
        HashMap<String, ArrayList<Pair<String, String>>> topics

        /** Values of the text attributes whose name starts with "enq". */
        ArrayList<String> interviewers = new ArrayList<String>()

        /** Names of all the attributes met on div elements. */
        static HashSet<String> sectionAttrs

        /** Types of the ana elements met in the files; each one becomes a word property. */
        private static def anatypes = new ArrayList()

        /** Ana values of the current word, indexed by type. */
        private static def anavalues = new LinkedHashMap()
101

    
102
        /**
103
         * Removes the interviewers.
104
         *
105
         * @param value the value
106
         * @return the java.lang. object
107
         */
108
        public removeInterviewers(boolean value) {
                // When set, process() skips the words uttered by the interviewers.
                removeinterviewers = value
                return value
        }
111

    
112
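        /**
         * Compiles the given XML transcription files into one CWB source file
         * ({@code wtc/<corpusname>.wtc} under {@code binDir}), then indexes it
         * with cwb-encode and cwb-makeall.
         *
         * @param xmlfiles the XML files to compile; the list is sorted in place
         * @param corpusname the corpus name; lower-cased to name the .wtc and registry files
         * @param projectname the project name written on each text element
         * @param binDir the directory in which the "wtc" and "registry" directories are created
         * @return true, if successful
         */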
121
        public boolean run(List<File> xmlfiles, String corpusname, String projectname, File binDir)
        {
                // Compiles every XML file into <binDir>/wtc/<corpusname>.wtc, then runs
                // cwb-encode and cwb-makeall on it. Returns false as soon as a step fails.
                Collections.sort(xmlfiles);
                this.outdir = binDir; // Groovy coerces the File to its path String
                this.corpusname = corpusname;
                this.projectname = projectname;

                anatypes = ["event"] // reset
                anavalues = [:] // reset
                sectionAttrs = new HashSet<String>() // reset the set of div attributes

                File wtcDir = new File(binDir, "wtc")
                wtcDir.delete(); // only succeeds on an empty directory
                wtcDir.mkdir();
                if (!wtcDir.exists()) {
                        println "Can't create wtc directory"
                        return false;
                }

                File wtcFile = new File(wtcDir, corpusname.toLowerCase()+".wtc")
                // createOutput() appends to an existing file: a .wtc left by a previous run
                // must be removed first, otherwise the new corpus is appended to the old one.
                if (wtcFile.exists() && !wtcFile.delete()) {
                        println "Can't delete the previous wtc file: "+wtcFile
                        return false;
                }

                // first pass: collect all the ana types, they become word properties
                for (File f : xmlfiles) {
                        getAnaTypes(f)
                }

                if (!createOutput(wtcFile)) return false;
                output.write("<txmcorpus lang=\"fr\">\n")
                output.close();

                // second pass: each call to process() re-opens the .wtc file in append mode
                println("Compiling "+xmlfiles.size()+" files")
                for (File f : xmlfiles) {
                        if (f.exists() && !process(f)) {
                                println("Failed to compile "+f)
                        }
                }

                if (!createOutput(wtcFile)) return false;
                output.write("</txmcorpus>\n")
                output.close();

                //2- Import into CWB
                File registryDir = new File(binDir, "registry")
                File registryFile = new File(binDir, "registry/"+corpusname.toLowerCase())
                File dataDir = new File(binDir, "data/$corpusname")

                registryDir.mkdir();
                if (!registryDir.exists()) {
                        println "Can't create registry directory"
                        return false;
                }

                String cwbLoc = CwbProcess.getCWBLocation();
                if (cwbLoc == null) return false;

                CwbEncode cwbEn = new CwbEncode();
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbEn.setDebug(debug);
                cwbMa.setDebug(debug);

                String uAttr = "u:0+spkid+spk+scope+accent+s+time+check+dialect+type";

                // The text attributes are declared from the metadata of the last processed file.
                // The stored names are already normalized (no "_", lower case), so the ignore
                // list holds the normalized spellings too; otherwise "audiofilename" and
                // "versiondate" would be declared although they are never written.
                List<String> ignoredMetadata = ["scribe", "audio_filename", "audiofilename",
                        "version", "version_date", "versiondate"]
                String textAttr = "text:0+base+project"
                if (trans != null) {
                        for (String key : trans.keySet()) {
                                for (Pair p : trans.get(key)) {
                                        String meta = p.getFirst();
                                        if (!ignoreTranscriberMetadata || !ignoredMetadata.contains(meta)) {
                                                textAttr += "+"+meta
                                        }
                                }
                                break; // only the first entry is used
                        }
                }

                String sectionAttr = "div:0" // "div:0+id+topic+endtime+starttime+type"
                for (String attr : sectionAttrs) {
                        sectionAttr += "+"+attr
                }

                // word properties, in the order of the columns written by process()
                List<String> pargs = ["spk", "ref", "id", "entitytype", "entityid"]
                for (String ana : anatypes) pargs.add(ana)

                String[] pAttributes = pargs
                String[] sAttributes = ["txmcorpus:0+lang", uAttr , textAttr, "event:0+id+desc+type+extent", sectionAttr, "sp:0+id+speaker+endtime+starttime+overlap+time"];

                println "pAttributes: $pAttributes"
                println "sAttributes: $sAttributes"

                try {
                        cwbEn.run(cwbLoc + "cwb-encode", dataDir.getAbsolutePath(),
                                        wtcFile.getAbsolutePath(),
                                        registryFile.getAbsolutePath(), pAttributes, sAttributes);
                        if (!registryFile.exists()) {
                                // the message used to reference an undefined variable (regPath)
                                println "Error: The registry file was not created: ${registryFile}. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(cwbLoc + "cwb-makeall", corpusname, registryFile.getParent());
                } catch (Exception ex) {
                        System.out.println(ex);
                        return false;
                }

                return true;
        }
232

    
233
        /**
         * Opens the UTF-8 writer {@code output} on the given file,
         * appending to the file when it already exists.
         *
         * @param f the file to write to
         * @return true, if successful
         */
240
        private boolean createOutput(File f){
                boolean opened = false
                try {
                        // append mode when the file is already there, creation otherwise
                        FileOutputStream rawStream = new FileOutputStream(f, f.exists())
                        output = new OutputStreamWriter(new BufferedOutputStream(rawStream), "UTF-8")
                        opened = true
                } catch (Exception e) {
                        // the failure is only reported; the caller gets false
                        System.out.println(e.getLocalizedMessage())
                }
                return opened
        }
250

    
251
        /** Id of the text being processed: the file name without its extension. */
        String text_id

        /** The u opened. */
        boolean uOpened

        /** Turn counter, reset for each file. */
        int idturn = 1

        /** Section counter, reset for each file. */
        int idsection = 1

        /** Utterance counter, reset for each file. */
        int idu = 1

        /** Event counter. */
        int idevent = 1

        /** Descriptions of the "pronounce" events currently opened. */
        List<String> events = new ArrayList<String>()

        /** Id of the entity currently opened, 0 when there is none. */
        static int vEntityId = 0

        /** Next entity id to give. */
        static int vEntityIdCount = 1
273

    
274
        /**
275
         * Process.
276
         *
277
         * @param xmlfile the xmlfile
278
         * @return true, if successful
279
         */
280
        private boolean process(File xmlfile) {
                // Appends the content of one XML file to the .wtc file: one line per word
                // (form, spk, ref, id, entity type, entity id, ana values) plus the
                // text, div, sp, u and event structures.

                // The text id is the file name without its 4-character extension (".xml").
                // Shorter names are kept as is instead of raising StringIndexOutOfBoundsException.
                String filename = xmlfile.getName()
                String textid = (filename.length() > 4) ? filename.substring(0, filename.length() - 4) : filename
                text_id = textid

                idturn = 1;
                idsection = 1;
                idu = 1;

                boolean flagAna = false   // true while reading the content of an <ana>
                boolean flagForm = false  // true while reading the content of a <form>
                String vForm = ""
                String vAna = ""
                String vEvents = "N/A"
                String vEntityType = "N/A"
                String wordid = ""
                String anatype = ""
                String anavalue = ""
                String formatedTime
                String localname

                parser = null
                inputData = null
                boolean completed = false
                try {
                        inputData = xmlfile.toURI().toURL().openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        // do not go on with a writer that could not be opened
                        if (!createOutput(new File(outdir, "wtc/"+corpusname.toLowerCase()+".wtc"))) {
                                return false;
                        }

                        // read the metadata declared at the beginning of the file
                        speakers = new HashMap<String, ArrayList<Pair<String, String>>>();
                        trans = new HashMap<String, ArrayList<Pair<String, String>>>();
                        topics = new HashMap<String, ArrayList<Pair<String, String>>>();
                        parseInfos();

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName();
                                                switch (localname) {
                                                        case "div":
                                                                output.write("<div");
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        String name = parser.getAttributeLocalName(i).replace("_","").toLowerCase()
                                                                        output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
                                                                        sectionAttrs << name // declared later to cwb-encode
                                                                }
                                                                output.write(">\n")
                                                                break;
                                                        case "sp":
                                                                output.write("<sp");
                                                                writeAttributes();
                                                                output.write(">\n")
                                                                break;
                                                        case "u":
                                                                output.write("<u");
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        String name = parser.getAttributeLocalName(i).replace("_","").toLowerCase()
                                                                        output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
                                                                        // kept to build the spk and ref properties of the words
                                                                        if (name == "time") formatedTime = parser.getAttributeValue(i)
                                                                        else if (name == "spk") u_name = parser.getAttributeValue(i)
                                                                }
                                                                output.write(">\n")
                                                                break;
                                                        case "event":
                                                                output.write("<event");
                                                                writeAttributes();
                                                                output.write(">\n")

                                                                String eventType = parser.getAttributeValue(null, "type")
                                                                String eventExtent = parser.getAttributeValue(null, "extent")
                                                                String eventDesc = parser.getAttributeValue(null, "desc")
                                                                if (eventType == "entities") {
                                                                        if (eventExtent == "begin") {
                                                                                vEntityType = eventDesc;
                                                                                vEntityId = vEntityIdCount++;
                                                                        } else {
                                                                                vEntityType = "N/A";
                                                                                vEntityId = 0;
                                                                        }
                                                                } else if (eventType == "pronounce") {
                                                                        if (eventExtent == "begin")
                                                                                events.add(eventDesc)
                                                                        else if (eventExtent == "end")
                                                                                events.remove(eventDesc)
                                                                        // opened events, separated by '#'
                                                                        vEvents = events.join("#")
                                                                }
                                                                break;
                                                        case "w":
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        if (parser.getAttributeLocalName(i).equals("id")) {
                                                                                wordid = parser.getAttributeValue(i);
                                                                                break;
                                                                        }
                                                                }
                                                                anavalues = [:];
                                                                break;
                                                        case "form":
                                                                flagForm = true;
                                                                vForm = "";
                                                                vAna = "";
                                                                break;
                                                        case "ana":
                                                                flagAna = true;
                                                                anavalue = "";
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        if (parser.getAttributeLocalName(i).equals("type")) {
                                                                                anatype = parser.getAttributeValue(i).substring(1); //remove the #
                                                                                break;
                                                                        }
                                                                }
                                                                break;
                                                }
                                                break;
                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName();
                                                switch (localname) {
                                                        case "text":
                                                                output.write("</text>\n")
                                                                break;
                                                        case "div":
                                                                output.write("</div>\n")
                                                                break;
                                                        case "sp":
                                                                output.write("</sp>\n")
                                                                break;
                                                        case "u":
                                                                output.write("</u>\n")
                                                                break;
                                                        case "event":
                                                                output.write("</event>\n")
                                                                break;
                                                        case "form":
                                                                flagForm = false;
                                                                break;
                                                        case "ana":
                                                                anavalues.put(anatype, anavalue)
                                                                flagAna = false;
                                                                break;
                                                        case "w":
                                                                // concat spk id and ref; interviewers are marked with a '*'
                                                                String isEnq = (interviewers.contains(u_name)) ? "*" : "";
                                                                String ref = (u_name+", "+formatedTime+""+isEnq)
                                                                if (ADD_TEXTID_TO_REF) ref = textid+", "+ref
                                                                vForm += "\t"+u_name+"\t"+ref

                                                                // concat entity and entity ID
                                                                vAna += "\t"+vEntityType+"\t"+vEntityId;

                                                                // concat ana values, one column per known type
                                                                for (String type : anatypes) {
                                                                        def v = anavalues.get(type);
                                                                        if (v == null) v = "";

                                                                        if ("event" == type) {
                                                                                if (v.length() > 0)
                                                                                        vAna += "\t#"+v;
                                                                                else
                                                                                        vAna += "\t";

                                                                                //concat <Event> values
                                                                                if (vEvents != null && vEvents.length() > 0 && vEvents != "N/A")
                                                                                        vAna += "#"+vEvents;
                                                                        } else {
                                                                                vAna += "\t"+v;
                                                                        }
                                                                }

                                                                vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");

                                                                // the words of the interviewers are skipped on demand
                                                                if (!removeinterviewers || !interviewers.contains(u_name)) {
                                                                        output.write(vForm+"\t"+wordid+vAna+"\n");
                                                                }

                                                                vAna = "";
                                                                vForm = "";
                                                                break;
                                                }
                                                break;
                                        case XMLStreamConstants.CHARACTERS:
                                                if (flagForm)
                                                        vForm += parser.getText().trim();
                                                if (flagAna)
                                                        anavalue += parser.getText().trim();
                                                break;
                                }
                        }

                        parser.close();
                        inputData.close();
                        output.close();
                        completed = true
                        return true;
                } finally {
                        if (!completed) {
                                // Best-effort release of the resources; the failure that brought us
                                // here (exception or false result) stays the one seen by the caller.
                                try { if (parser != null) parser.close() } catch (Exception ignored) { }
                                try { if (inputData != null) inputData.close() } catch (Exception ignored) { }
                                try { if (output != null) output.close() } catch (Exception ignored) { }
                        }
                }
        }
514

    
515
        /** The u_name. */
516
        String u_name; // value of the "spk" attribute of the last <u> start tag read by process()
517

    
518
        /**
519
         * Write start tag.
520
         */
521
        private void writeStartTag() {
                // start tag of the current element, name kept as is, followed by its attributes
                String opening = "<" + parser.getLocalName()
                output.write(opening)
                writeAttributes()
                output.write(">\n")
        }
526

    
527
        /**
528
         * Write attributes.
529
         */
530
        private void writeAttributes() {
                // Writes every attribute of the current element as name="value";
                // names lose their underscores and are lower-cased, values are written raw.
                int count = parser.getAttributeCount()
                int idx = 0
                while (idx < count) {
                        String attrName = parser.getAttributeLocalName(idx).replace("_","").toLowerCase()
                        String attrValue = parser.getAttributeValue(idx)
                        output.write(" "+attrName+"=\""+attrValue+"\"")
                        idx++
                }
        }
535

    
536
        private void getAnaTypes(File xmlFile) {
                // Scans the file for <ana> elements and adds every new @type value
                // (without its leading '#') to the static anatypes list.
                HashSet<String> types = new HashSet<String>();

                inputData = xmlFile.toURI().toURL().openStream();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        try {
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (event != XMLStreamConstants.START_ELEMENT) continue;
                                        if (!"ana".equals(parser.getLocalName())) continue;

                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) { // find @type
                                                if ("type".equals(parser.getAttributeLocalName(i))) {
                                                        types.add(parser.getAttributeValue(i).substring(1)); //remove the #
                                                        break;
                                                }
                                        }
                                }
                        } finally {
                                parser.close()
                        }
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream:
                        // it has to be closed here or one file handle leaks per file.
                        inputData.close()
                }

                for (String type : types) {
                        if (!anatypes.contains(type)) {
                                anatypes << type
                        }
                }
        }
560

    
561
        /**
562
         * Write start tag.
563
         *
564
         * @param id the id
565
         */
566
        private void writeStartTag(int id) {
                // lower-cased start tag of the current element, with the given id
                // written before the attributes of the element
                String tagName = parser.getLocalName().toLowerCase()
                output.write("<" + tagName)
                output.write(" id=\"" + id + "\"")
                writeAttributes()
                output.write(">\n")
        }
572

    
573
        /**
574
         * Write end tag.
575
         */
576
        private void writeEndTag() {
                // lower-cased end tag of the current element
                String tagName = parser.getLocalName().toLowerCase()
                output.write("</" + tagName + ">\n")
        }
579

    
580
        /** The ignore transcriber metadata. */
581
        boolean ignoreTranscriberMetadata = false; // when true, the scribe, audio_filename, version and version_date metadata are left out of the <text> attributes
582

    
583
        /**
584
         * Sets the ignore transcriber metadata.
585
         *
586
         * @param state the new ignore transcriber metadata
587
         */
588
        public void setIgnoreTranscriberMetadata(boolean state) {
                // true: the Transcriber-specific metadata are not written on the text element
                ignoreTranscriberMetadata = state
        }
591

    
592
        /**
593
         * Parses the infos.
594
         */
595
        private void parseInfos() { // scans the header, up to and including the <text> start tag
                // Attributes left out of the <text> tag when ignoreTranscriberMetadata is set.
                def transcriberOnlyAttributes = ["scribe", "audio_filename", "version", "version_date"]

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        if (event != XMLStreamConstants.START_ELEMENT) {
                                continue // only element openings are of interest here
                        }

                        String localname = parser.getLocalName();
                        int attributeCount = parser.getAttributeCount();
                        switch (localname) {
                                case "text":
                                        // Step 1: write the opening tag. Attribute names are normalized by
                                        // dropping underscores and lower-casing them.
                                        // NOTE(review): attribute values are written as-is, without any XML
                                        // escaping -- confirm that the consumer of this output expects that.
                                        output.write("<text project=\""+projectname+"\" base=\""+corpusname+"\"")
                                        for (int i = 0 ; i < attributeCount ; i++) {
                                                String rawName = parser.getAttributeLocalName(i);
                                                if (ignoreTranscriberMetadata && (rawName in transcriberOnlyAttributes)) {
                                                        continue
                                                }
                                                output.write(" "+rawName.replace("_","").toLowerCase()+"=\""+parser.getAttributeValue(i)+"\"");
                                        }
                                        output.write ">\n"

                                        // Step 2: record every attribute under the "trans" key. Nothing is
                                        // filtered here, even when ignoreTranscriberMetadata is set.
                                        ArrayList<Pair<String, String>> textAttributes = new ArrayList<Pair<String, String>>()
                                        trans.put("trans", textAttributes);
                                        for (int i = 0 ; i < attributeCount ; i++) {
                                                String rawName = parser.getAttributeLocalName(i);
                                                String value = parser.getAttributeValue(i);
                                                textAttributes.add(new Pair(rawName.replace("_","").toLowerCase(), value));
                                                // The value of any attribute whose name starts with "enq" is
                                                // added to the interviewers collection.
                                                if (rawName.startsWith("enq")) {
                                                        interviewers.add(value);
                                                }
                                        }
                                        return // the <text> tag ends the header scan

                                case "Topic":
                                        String topicId = parser.getAttributeValue(null, "id");
                                        if (topicId == null) {
                                                println "found tag $localname with no id"
                                        } else {
                                                // Each topic starts with an empty attribute list.
                                                topics.put(topicId, new ArrayList<Pair<String, String>>());
                                        }
                                        break;

                                case "Speaker":
                                        String speakerId = parser.getAttributeValue(null, "id");
                                        String speakerName = parser.getAttributeValue(null, "name");
                                        if (speakerId == null || speakerName == null) {
                                                println "found tag $localname with no id ($speakerId)or name ($speakerName)"
                                                // NOTE(review): this aborts the whole header scan, whereas a Topic
                                                // without id is only reported -- confirm this is intended.
                                                return;
                                        }
                                        speakersname.put(speakerId, speakerName);

                                        // Keep all attributes of the speaker under their original names.
                                        ArrayList<Pair<String, String>> speakerAttributes = new ArrayList<Pair<String, String>>()
                                        speakers.put(speakerId, speakerAttributes);
                                        for (int i = 0 ; i < attributeCount ; i++) {
                                                speakerAttributes.add(new Pair(parser.getAttributeLocalName(i), parser.getAttributeValue(i)));
                                        }
                                        break;
                        }
                }
        }
662

    
663
        /**
         * Sets the {@code debug} flag to {@code true}.
         */
666
        public void setDebug() {
                // This method only ever switches the flag on; it takes no argument.
                this.debug = true
        }
669

    
670
        /**
         * Sets the annotation success flag.
         *
         * @param val the new value of the annotation success flag
         */
675
        public void setAnnotationSuccess(boolean val) {
                // The backing field is spelled "annotationSucces" (single trailing "s").
                this.annotationSucces = val
        }
678
}