Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bvh / compiler.groovy @ 479

History | View | Annotate | Download (14.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.bvh
29

    
30

    
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.*;
34
import org.txm.scripts.*;
35
import org.txm.scripts.teitxm.BuildTTSrc;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38

    
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import java.io.File;
42
import java.util.HashMap;
43
import java.util.List;
44

    
45
// TODO: Auto-generated Javadoc
46
/**
 * Produces WTC files from TEI-TXM files. <br/>
 * Adds the following word properties: "frpos", "frlemme", "afrpos", "afrlemme", "id", "page", "line", "orig", "sic", "abbr". <br/>
 * 
 * @author mdecorde
 */
52
class compiler 
53
{
54
        
55
        /** Value handed to the cwb-encode and cwb-makeall wrappers through setDebug(). */
        private boolean debug = false;

        /** Stream opened on {@code url} by the constructor; read by {@code parser}. */
        private def inputData;

        /** StAX factory that created {@code parser}. */
        private def factory;

        /** StAX reader over the TEI-TXM file being converted. */
        private XMLStreamReader parser;

        /** Not referenced by the methods of this class. */
        private def dir;

        /** Writer on the WTC file; assigned by createOutput(). */
        private def output;

        /** Location of the TEI-TXM file given to the constructor. */
        private def url;

        /** Not referenced by the methods of this class. */
        private HashMap<String,String> anahash = new HashMap<String,String>();

        /** Written as the id attribute of the text structure. */
        String text = "";

        /** Written as the base attribute of the text structure. */
        String base = "";

        /** Written as the project attribute of the text structure. */
        String project = "";

        /** Directory of the CWB binaries (cwb-encode, cwb-makeall). */
        String cwbLoc;

        /** Written as the lang attribute of the txmcorpus structure. */
        private String lang = "fr";
93
        
94
        /**
         * Creates a compiler that is not bound to any input file.
         * main() uses such an instance to call run().
         */
        public compiler()
        {
        }
98
        
99
        /**
         * Binds the compiler to one TEI-TXM file and opens a StAX reader on it.
         *
         * Failures are reported on standard output and are not rethrown; in that
         * case {@code parser} stays null.
         *
         * @param url the file to read
         * @param text the text's name
         * @param base the base's name
         * @param project the project's name
         */
        public compiler(URL url,String text,String base, String project)
        {
                this.text = text;
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while opening " + url + ": " + ex);
                        // the stream was opened but no reader owns it: release it here
                        try {
                                if (inputData != null)
                                        inputData.close();
                        } catch (IOException ignored) {
                                // nothing more can be done for a stream that is being discarded
                        }
                        inputData = null;
                } catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
        }
124
        
125
        /**
         * Sets the language written in the lang attribute of the txmcorpus structure.
         *
         * @param lang the language of the corpus
         * @return the value that was assigned
         */
        public setLang(String lang)
        {
                return (this.lang = lang);
        }
135
        
136
        /**
         * Stores the directory of the CWB binaries.
         * A missing directory is only reported on standard error; the path is
         * stored in every case.
         *
         * @param path the path to the CWB utilities
         */
        public void setCwbPath(String path)
        {
                File location = new File(path);
                if (!location.exists()) {
                        System.err.println("CWB Path : "+path+" does not exists");
                }
                cwbLoc = path;
        }
147
        
148
        /**
         * Opens the UTF-8 writer stored in {@code output}.
         * An existing file is opened in append mode, otherwise a new file is created.
         *
         * @param dirPathName the directory of the output file
         * @param fileName the name of the output file
         * @return true if the writer was opened, false if any exception occurred
         *         (its message is printed on standard output)
         */
        private boolean createOutput(String dirPathName, String fileName){
                boolean opened;
                try {
                        File target = new File(dirPathName, fileName);
                        boolean append = target.exists();
                        FileOutputStream stream = new FileOutputStream(target, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                        opened = true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        opened = false;
                }
                return opened;
        }
165
        
166
        /**
         * Advances {@code parser} until the closing tag of teiHeader has been read,
         * or until the end of the document when there is no such tag.
         */
        private void GoToText()
        {
                int current = parser.next();
                while (current != XMLStreamConstants.END_DOCUMENT) {
                        boolean closing = (current == XMLStreamConstants.END_ELEMENT);
                        if (closing && parser.getLocalName().equals("teiHeader")) {
                                return;
                        }
                        current = parser.next();
                }
        }
178
        
179
        /**
         * Converts the TEI-TXM document read by {@code parser} into CWB input (WTC)
         * and writes it to the given file (appending when the file already exists).
         *
         * Everything up to the end of teiHeader is skipped. The rest is wrapped in a
         * txmcorpus structure. The text and div elements and the numbered structures
         * (p, lg, l, q, said, foreign, stage, sp, speaker) are copied as tags. Each
         * captured w element becomes one tab-separated line: form, the ana values,
         * word id, page-break count, line-break count, orig, sic, abbr.
         *
         * Inside a choice only the words of corr, reg and expan are written; words
         * inside fw (outside a choice) are not written either.
         *
         * @param dirPathName the directory of the WTC file
         * @param fileName the name of the WTC file
         * @return true if the whole document was converted; false if the parser or
         *         the output is unavailable, or if reading or writing failed
         */
        private boolean transfomFileWtc(String dirPathName, String fileName)
        {
                if (parser == null) {
                        // the constructor already reported why the reader could not be created
                        System.out.println("No XML parser available for " + url);
                        return false;
                }

                // structures written as <name id="N">, N being a counter per element name
                List<String> numberedStructures = ["p", "lg", "l", "q", "said", "foreign", "stage", "sp", "speaker"];
                Map<String, Integer> nextId = new HashMap<String, Integer>();
                for (String structure : numberedStructures)
                        nextId.put(structure, 0);

                // one entry per open div: true when its opening tag was written, so that
                // closing tags stay balanced even with nested or skipped divs
                ArrayDeque<Boolean> openDivs = new ArrayDeque<Boolean>();

                String vAna = "";
                String vForm = "";
                String wordid = "";
                String vOrig = "";
                String vSic = "";
                String vAbbr = "";
                String divtype = "";

                int pb_id = 0;
                int lb_id = 0;

                boolean captureword = false;

                boolean flagchoice = false;
                boolean flagcorr = false;
                boolean flagsic = false;
                boolean flagreg = false;
                boolean flagexpan = false;
                boolean flagorig = false;
                boolean flagabbr = false;
                boolean flagfw = false;

                boolean flagForm = false;
                boolean flagAna = false;

                boolean success = false;
                try
                {
                        // createOutput() prints the reason of its failure
                        if (!createOutput(dirPathName, fileName))
                                return false;

                        this.GoToText();
                        output.write("<txmcorpus lang=\""+lang+"\">\n");

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                if (event == XMLStreamConstants.START_ELEMENT)
                                {
                                        String name = parser.getLocalName();
                                        switch (name)
                                        {
                                                case "text":
                                                        captureword = true;
                                                        output.write("<text id=\""+text+"\" type=\""+parser.getAttributeValue(null,"type")+"\" base=\""+base+"\" project=\""+project+"\">\n");
                                                        break;

                                                case "div":
                                                        divtype = parser.getAttributeValue(null,"type");
                                                        // divs without type and the "toa"/"toc" divs are not copied
                                                        if (divtype != null && divtype != "toa" && divtype != "toc") {
                                                                output.write("<div id=\""+parser.getAttributeValue(null,"n")+"\" type=\""+divtype+"\">\n");
                                                                openDivs.addLast(Boolean.TRUE);
                                                        } else {
                                                                openDivs.addLast(Boolean.FALSE);
                                                        }
                                                        break;

                                                case "fw":
                                                        flagfw = true;
                                                        break;
                                                case "choice":
                                                        flagchoice = true;
                                                        break;
                                                case "corr":
                                                        flagcorr = true;
                                                        break;
                                                case "reg":
                                                        flagreg = true;
                                                        break;
                                                case "expan":
                                                        flagexpan = true;
                                                        break;
                                                case "orig":
                                                        flagorig = true;
                                                        vOrig = "";
                                                        break;
                                                case "sic":
                                                        flagsic = true;
                                                        vSic = "";
                                                        break;
                                                case "abbr":
                                                        flagabbr = true;
                                                        vAbbr = "";
                                                        break;

                                                case "w":
                                                        // keep the value of the last attribute whose local name is "id"
                                                        for (int i = 0 ; i < parser.getAttributeCount(); i++)
                                                                if (parser.getAttributeLocalName(i).equals("id"))
                                                                        wordid = parser.getAttributeValue(i);
                                                        break;

                                                case "form":
                                                        flagForm = true;
                                                        vForm = "";
                                                        vAna = "";
                                                        break;

                                                case "ana":
                                                        flagAna = true;
                                                        break;

                                                default:
                                                        if (numberedStructures.contains(name)) {
                                                                int id = nextId.get(name);
                                                                nextId.put(name, id + 1);
                                                                output.write("<"+name+" id=\""+id+"\">\n");
                                                        }
                                        }
                                }
                                else if (event == XMLStreamConstants.END_ELEMENT)
                                {
                                        String name = parser.getLocalName();
                                        switch (name)
                                        {
                                                case "body":
                                                        captureword = false;
                                                        break;

                                                case "text":
                                                        output.write("</text>\n");
                                                        break;

                                                case "div":
                                                        // close only the divs whose opening tag was written
                                                        if (openDivs.pollLast())
                                                                output.write("</div>\n");
                                                        break;

                                                case "pb":
                                                        pb_id++;
                                                        break;

                                                case "lb":
                                                        lb_id++;
                                                        break;

                                                case "fw":
                                                        flagfw = false;
                                                        break;
                                                case "choice":
                                                        flagchoice = false;
                                                        break;
                                                case "corr":
                                                        flagcorr = false;
                                                        break;
                                                case "reg":
                                                        flagreg = false;
                                                        break;
                                                case "expan":
                                                        flagexpan = false;
                                                        break;
                                                // NOTE(review): orig/sic/abbr are cleared as soon as their element
                                                // closes and their own words are never written, so these three
                                                // columns look to be empty in practice - confirm what is intended
                                                case "orig":
                                                        flagorig = false;
                                                        vOrig = "";
                                                        break;
                                                case "sic":
                                                        flagsic = false;
                                                        vSic = "";
                                                        break;
                                                case "abbr":
                                                        flagabbr = false;
                                                        vAbbr = "";
                                                        break;

                                                case "w":
                                                        if (captureword)
                                                        {
                                                                boolean keep;
                                                                if (flagchoice)
                                                                        // same precedence as before: corr, reg, then sic (skipped), then expan
                                                                        keep = flagcorr || flagreg || (!flagsic && flagexpan);
                                                                else
                                                                        keep = !flagfw;

                                                                // vAna already ends with a tab (added when each ana closes)
                                                                if (keep)
                                                                        output.write( vForm +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\n");
                                                        }
                                                        vAna = "";
                                                        vForm = "";
                                                        break;

                                                case "form":
                                                        flagForm = false;
                                                        break;

                                                case "ana":
                                                        vAna += "\t";
                                                        flagAna = false;
                                                        break;

                                                default:
                                                        if (numberedStructures.contains(name))
                                                                output.write("</"+name+">\n");
                                        }
                                }
                                else if (event == XMLStreamConstants.CHARACTERS)
                                {
                                        String chars = parser.getText().trim();
                                        if (flagForm)
                                                vForm += chars;
                                        if (flagAna)
                                                vAna += chars;
                                        if (flagsic)
                                                vSic += chars;
                                        if (flagorig)
                                                vOrig += chars;
                                        if (flagabbr)
                                                vAbbr += chars;
                                }
                        }
                        output.write("</txmcorpus>");
                        // closed here so that a failing flush is reported by the catch below
                        output.close();
                        success = true;
                }
                catch (XMLStreamException ex) {
                        System.out.println(ex);
                }
                catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
                finally {
                        // release every resource, whatever happened above
                        try {
                                if (output != null)
                                        output.close();
                        } catch (IOException ignored) {
                                // already reported above when it matters
                        }
                        try {
                                parser.close();
                        } catch (XMLStreamException ignored) {
                                // the reader is being discarded
                        }
                        try {
                                if (inputData != null)
                                        inputData.close();
                        } catch (IOException ignored) {
                                // the stream is being discarded
                        }
                }

                return success;
        }
541
        
542
        
543
        /**
         * Builds the "bvh" CWB corpus from the TEI-TXM files of a binary directory.
         *
         * The files of the "txm" sub-directory are converted, in sorted order, into
         * the single file wtc/bvh.wtc, which is then given to cwb-encode and
         * cwb-makeall. The "wtc" and "data" sub-directories are deleted and
         * recreated first; "registry" is created if needed.
         *
         * @param rootDirFile the directory containing the "txm" directory of TEI-TXM files
         * @return true if successful; false if the CWB path, the directory or the
         *         "txm" listing is unavailable, or if the CWB step failed
         */
        public boolean run(File rootDirFile) 
        {
                String rootDir = rootDirFile.getAbsolutePath();

                if (cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if (!new File(cwbLoc).exists())
                {
                        println ("CWB path error: "+cwbLoc)
                        return false;
                }
                if (!new File(rootDir).exists())
                {
                        println ("binary directory does not exists: "+rootDir)
                        return false;
                }

                // listFiles() returns null when "txm" is missing or unreadable:
                // check it before deleting any previous result
                File txmDir = new File(rootDirFile, "txm");
                File[] listing = txmDir.listFiles();
                if (listing == null)
                {
                        println ("cannot list the TEI-TXM directory: "+txmDir)
                        return false;
                }
                List<File> files = new ArrayList<File>(Arrays.asList(listing));
                Collections.sort(files);

                // cleaning & preparing
                new File(rootDir, "wtc").deleteDir();
                new File(rootDir, "wtc").mkdir();
                new File(rootDir, "data").deleteDir();
                new File(rootDir, "data").mkdir();
                new File(rootDir, "registry").mkdir();

                //1- Transform into WTC file
                for (File f : files)
                {
                        if (!f.isFile())
                        {
                                println("skip "+f+": not a regular file")
                                continue;
                        }

                        println("process file "+f)
                        // text name = file name without its extension
                        String fileName = f.getName();
                        int dot = fileName.lastIndexOf(".");
                        String txtname = (dot > 0) ? fileName.substring(0, dot) : fileName;

                        def builder = new compiler(f.toURI().toURL(), txtname, "bvh", "default");
                        builder.setLang lang
                        // best effort: a file that fails is reported and the others are still processed
                        if (!builder.transfomFileWtc(rootDir+"/wtc","bvh.wtc"))
                                println("Warning: the conversion of "+f+" did not complete")
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);
                String[] pAttributes = ["frpos","frlemme","afrpos","afrlemme","id","page","line","orig","sic","abbr"];
                String[] sAttributes = ["txmcorpus:0+lang", "text:2+id+type+base+project","div:2+id+type","p:0+id","q:0+id","said:0+id","foreign:0+id","lg:0+id","l:0+id","sp:0+id","speaker:0+id","stage:0+id"];
                try {
                        String regPath = rootDir + "/registry/"+"bvh"
                        cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(), rootDir + "/data", rootDir + "/wtc/"+"bvh.wtc", regPath, pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), "BVH", rootDir + "/registry");

                } catch (Exception ex) {System.out.println(ex); return false}

                System.out.println("Done.") 

                return true;
        }
620
        
621
        /**
         * Turns the debug flag on; run() passes it to the cwb-encode and
         * cwb-makeall wrappers.
         */
        public void setDebug()
        {
                debug = true;
        }
628
        
629
        /**
         * Test entry point: compiles the corpus found in xml/bvh under the user's
         * home directory, using the CWB binaries of TXM/cwb/bin under the same
         * directory.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                // "~" is expanded by shells, not by java.io.File: resolve the home directory explicitly
                File home = new File(System.getProperty("user.home"));
                File dir = new File(home, "xml/bvh");
                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getPath());
                c.run(dir);
        }
642
}