Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / RGAQCJ / compiler.groovy @ 1000

History | View | Annotate | Download (14.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.RGAQCJ
29

    
30

    
31

    
32
import org.txm.importer.cwb.CwbEncode
33
import org.txm.importer.cwb.CwbMakeAll
34
import org.txm.scripts.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.importer.scripts.xmltxm.BuildTTSrc;
37
import org.txm.importer.scripts.xmltxm.*;
38
import org.txm.utils.treetagger.TreeTagger;
39

    
40
import javax.xml.stream.*;
41
import java.net.URL;
42
import java.io.File;
43
import java.util.HashMap;
44
import java.util.List;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
48
 * The Class compiler.
49
 */
50
class compiler 
51
{
52
        
53
        /** Debug switch; run() forwards it to the CWB encoder and indexer. */
        private boolean debug = false;

        /** Input stream opened on the source URL by the constructor. */
        private def inputData;

        /** StAX factory used to create the parser. */
        private def factory;

        /** StAX reader positioned on the source document. */
        private XMLStreamReader parser;

        /** The dir (not used by the code of this class). */
        private def dir;

        /** Writer of the CQP file, set by createOutput(). */
        private def output;

        /** Location of the source document. */
        private def url;

        /** The anahash (not used by the code of this class). */
        private HashMap<String,String> anahash = new HashMap<String,String>();

        /** Value written in the "id" attribute of the text structure. */
        String text = "";

        /** Value written in the "base" attribute of the text structure. */
        String base = "";

        /** Value written in the "project" attribute of the text structure. */
        String project = "";

        /** Language written in the "lang" attribute of the txmcorpus structure. */
        private String lang = "fr";
88
        
89
        /**
         * Creates a compiler that is not bound to any source document.
         * Such an instance is the one used to call run().
         */
        public compiler()
        {
        }
94
        
95
        /**
         * Instantiates a new compiler bound to one source document and opens
         * a StAX reader on it.
         *
         * Failures are reported on standard output and leave the parser unset;
         * they are not rethrown.
         *
         * @param url the location of the XML document to convert
         * @param text the value of the "id" attribute of the text structure
         * @param base the value of the "base" attribute of the text structure
         * @param project the value of the "project" attribute of the text structure
         */
        public compiler(URL url, String text, String base, String project)
        {
            this.text = text;
            this.base = base;
            this.project = project;
            this.url = url;
            try {
                inputData = url.openStream();

                factory = XMLInputFactory.newInstance();
                parser = factory.createXMLStreamReader(inputData);
            } catch (XMLStreamException ex) {
                System.out.println(ex);
                // The stream was opened but no reader will ever consume it:
                // release it instead of leaking the file handle.
                try {
                    if (inputData != null) inputData.close();
                } catch (IOException ignored) {
                    // nothing more can be done for a stream that fails to close
                }
                inputData = null;
            } catch (IOException ex) {
                // Name the source and the cause so that the failure can be diagnosed.
                System.out.println("IOException while parsing " + url + ": " + ex);
            }
        }
120
        
121
        /**
         * Sets the language of the corpus, written in the "lang" attribute
         * of the txmcorpus structure.
         *
         * @param lang the language code
         * @return the method is untyped, so Groovy returns the value of its
         *         last expression (the assigned language)
         */
        public setLang(String lang)
        {
            this.lang = lang;
        }
131
        
132
        /**
         * Opens the UTF-8 writer of the CQP file and stores it in the output field.
         * An existing file is opened in append mode, a missing one is created.
         *
         * @param dirPathName the directory of the CQP file
         * @param fileName the name of the CQP file
         * @return true if the writer was opened, false if opening failed
         *         (the error message is printed on standard output)
         */
        private boolean createOutput(String dirPathName, String fileName)
        {
            boolean created;
            try {
                File target = new File(dirPathName, fileName);
                // append only when there is already something to append to
                boolean append = target.exists();
                output = new OutputStreamWriter(new FileOutputStream(target, append), "UTF-8");
                created = true;
            } catch (Exception e) {
                System.out.println(e.getLocalizedMessage());
                created = false;
            }
            return created;
        }
149
        
150
        /**
         * Advances the parser just past the closing tag of the teiHeader element,
         * or to the end of the document when there is no such element.
         */
        private void GoToText()
        {
            while (true) {
                int event = parser.next();
                if (event == XMLStreamConstants.END_DOCUMENT) {
                    return;
                }
                if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
                    return;
                }
            }
        }
162
        
163
        /**
         * Transforms the document opened by the constructor into CQP input and
         * writes it to the given file (appending when the file already exists).
         *
         * The teiHeader is skipped by the parser; the text metadata is read
         * through XPath and written as attributes of the text structure.
         * The text, p, ab, q, front, body and back elements are copied as tags.
         * Each w element found outside a choice or fw element gives one line:
         * form, ana value(s), word id, page id, line id, separated by tabs.
         * Each choice element gives one line built with the corr form, else the
         * reg form, else the expan form, followed by the usual columns and by
         * the orig, sic and abbr forms.
         *
         * The writer, the parser and the input stream are always closed.
         *
         * @param dirPathName the directory of the CQP file
         * @param fileName the name of the CQP file
         * @return true if the whole document was converted, false if the output
         *         could not be opened, no parser is available or an XML or I/O
         *         error occurred
         */
        public boolean transfomFileCqp(String dirPathName, String fileName)
        {
            // createOutput() reports failure through its return value; going on
            // without a writer or a parser could only end in a NullPointerException.
            if (!createOutput(dirPathName, fileName) || parser == null) {
                releaseResources();
                return false;
            }

            String vAna = "";
            String vForm = "";
            String wordid = "";

            int p_id = 0;
            int q_id = 0;
            int body_id = 0;
            int front_id = 0;
            int back_id = 0;
            // A plain counter: "++" on a String moves to the next character,
            // so a String counter would follow "9" with ":" instead of "10".
            int ab_id = 0;
            String lb_id = "0";
            int lb_count = 0;
            String pb_id = "0";
            int pb_count = 0;

            // true between <text> and </text>: only these words are written
            boolean captureword = false;

            // forms collected inside the current choice element
            String vExpan = "";
            String vCorr = "";
            String vReg = "";
            String vOrig = "";
            String vSic = "";
            String vAbbr = "";

            boolean flagForm = false;
            boolean flagAna = false;

            boolean flagchoice = false;
            boolean flagcorr = false;
            boolean flagsic = false;
            boolean flagreg = false;
            boolean flagexpan = false;
            boolean flagorig = false;
            boolean flagabbr = false;
            boolean flagfw = false;

            boolean success = false;
            try {
                File xpathfile = new File(url.getFile());
                String titreId = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='reference']/text()");
                String auteur = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/text()");
                String datecompo = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when");
                String ssiecle = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo_sous_siecle']/@n");
                String domaine = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/tei:domain/@type");
                String genre = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/@n");
                String formeRef = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textClass/tei:catRef/@target[contains(.,'forme')]");
                // The first character of the target is dropped (presumably the '#'
                // of a pointer - to be confirmed); a missing target must not abort
                // the conversion.
                String forme = formeRef ? formeRef.substring(1) : "";
                String dialecte = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:region[@type='dialecte_auteur']/text()");

                this.GoToText();
                output.write("<txmcorpus lang=\""+lang+"\">\n");

                // hasNext() also covers a document without teiHeader, which
                // GoToText() has already read up to its end.
                while (parser.hasNext()) {
                    int event = parser.next();
                    switch (event) {
                        case XMLStreamConstants.START_ELEMENT:
                            switch (parser.getLocalName()) {
                                case "text":
                                    output.write("<text id=\""+text+"\"" +
                                            " titre=\""+titreId+"\"" +
                                            " auteur=\""+auteur+"\"" +
                                            " datecompo=\""+datecompo+"\"" +
                                            " ssiecle=\""+ssiecle+"\"" +
                                            " domaine=\""+domaine+"\"" +
                                            " genre=\""+genre+"\"" +
                                            " forme=\""+forme+"\"" +
                                            " dialecte=\""+dialecte+"\"" +
                                            " base=\""+base+"\"" +
                                            " project=\""+project+"\">\n");
                                    captureword = true;
                                    break;

                                case "div":
                                    // div elements are not written
                                    break;

                                case "p":
                                    output.write("<p n=\""+(p_id++)+"\">\n");
                                    break;
                                case "ab":
                                    output.write("<ab n=\""+(ab_id++)+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
                                    break;
                                case "q":
                                    output.write("<q n=\""+(q_id++)+"\">\n");
                                    break;
                                case "front":
                                    output.write("<front n=\""+(front_id++)+"\">\n");
                                    break;
                                case "body":
                                    output.write("<body n=\""+(body_id++)+"\">\n");
                                    break;
                                case "back":
                                    output.write("<back n=\""+(back_id++)+"\">\n");
                                    break;

                                case "lb":
                                    // use the declared number, else our own count
                                    String lbN = parser.getAttributeValue(null,"n");
                                    lb_id = (lbN != null) ? lbN : ""+(lb_count++);
                                    break;
                                case "pb":
                                    String pbN = parser.getAttributeValue(null,"n");
                                    pb_id = (pbN != null) ? pbN : ""+(pb_count++);
                                    break;

                                case "fw":
                                    // words found inside fw are not written (see the "w" end tag)
                                    flagfw = true;
                                    break;

                                case "choice":
                                    flagchoice = true;
                                    break;
                                case "corr":
                                    flagcorr = true;
                                    vCorr = "";
                                    break;
                                case "reg":
                                    flagreg = true;
                                    vReg = "";
                                    break;
                                case "expan":
                                    flagexpan = true;
                                    vExpan = "";
                                    break;
                                case "orig":
                                    flagorig = true;
                                    vOrig = "";
                                    break;
                                case "sic":
                                    flagsic = true;
                                    vSic = "";
                                    break;
                                case "abbr":
                                    flagabbr = true;
                                    vAbbr = "";
                                    break;

                                case "w":
                                    // matched on the local name, whatever the attribute prefix
                                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                        if (parser.getAttributeLocalName(i).equals("id")) {
                                            wordid = parser.getAttributeValue(i);
                                            break;
                                        }
                                    }
                                    vAna = "";
                                    vForm = "";
                                    break;
                                case "form":
                                    flagForm = true;
                                    vForm = "";
                                    vAna = "";
                                    break;

                                case "ana":
                                    flagAna = true;
                                    break;
                            }
                            break;

                        case XMLStreamConstants.END_ELEMENT:
                            switch (parser.getLocalName()) {
                                case "div":
                                    break;

                                case "text":
                                    output.write("</text>\n");
                                    captureword = false;
                                    break;
                                case "p":
                                    output.write("</p>\n");
                                    break;
                                case "ab":
                                    output.write("</ab>\n");
                                    break;
                                case "q":
                                    output.write("</q>\n");
                                    break;
                                case "front":
                                    output.write("</front>\n");
                                    break;
                                case "body":
                                    output.write("</body>\n");
                                    break;
                                case "back":
                                    output.write("</back>\n");
                                    break;

                                case "fw":
                                    flagfw = false;
                                    break;

                                case "choice":
                                    // preferred form: correction, then regularisation, then expansion
                                    String chosen = vCorr ?: (vReg ?: vExpan);
                                    if (chosen) {
                                        output.write( chosen +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\n");
                                    }
                                    flagchoice = false;
                                    // The alternatives are only reset here: resetting them on
                                    // their own end tag would empty them before this line is built.
                                    vCorr = "";
                                    vReg = "";
                                    vExpan = "";
                                    vOrig = "";
                                    vSic = "";
                                    vAbbr = "";
                                    break;
                                case "corr":
                                    flagcorr = false;
                                    break;
                                case "reg":
                                    flagreg = false;
                                    break;
                                case "expan":
                                    flagexpan = false;
                                    break;
                                case "orig":
                                    flagorig = false;
                                    break;
                                case "sic":
                                    flagsic = false;
                                    break;
                                case "abbr":
                                    flagabbr = false;
                                    break;

                                case "w":
                                    // words of a choice are written at </choice>, words of fw never
                                    if (captureword && !flagchoice && !flagfw) {
                                        output.write( vForm +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\n");
                                    }
                                    break;

                                case "form":
                                    flagForm = false;
                                    break;

                                case "ana":
                                    // each ana value is followed by a tab, so vAna already
                                    // ends with the column separator when it is written
                                    vAna += "\t";
                                    flagAna = false;
                                    break;
                            }
                            break;

                        case XMLStreamConstants.CHARACTERS:
                            String chars = parser.getText().trim();
                            if (flagAna) {
                                vAna += chars;
                            }
                            if (flagForm) {
                                vForm += chars;
                                if (flagchoice) {
                                    if (flagsic) vSic += chars;
                                    if (flagorig) vOrig += chars;
                                    if (flagabbr) vAbbr += chars;
                                    if (flagcorr) vCorr += chars;
                                    if (flagreg) vReg += chars;
                                    if (flagexpan) vExpan += chars;
                                }
                            }
                            break;
                    }
                }
                output.write("</txmcorpus>");
                // flush here so that a write failure is reported as a failure
                // and not lost in the best-effort close below
                output.flush();
                success = true;
            }
            catch (XMLStreamException ex) {
                System.out.println(ex);
            }
            catch (IOException ex) {
                System.out.println("IOException while parsing " + url + ": " + ex);
            }
            finally {
                releaseResources();
            }

            return success;
        }

        /**
         * Closes the CQP writer, the XML parser and the input stream of the
         * document. Close failures are ignored: the outcome of the conversion
         * is already known when this method is called.
         */
        private void releaseResources()
        {
            try {
                if (output != null) output.close();
            } catch (IOException ignored) { }
            try {
                if (parser != null) parser.close();
            } catch (XMLStreamException ignored) { }
            // closing the StAX reader does not close the stream it reads from
            try {
                if (inputData != null) inputData.close();
            } catch (IOException ignored) { }
        }
497
        
498
        
499
        
500
        /**
         * Builds the BFM3TT CWB corpus from the documents of a binary directory.
         *
         * The "cqp" and "data" directories of rootDirFile are emptied, every file
         * of its "txm" directory is converted and appended to cqp/bfm3tt.cqp, then
         * the CQP file is encoded and indexed with the CWB tools.
         *
         * @param rootDirFile the binary directory; it must contain a "txm" directory
         * @return true if successful; false if the CWB executables are not
         *         available, the directory does not exist, there is no source
         *         file, or the CWB tools raised an exception
         */
        public boolean run(File rootDirFile)
        {
            String rootDir = rootDirFile.getAbsolutePath();

            if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                println ("Error: CWB executables not well set.")
                return false;
            }
            if (!rootDirFile.exists()) {
                println ("binary directory does not exists: "+rootDir)
                return false;
            }

            // cleaning & preparing: deleting the cqp directory also removes
            // the bfm3tt.cqp file of a previous run
            new File(rootDir, "cqp").deleteDir();
            new File(rootDir, "cqp").mkdir();
            new File(rootDir, "data").deleteDir();
            new File(rootDir, "data").mkdir();
            new File(rootDir, "registry").mkdir();

            // listFiles() returns null when the directory is missing or unreadable
            File txmDir = new File(rootDirFile, "txm");
            File[] files = txmDir.listFiles();
            if (files == null || files.length == 0) {
                println ("no source file found in: "+txmDir)
                return false;
            }
            // listFiles() guarantees no order: sort so that the texts always
            // appear in the same order in the corpus
            Arrays.sort(files);

            //1- Transform into CQP file
            for (File f : files) {
                if (!f.isFile()) {
                    println("skipping "+f+": not a regular file")
                    continue;
                }
                println("process file "+f)
                // File.toURL() is kept on purpose: transfomFileCqp() rebuilds a
                // File from url.getFile(), which needs the path left unescaped.
                def builder = new compiler(f.toURL(), f.getName(), "bfm3tt", "default");
                builder.setLang(lang);
                if (!builder.transfomFileCqp(rootDir+"/cqp", "bfm3tt.cqp")) {
                    // best effort: report the failure and go on with the other files
                    println("Warning: the conversion of "+f+" failed")
                }
            }

            //2- Import into CWB
            CwbEncode cwbEn = new CwbEncode();
            cwbEn.setDebug(debug);
            CwbMakeAll cwbMa = new CwbMakeAll();
            cwbMa.setDebug(debug);
            String[] pAttributes = ["pos","id","pb","lb"];
            String[] sAttributes = ["txmcorpus:0+lang","text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project","front:0+n","body:0+n","ab:0+n+rend","div:0+id+type","q:1+n","p:0+n","back:0+n"];
            try {
                cwbEn.run(rootDir + "/data", rootDir + "/cqp/"+"bfm3tt.cqp", rootDir + "/registry/"+"bfm3tt", pAttributes, sAttributes);
                cwbMa.run("BFM3TT", rootDir + "/registry");
            } catch (Exception ex) {
                System.out.println(ex);
                return false;
            }

            System.out.println("Done.")

            return true;
        }
565
        
566
        /**
         * Switches the debug mode on. run() forwards the flag to the CWB
         * encoder and indexer. There is no way to switch it off again.
         */
        public void setDebug()
        {
            debug = true;
        }
573
        
574
        /**
         * The main method: runs the import in debug mode.
         *
         * @param args optional; args[0] is the binary directory to process,
         *             default is the xml/bfm3tt directory of the user home
         */
        public static void main(String[] args)
        {
            // java.io.File does not expand "~": build the default directory
            // from the user.home property instead.
            File dir = (args != null && args.length > 0) ?
                    new File(args[0]) :
                    new File(System.getProperty("user.home"), "xml/bfm3tt");
            def c = new compiler();
            c.setDebug();
            // No CWB path is set here: this class defines no setCwbPath() method,
            // and run() checks the availability of the executables by itself.
            c.run(dir);
        }
587
}