Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / RGAQCJ / compiler.groovy @ 187

History | View | Annotate | Download (14.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.RGAQCJ
29

    
30

    
31

    
32
import org.txm.importer.cwb.CwbEncode
33
import org.txm.importer.cwb.CwbMakeAll
34
import org.txm.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.scripts.teitxm.BuildTTSrc;
37
import org.txm.scripts.teitxm.*;
38
import org.txm.utils.treetagger.TreeTagger;
39

    
40
import javax.xml.stream.*;
41
import java.net.URL;
42
import java.io.File;
43
import java.util.HashMap;
44
import java.util.List;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
48
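 * Converts the TEI XML files found in the "txm" sub-directory of a corpus
 * directory into a single CWB "wtc" file, then indexes that file with
 * cwb-encode and cwb-makeall.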
49
 */
50
class compiler 
51
{
52
        
53
        /** Debug switch handed to the CWB tool wrappers in run(). */
        private boolean debug = false;

        /** Stream opened on the source URL by the constructor. */
        private def inputData;

        /** StAX factory used to create the parser. */
        private def factory;

        /** Streaming reader over the source document. */
        private XMLStreamReader parser;

        /** Not referenced by the visible code of this class. */
        private def dir;

        /** Writer on the wtc file, opened by createOutput(). */
        private def output;

        /** Location of the source document. */
        private def url;

        /** Not referenced by the visible code of this class. */
        private HashMap<String, String> anahash = new HashMap<String, String>();

        /** Value written to the id attribute of the text structure. */
        String text = "";

        /** Value written to the base attribute of the text structure. */
        String base = "";

        /** Value written to the project attribute of the text structure. */
        String project = "";

        /** Directory in which the cwb-encode and cwb-makeall executables are looked up. */
        String cwbLoc;

        /** Value written to the lang attribute of the txmcorpus structure. */
        private String lang = "fr";
91
        
92
        /**
         * Creates a compiler with no source document attached.
         * This is the form main() uses before calling run().
         */
        public compiler()
        {
        }
97
        
98
        /**
         * Creates a compiler bound to one source document and opens a StAX
         * reader on it.
         *
         * Failures are reported on standard output and leave the parser unset;
         * no exception is propagated for XMLStreamException or IOException.
         *
         * @param url the location of the XML document to read
         * @param text the value written to the id attribute of the text structure
         * @param base the value written to the base attribute of the text structure
         * @param project the value written to the project attribute of the text structure
         */
        public compiler(URL url, String text, String base, String project)
        {
                this.text = text;
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // The stream was opened but no reader will ever consume it: release it now
                        if (inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                        // nothing more can be done with a stream that fails to close
                                }
                                inputData = null;
                        }
                } catch (IOException ex) {
                        // Say which document failed and why instead of a bare message
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
        }
123
        
124
        /**
         * Sets the language code written to the lang attribute of the
         * txmcorpus structure.
         *
         * @param lang the language code
         * @return the value that was assigned
         */
        public setLang(String lang)
        {
                return (this.lang = lang);
        }
134
        
135
        /**
         * Records the directory in which the CWB executables are looked up.
         * A warning is printed on standard error when the path does not exist;
         * the path is stored in either case.
         *
         * @param path the directory containing the CWB executables
         */
        public void setCwbPath(String path)
        {
                File location = new File(path);
                boolean missing = !location.exists();
                if (missing) {
                        System.err.println("CWB Path : "+path+" does not exists");
                }
                this.cwbLoc = path;
        }
146
        
147
        /**
         * Opens the UTF-8 writer used for the wtc output.
         * When the target file already exists the writer appends to it.
         *
         * @param dirPathName the directory of the output file
         * @param fileName the name of the output file
         * @return true if the writer was opened, false if any exception occurred
         */
        private boolean createOutput(String dirPathName, String fileName)
        {
                boolean opened;
                try {
                        File target = new File(dirPathName, fileName);
                        boolean append = target.exists();
                        FileOutputStream stream = new FileOutputStream(target, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                        opened = true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        opened = false;
                }
                return opened;
        }
164
        
165
        /**
         * Advances the parser until the closing tag of teiHeader has been
         * read, or until the end of the document when there is none.
         */
        private void GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT)
                {
                        boolean closingTag = (event == XMLStreamConstants.END_ELEMENT);
                        if (closingTag && parser.getLocalName().equals("teiHeader"))
                        {
                                return;
                        }
                        event = parser.next();
                }
        }
177
        
178
        /**
         * Converts the document opened by the constructor to CWB "wtc" form
         * and appends the result to the file dirPathName/fileName.
         *
         * Text metadata is read from the teiHeader with XPath and written as
         * attributes of the text structure. The p, ab, q, front, body and back
         * elements are copied as numbered structures. Each w element read
         * inside text and outside a choice produces one line: form, the ana
         * values (each followed by a tab), word id, page id, line id. A choice
         * element produces one line built from its corr content, followed by
         * the orig, sic and abbr columns.
         *
         * The writer, the XML reader and the input stream are closed before
         * returning, whatever the outcome.
         *
         * @param dirPathName the directory of the wtc file
         * @param fileName the name of the wtc file
         * @return true if the whole document was converted and the output was
         *         closed without error; false if no parser or output was
         *         available or if an XMLStreamException or IOException occurred
         */
        public boolean transfomFileWtc(String dirPathName, String fileName)
        {
                boolean success = false;
                try
                {
                        // The constructor leaves parser unset when the document could not be opened
                        if (parser == null)
                        {
                                System.out.println("No XML parser available for " + url);
                                return false;
                        }
                        // Without a writer every later write would fail on a null reference
                        if (!createOutput(dirPathName, fileName))
                                return false;

                        String vAna = "";
                        String vForm = "";
                        String wordid = "";

                        int p_id = 0;
                        int q_id = 0;
                        int body_id = 0;
                        int front_id = 0;
                        int back_id = 0;
                        // Numeric counter: ++ on a Groovy String steps its last character ("9" becomes ":")
                        int ab_id = 0;
                        String lb_id = "0";
                        int lb_count = 0;
                        String pb_id = "0";
                        int pb_count = 0;

                        // true while the parser is inside the text element
                        boolean captureword = false;

                        String vExpan = "";
                        String vCorr = "";
                        String vReg = "";
                        String vOrig = "";
                        String vSic = "";
                        String vAbbr = "";

                        boolean flagForm = false;
                        boolean flagAna = false;

                        boolean flagchoice = false;
                        boolean flagcorr = false;
                        boolean flagsic = false;
                        boolean flagreg = false;
                        boolean flagexpan = false;
                        boolean flagorig = false;
                        boolean flagabbr = false;
                        boolean flagfw = false;

                        File xpathfile = new File(url.getFile());
                        String titreId = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='reference']/text()");
                        String auteur = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/text()");
                        String datecompo = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when");
                        String ssiecle = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo_sous_siecle']/@n");
                        String domaine = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/tei:domain/@type");
                        String genre = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/@n");
                        String formeRef = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textClass/tei:catRef/@target[contains(.,'forme')]");
                        // The first character of the target is dropped (presumably a leading '#' — TODO confirm);
                        // a missing or empty value must not abort the whole conversion
                        String forme = (formeRef != null && formeRef.length() > 0) ? formeRef.substring(1) : "";
                        String dialecte = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:region[@type='dialecte_auteur']/text()");

                        this.GoToText();
                        output.write("<txmcorpus lang=\""+lang+"\">\n");

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event)
                                {
                                        case XMLStreamConstants.START_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "text":
                                                        output.write("<text id=\""+text+"\"" +
                                                        " titre=\""+titreId+"\"" +
                                                        " auteur=\""+auteur+"\"" +
                                                        " datecompo=\""+datecompo+"\"" +
                                                        " ssiecle=\""+ssiecle+"\"" +
                                                        " domaine=\""+domaine+"\"" +
                                                        " genre=\""+genre+"\"" +
                                                        " forme=\""+forme+"\"" +
                                                        " dialecte=\""+dialecte+"\"" +
                                                        " base=\""+base+"\"" +
                                                        " project=\""+project+"\">\n");
                                                        captureword = true;
                                                        break;

                                                        case "div":
                                                        // div structures are not written
                                                        break;

                                                        case "p":
                                                        output.write("<p n=\""+(p_id++)+"\">\n");
                                                        break;
                                                        case "ab":
                                                        output.write("<ab n=\""+(ab_id++)+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
                                                        break;
                                                        case "q":
                                                        output.write("<q n=\""+(q_id++)+"\">\n");
                                                        break;
                                                        case "front":
                                                        output.write("<front n=\""+(front_id++)+"\">\n");
                                                        break;
                                                        case "body":
                                                        output.write("<body n=\""+(body_id++)+"\">\n");
                                                        break;
                                                        case "back":
                                                        output.write("<back n=\""+(back_id++)+"\">\n");
                                                        break;

                                                        case "lb":
                                                        // use the n attribute when present, otherwise a running counter
                                                        String lbN = parser.getAttributeValue(null,"n");
                                                        if (lbN != null)
                                                                lb_id = lbN;
                                                        else
                                                                lb_id = ""+(lb_count++);
                                                        break;
                                                        case "pb":
                                                        String pbN = parser.getAttributeValue(null,"n");
                                                        if (pbN != null)
                                                                pb_id = pbN;
                                                        else
                                                                pb_id = ""+(pb_count++);
                                                        break;

                                                        case "choice":
                                                        flagchoice = true;
                                                        break;
                                                        case "corr":
                                                        flagcorr = true;
                                                        vCorr = "";
                                                        break;
                                                        case "reg":
                                                        flagreg = true;
                                                        vReg = "";
                                                        break;
                                                        case "expan":
                                                        flagexpan = true;
                                                        vExpan = "";
                                                        break;
                                                        case "orig":
                                                        // was toggling flagreg (copy-paste), which left flagorig permanently false
                                                        flagorig = true;
                                                        vOrig = "";
                                                        break;
                                                        case "sic":
                                                        flagsic = true;
                                                        vSic = "";
                                                        break;
                                                        case "abbr":
                                                        // was toggling flagreg (copy-paste), which left flagabbr permanently false
                                                        flagabbr = true;
                                                        vAbbr = "";
                                                        break;

                                                        case "w":
                                                        // keep the previous id when the word carries none
                                                        for (int i = 0 ; i < parser.getAttributeCount(); i++)
                                                                if (parser.getAttributeLocalName(i).equals("id"))
                                                                {
                                                                        wordid = parser.getAttributeValue(i);
                                                                        break;
                                                                }
                                                        vAna = "";
                                                        vForm = "";
                                                        break;
                                                        case "form":
                                                        flagForm = true;
                                                        vForm = "";
                                                        vAna = "";
                                                        break;

                                                        case "ana":
                                                        flagAna = true;
                                                        break;
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "div":
                                                        break;

                                                        case "text":
                                                        output.write("</text>\n");
                                                        captureword = false;
                                                        break;
                                                        case "p":
                                                        output.write("</p>\n");
                                                        break;
                                                        case "ab":
                                                        output.write("</ab>\n");
                                                        break;
                                                        case "q":
                                                        output.write("</q>\n");
                                                        break;
                                                        case "front":
                                                        output.write("</front>\n");
                                                        break;
                                                        case "body":
                                                        output.write("</body>\n");
                                                        break;
                                                        case "back":
                                                        output.write("</back>\n");
                                                        break;

                                                        case "fw":
                                                        // NOTE(review): no START_ELEMENT branch sets flagfw, so words inside fw are
                                                        // written like any other word — confirm whether they were meant to be skipped.
                                                        flagfw = false;
                                                        break;

                                                        case "choice":
                                                        // one line per choice: first non-empty of corr, reg, expan
                                                        String chosen = vCorr;
                                                        if (chosen.isEmpty())
                                                                chosen = vReg;
                                                        if (chosen.isEmpty())
                                                                chosen = vExpan;
                                                        if (!chosen.isEmpty())
                                                                output.write( chosen +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\n");
                                                        flagchoice = false;
                                                        vCorr = "";
                                                        vSic = "";
                                                        break;
                                                        case "corr":
                                                        flagcorr = false;
                                                        break;
                                                        // NOTE(review): reg, expan, orig and abbr values are cleared at their own closing
                                                        // tag, i.e. before </choice> is reached, and vReg/vExpan are never filled from
                                                        // character data. Only corr/sic content therefore reaches the choice line.
                                                        // Left unchanged because fixing it alters the emitted tokens — confirm intent.
                                                        case "reg":
                                                        flagreg = false;
                                                        vReg = "";
                                                        break;
                                                        case "expan":
                                                        flagexpan = false;
                                                        vExpan = "";
                                                        break;
                                                        case "orig":
                                                        flagorig = false;
                                                        vOrig = "";
                                                        break;
                                                        case "sic":
                                                        flagsic = false;
                                                        break;
                                                        case "abbr":
                                                        flagabbr = false;
                                                        vAbbr = "";
                                                        break;

                                                        case "w":
                                                        // words inside a choice are written once, when the choice closes
                                                        if (captureword && !flagchoice && !flagfw)
                                                        {
                                                                output.write( vForm +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\n");
                                                        }
                                                        break;

                                                        case "form":
                                                        flagForm = false;
                                                        break;

                                                        case "ana":
                                                        // each ana value is a separate tab-terminated column
                                                        vAna += "\t";
                                                        flagAna = false;
                                                        break;
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (flagAna)
                                                {
                                                        vAna += parser.getText().trim();
                                                }

                                                if (flagForm)
                                                {
                                                        String chars = parser.getText().trim();
                                                        vForm += chars;
                                                        if (flagchoice)
                                                        {
                                                                if (flagsic)
                                                                        vSic += chars;
                                                                if (flagorig)
                                                                        vOrig += chars;
                                                                if (flagabbr)
                                                                        vAbbr += chars;
                                                                if (flagcorr)
                                                                        vCorr += chars;
                                                        }
                                                }
                                                break;
                                }
                        }
                        output.write("</txmcorpus>");
                        success = true;
                }
                catch (XMLStreamException ex)
                {
                        System.out.println(ex);
                }
                catch (IOException ex)
                {
                        System.out.println("IOException while parsing " + inputData + ": " + ex);
                }
                finally
                {
                        // Closing the writer flushes it: a failure here means the wtc file is incomplete
                        try
                        {
                                if (output != null)
                                        output.close();
                        }
                        catch (IOException ex)
                        {
                                System.out.println("IOException while closing output: " + ex);
                                success = false;
                        }
                        try
                        {
                                if (parser != null)
                                        parser.close();
                        }
                        catch (XMLStreamException ex)
                        {
                                System.out.println(ex);
                        }
                        // XMLStreamReader.close() does not close the underlying stream
                        try
                        {
                                if (inputData != null)
                                        inputData.close();
                        }
                        catch (IOException ex)
                        {
                                System.out.println(ex);
                        }
                }

                return success;
        }
512
        
513
        
514
        
515
        /**
         * Builds the CWB corpus "bfm3tt" from the XML files found in the "txm"
         * sub-directory of rootDirFile.
         *
         * The wtc and data sub-directories are emptied and re-created. Every
         * source file is then converted and appended to wtc/bfm3tt.wtc, and the
         * result is indexed with cwb-encode and cwb-makeall.
         *
         * @param rootDirFile the corpus directory; must contain a "txm" sub-directory
         * @return true if the corpus was encoded; false if the CWB path, the
         *         corpus directory or its "txm" sub-directory is missing, or if
         *         the CWB tools raised an exception
         */
        public boolean run(File rootDirFile)
        {
                String rootDir = rootDirFile.getAbsolutePath();

                if (cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if (!new File(cwbLoc).exists())
                {
                        println ("CWB path error: "+cwbLoc)
                        return false;
                }
                if (!new File(rootDir).exists())
                {
                        println ("binary directory does not exists: "+rootDir)
                        return false;
                }

                // listFiles() returns null when "txm" is missing or unreadable. Check before the
                // clean-up below so that a previous build is not wiped for nothing.
                File[] sources = new File(rootDirFile,"txm").listFiles();
                if (sources == null)
                {
                        println ("txm directory does not exists or cannot be read: "+new File(rootDirFile,"txm"))
                        return false;
                }
                // listFiles() gives no ordering guarantee: sort so the text order is reproducible
                Arrays.sort(sources);

                new File(rootDir+"/wtc/","bfm3tt.wtc").delete();//cleaning&preparing
                new File(rootDir,"/wtc/").deleteDir();
                new File(rootDir,"/wtc/").mkdir();
                new File(rootDir,"/data/").deleteDir();
                new File(rootDir,"/data/").mkdir();
                new File(rootDir,"registry/").mkdir();

                //1- Transform into WTC file
                for (File f : sources)
                {
                        if (!f.exists())
                        {
                                println("file "+f+ " does not exists")
                        }
                        else
                        {
                                println("process file "+f)
                                // f.toURL() is kept on purpose: transfomFileWtc rebuilds a File from
                                // url.getFile(), which would not decode the escapes added by toURI().toURL()
                                def builder = new compiler(f.toURL(),f.getName(),"bfm3tt","default");
                                builder.setLang(lang);
                                // each file is appended to the same wtc file; a failure is reported
                                // and the remaining files are still processed
                                if (!builder.transfomFileWtc(rootDir+"/wtc","bfm3tt.wtc"))
                                        println("failed to convert file "+f)
                        }
                }

                //2- Import into CWB
                def outDir = rootDir;
                def outDirTxm = rootDir;
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);
                String[] pAttributes = ["pos","id","pb","lb"];
                String[] sAttributes = ["txmcorpus:0+lang","text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project","front:0+n","body:0+n","ab:0+n+rend","div:0+id+type","q:1+n","p:0+n","back:0+n"];
                try
                {
                        cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(), outDirTxm + "/data", outDir + "/wtc/"+"bfm3tt.wtc", outDirTxm + "/registry/"+"bfm3tt",pAttributes, sAttributes);
                        cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), "BFM3TT", outDirTxm + "/registry");
                }
                catch (Exception ex)
                {
                        System.out.println(ex);
                        return false;
                }

                System.out.println("Done.")

                return true;
        }
584
        
585
        /**
         * Switches debug mode on; run() forwards the flag to the CWB tool
         * wrappers. There is no way to switch it off again.
         */
        public void setDebug()
        {
                debug = true;
        }
592
        
593
        /**
         * Command-line entry point: builds the corpus with debug mode on.
         *
         * Defaults are resolved against the user's home directory, because
         * java.io.File does not expand "~" (that is a shell feature).
         *
         * @param args optional: args[0] is the corpus directory (default
         *             <home>/xml/bfm3tt), args[1] is the CWB binaries
         *             directory (default <home>/TXM/cwb/bin)
         */
        public static void main(String[] args)
        {
                String home = System.getProperty("user.home");

                File dir = (args != null && args.length > 0)
                                ? new File(args[0])
                                : new File(home, "xml/bfm3tt");
                String cwbPath = (args != null && args.length > 1)
                                ? args[1]
                                : new File(home, "TXM/cwb/bin").getPath();

                def c = new compiler();
                c.setDebug();
                c.setCwbPath(cwbPath);
                c.run(dir);
        }
606
}