Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / corptef / compiler.groovy @ 1000

History | View | Annotate | Download (14.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.corptef
29

    
30

    
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.scripts.importer.*;
34
import org.txm.scripts.*;
35
import org.txm.importer.scripts.xmltxm.BuildTTSrc;
36
import org.txm.importer.scripts.xmltxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38

    
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import java.io.File;
42
import java.util.HashMap;
43
import java.util.List;
44

    
45
// TODO: Auto-generated Javadoc
46
/**
47
 * Compiles the XML files of the "corptef" corpus into a CQP source file and indexes it with the CWB tools.
48
 */
49
class compiler
50
{
51

    
52
        /** Debug switch handed over to the CWB wrappers started by run(). */
        private boolean debug = false

        /** Stream opened on the source URL by the constructor. */
        private def inputData

        /** StAX factory used to create the parser. */
        private def factory

        /** StAX reader positioned on the source document. */
        private XMLStreamReader parser

        /** Not used by the code of this class. */
        private def dir

        /** Writer on the CQP file, opened by createOutput(). */
        private def output

        /** Location of the source document. */
        private def url

        /** Not used by the code of this class. */
        private HashMap<String, String> anahash = new HashMap<String, String>()

        /** Value of the id attribute of the text element written to the CQP file. */
        String text = ""

        /** Value of the base attribute of the text element written to the CQP file. */
        String base = ""

        /** Value of the project attribute of the text element written to the CQP file. */
        String project = ""

        /** Language code written in the lang attribute of the txmcorpus element. */
        private String lang = "fr"
87

    
88
        /**
         * Creates a compiler that has no source document attached;
         * main() uses such an instance to call run().
         */
        public compiler()
        {
        }
93

    
94
        /**
         * Creates a compiler for one source document and prepares a StAX reader on it.
         * Stream and parser failures are only reported on standard output; the
         * reader then stays unset.
         *
         * @param url location of the XML document to read
         * @param text value written in the id attribute of the text element
         * @param base value written in the base attribute of the text element
         * @param project value written in the project attribute of the text element
         */
        public compiler(URL url, String text, String base, String project)
        {
            this.url = url
            this.project = project
            this.base = base
            this.text = text
            try {
                inputData = url.openStream()
                factory = XMLInputFactory.newInstance()
                parser = factory.createXMLStreamReader(inputData)
            }
            catch (XMLStreamException parseFailure) {
                System.out.println(parseFailure)
            }
            catch (IOException ioFailure) {
                System.out.println("IOException while parsing ")
            }
        }
119

    
120
        /**
         * Sets the language code written in the lang attribute of the txmcorpus element.
         *
         * @param lang the language code
         * @return the value that was assigned
         */
        public setLang(String lang)
        {
            return this.lang = lang
        }
130

    
131
        /**
         * Opens the UTF-8 writer used to produce the CQP file. When the file
         * already exists the writer appends to it, otherwise the file is created.
         *
         * @param dirPathName directory of the CQP file
         * @param fileName name of the CQP file
         * @return true when the writer was opened, false when opening failed
         */
        private boolean createOutput(String dirPathName, String fileName)
        {
            boolean opened = false
            try {
                File target = new File(dirPathName, fileName)
                boolean append = target.exists()
                FileOutputStream bytes = new FileOutputStream(target, append)
                output = new OutputStreamWriter(bytes, "UTF-8")
                opened = true
            }
            catch (Exception failure) {
                System.out.println(failure.getLocalizedMessage())
            }
            return opened
        }
148

    
149
        /**
         * Advances the parser until the end tag of the teiHeader element has been
         * read, or until the end of the document when there is no such element.
         */
        private void GoToText()
        {
            int event = parser.next()
            while (event != XMLStreamConstants.END_DOCUMENT) {
                boolean closing = (event == XMLStreamConstants.END_ELEMENT)
                if (closing && "teiHeader".equals(parser.getLocalName())) {
                    return
                }
                event = parser.next()
            }
        }
161

    
162
        /**
         * Converts the document opened by the constructor into CQP input and
         * writes it to the given file (appending when the file already exists).
         *
         * The text metadata are read with XPath from the TEI header, then the
         * part of the document that follows the header is streamed. Every word
         * produces one line made of tab separated values: form, analyses (each
         * one followed by a tab), word id, page reference, line reference,
         * orig, sic and abbr.
         *
         * The writer, the parser and the input stream are closed when the
         * method returns, whatever the outcome.
         *
         * @param dirPathName directory of the CQP file
         * @param fileName name of the CQP file
         * @return true if the document was entirely written, false when the
         *         parser or the output could not be opened or when reading or
         *         writing failed
         */
        public boolean transfomFileCqp(String dirPathName, String fileName)
        {
            try {
                // No parser means the constructor failed (it already reported why);
                // no writer means every later step would fail on a null output.
                if (parser == null || !createOutput(dirPathName, fileName)) {
                    return false
                }
                writeCorpus()
                return true
            }
            catch (XMLStreamException ex) {
                System.out.println(ex)
            }
            catch (IOException ex) {
                System.out.println("IOException while parsing " + inputData)
            }
            finally {
                closeResources()
            }
            return false
        }

        /**
         * Closes the writer, the parser and the input stream that are open.
         */
        private void closeResources()
        {
            for (def resource : [output, parser, inputData]) {
                if (resource == null) {
                    continue
                }
                try {
                    resource.close()
                }
                catch (Exception ignored) {
                    // best effort: a failing close must not hide the result of the conversion
                }
            }
        }

        /**
         * Reads the metadata of the text and writes the txmcorpus element with all its tokens.
         */
        private void writeCorpus()
        {
            // content of the word being read
            String vAna = ""
            String vForm = ""
            String wordid = ""

            // counters written in the n attribute of the structures
            int p_id = 0
            int q_id = 0
            int body_id = 0
            int front_id = 0
            int back_id = 0
            // int on purpose: "++" on a String only increments its last character ("9" becomes ":")
            int ab_id = 0

            // current line and page references, repeated on every token line
            String lb_id = "0"
            int lb_count = 0
            String pb_id = "0"
            int pb_count = 0

            boolean captureword = false

            // forms collected in the children of the current choice element
            String vExpan = ""
            String vCorr = ""
            String vReg = ""
            String vOrig = ""
            String vSic = ""
            String vAbbr = ""

            // "inside element X" markers
            boolean flagForm = false
            boolean flagAna = false
            boolean flagchoice = false
            boolean flagcorr = false
            boolean flagsic = false
            boolean flagreg = false
            boolean flagexpan = false
            boolean flagorig = false
            boolean flagabbr = false
            boolean flagfw = false

            // writes one token line; the closure reads the current values of the locals above
            def writeToken = { String form ->
                output.write(form + "\t" + vAna + wordid + "\t" + pb_id + "\t" + lb_id + "\t" + vOrig + "\t" + vSic + "\t" + vAbbr + "\n")
            }

            File xpathfile = new File(url.getFile())
            String titreId = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='reference']/text()")
            String auteur = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/text()")
            String datecompo = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when")
            String ssiecle = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo_sous_siecle']/@n")
            String domaine = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/tei:domain/@type")
            String genre = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/@n")
            String formeRef = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:profileDesc/tei:textClass/tei:catRef/@target[contains(.,'forme')]")
            // The first character of the reference is dropped (presumably a leading '#' - TODO confirm);
            // a missing or empty reference gives an empty value instead of an exception.
            String forme = formeRef ? formeRef.substring(1) : ""
            String dialecte = XPathResult.getXpathResponse(xpathfile, "tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:region[@type='dialecte_auteur']/text()")

            this.GoToText()
            output.write("<txmcorpus lang=\"" + lang + "\">\n")

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        switch (parser.getLocalName()) {
                            case "text":
                                output.write("<text id=\"" + text + "\"" +
                                        " titre=\"" + titreId + "\"" +
                                        " auteur=\"" + auteur + "\"" +
                                        " datecompo=\"" + datecompo + "\"" +
                                        " ssiecle=\"" + ssiecle + "\"" +
                                        " domaine=\"" + domaine + "\"" +
                                        " genre=\"" + genre + "\"" +
                                        " forme=\"" + forme + "\"" +
                                        " dialecte=\"" + dialecte + "\"" +
                                        " base=\"" + base + "\"" +
                                        " project=\"" + project + "\">\n")
                                captureword = true
                                break

                            case "div":
                                // div elements are not copied to the output
                                break

                            case "p":
                                output.write("<p n=\"" + (p_id++) + "\">\n")
                                break
                            case "ab":
                                output.write("<ab n=\"" + (ab_id++) + "\" rend=\"" + parser.getAttributeValue(null, "rend") + "\">\n")
                                break
                            case "q":
                                output.write("<q n=\"" + (q_id++) + "\">\n")
                                break
                            case "front":
                                output.write("<front n=\"" + (front_id++) + "\">\n")
                                break
                            case "body":
                                output.write("<body n=\"" + (body_id++) + "\">\n")
                                break
                            case "back":
                                output.write("<back n=\"" + (back_id++) + "\">\n")
                                break

                            case "lb":
                                // use the declared number when there is one, a running counter otherwise
                                String lbNumber = parser.getAttributeValue(null, "n")
                                if (lbNumber != null) {
                                    lb_id = lbNumber
                                } else {
                                    lb_id = "" + (lb_count++)
                                }
                                break
                            case "pb":
                                String pbNumber = parser.getAttributeValue(null, "n")
                                if (pbNumber != null) {
                                    pb_id = pbNumber
                                } else {
                                    pb_id = "" + (pb_count++)
                                }
                                break

                            case "fw":
                                // words read inside fw are not written (see the end of "w")
                                flagfw = true
                                break

                            case "choice":
                                flagchoice = true
                                break
                            case "corr":
                                flagcorr = true
                                vCorr = ""
                                break
                            case "reg":
                                flagreg = true
                                vReg = ""
                                break
                            case "expan":
                                flagexpan = true
                                vExpan = ""
                                break
                            case "orig":
                                flagorig = true
                                vOrig = ""
                                break
                            case "sic":
                                flagsic = true
                                vSic = ""
                                break
                            case "abbr":
                                flagabbr = true
                                vAbbr = ""
                                break

                            case "w":
                                for (int i = 0; i < parser.getAttributeCount(); i++) {
                                    if (parser.getAttributeLocalName(i).equals("id")) {
                                        wordid = parser.getAttributeValue(i)
                                    }
                                }
                                vAna = ""
                                vForm = ""
                                break
                            case "form":
                                flagForm = true
                                vForm = ""
                                vAna = ""
                                break
                            case "ana":
                                flagAna = true
                                break
                        }
                        break

                    case XMLStreamConstants.END_ELEMENT:
                        switch (parser.getLocalName()) {
                            case "div":
                                break

                            case "text":
                                output.write("</text>\n")
                                captureword = false
                                break
                            case "p":
                                output.write("</p>\n")
                                break
                            case "ab":
                                output.write("</ab>\n")
                                break
                            case "q":
                                output.write("</q>\n")
                                break
                            case "front":
                                output.write("</front>\n")
                                break
                            case "body":
                                output.write("</body>\n")
                                break
                            case "back":
                                output.write("</back>\n")
                                break

                            case "fw":
                                flagfw = false
                                break

                            case "choice":
                                // one token per choice: the corrected, regularised or expanded form,
                                // in that order of preference; nothing is written when all are empty
                                if (vCorr != "") {
                                    writeToken(vCorr)
                                } else if (vReg != "") {
                                    writeToken(vReg)
                                } else if (vExpan != "") {
                                    writeToken(vExpan)
                                }
                                flagchoice = false
                                // The alternatives are only cleared here: clearing them at their own
                                // end tag would empty them before the token line is written.
                                vCorr = ""
                                vSic = ""
                                vReg = ""
                                vOrig = ""
                                vExpan = ""
                                vAbbr = ""
                                break
                            case "corr":
                                flagcorr = false
                                break
                            case "reg":
                                flagreg = false
                                break
                            case "expan":
                                flagexpan = false
                                break
                            case "orig":
                                flagorig = false
                                break
                            case "sic":
                                flagsic = false
                                break
                            case "abbr":
                                flagabbr = false
                                break

                            case "w":
                                // words of a choice are written when the choice ends, words of fw are dropped
                                if (captureword && !flagchoice && !flagfw) {
                                    writeToken(vForm)
                                }
                                break

                            case "form":
                                flagForm = false
                                break

                            case "ana":
                                // every analysis is one column of the token line
                                vAna += "\t"
                                flagAna = false
                                break
                        }
                        break

                    case XMLStreamConstants.CHARACTERS:
                        String chars = parser.getText().trim()
                        if (flagAna) {
                            vAna += chars
                        }
                        if (flagForm) {
                            vForm += chars
                            if (flagchoice) {
                                // the form also belongs to every choice alternative that is open
                                if (flagsic) {
                                    vSic += chars
                                }
                                if (flagorig) {
                                    vOrig += chars
                                }
                                if (flagabbr) {
                                    vAbbr += chars
                                }
                                if (flagcorr) {
                                    vCorr += chars
                                }
                                if (flagreg) {
                                    vReg += chars
                                }
                                if (flagexpan) {
                                    vExpan += chars
                                }
                            }
                        }
                        break
                }
            }
            output.write("</txmcorpus>")
        }
496

    
497

    
498

    
499
        /**
         * Builds the CORPTEF corpus from the files of the "txm" sub-directory.
         *
         * The "cqp" and "data" directories of the root directory are emptied,
         * every file of "txm" (in sorted order) is appended to cqp/corptef.cqp,
         * then cwb-encode and cwb-makeall are run on that file.
         *
         * @param rootDirFile directory that contains the "txm" directory and
         *        receives the "cqp", "data" and "registry" directories
         * @return true if the corpus was indexed; false when the CWB executables
         *         are not available, when the root or "txm" directory is missing,
         *         when the registry file was not created or when indexing failed
         */
        public boolean run(File rootDirFile)
        {
            String rootDir = rootDirFile.getAbsolutePath()

            if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                println("Error: CWB executables not well set.")
                return false
            }
            if (!new File(rootDir).exists()) {
                println("binary directory does not exists: " + rootDir)
                return false
            }

            // cleaning & preparing
            new File(rootDir + "/cqp/", "corptef.cqp").delete()
            new File(rootDir, "/cqp/").deleteDir()
            new File(rootDir, "/cqp/").mkdir()
            new File(rootDir, "/data/").deleteDir()
            new File(rootDir, "/data/").mkdir()
            new File(rootDir, "registry/").mkdir()

            // listFiles() returns null when the directory is missing or unreadable
            File txmDir = new File(rootDirFile, "txm")
            File[] found = txmDir.listFiles()
            if (found == null) {
                println("source directory does not exists: " + txmDir)
                return false
            }
            List<File> files = new ArrayList<File>(Arrays.asList(found))
            Collections.sort(files)

            //1- Transform into CQP file
            for (File f : files) {
                if (!f.exists()) {
                    println("file " + f + " does not exists")
                    continue
                }
                println("process file " + f)

                // the text id is the file name without its extension
                String name = f.getName()
                int dot = name.lastIndexOf(".")
                String txtname = (dot > 0) ? name.substring(0, dot) : name

                // toURL() is kept on purpose: transfomFileCqp turns the URL back into a File
                // with url.getFile(), which would not undo the escaping done by toURI().toURL()
                def builder = new compiler(f.toURL(), txtname, "corptef", "default")
                builder.setLang(lang)
                if (!builder.transfomFileCqp(rootDir + "/cqp", "corptef.cqp")) {
                    println("Error: could not process file " + f)
                }
            }

            //2- Import into CWB
            def outDir = rootDir
            CwbEncode cwbEn = new CwbEncode()
            cwbEn.setDebug(debug)
            CwbMakeAll cwbMa = new CwbMakeAll()
            cwbMa.setDebug(debug)
            String[] pAttributes = ["ttpos", "ttlemme", "id", "pb", "lb", "orig", "sic", "abbr"]
            String[] sAttributes = ["txmcorpus:0+lang", "text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project", "front:0+n", "body:0+n", "ab:0+n+rend", "div:0+id+type", "p:0+n", "back:0+n"]
            try {
                String regPath = new File(outDir, "registry/corptef").getAbsolutePath()
                cwbEn.run(
                        new File(outDir, "/data").getAbsolutePath(),
                        new File(outDir, "cqp/" + "corptef.cqp").getAbsolutePath(),
                        regPath,
                        pAttributes, sAttributes)
                if (!new File(regPath).exists()) {
                    println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                    return false
                }
                cwbMa.run(
                        "CORPTEF",
                        new File(outDir, "registry").getAbsolutePath())
            }
            catch (Exception ex) {
                System.out.println(ex)
                return false
            }
            // "C:\Documents and Settings\alavrent\TXM\cwb\bin\cwb-encode" -d "C:\Documents and Settings\alavrent\TXM\corpora\corptef\data" -f "C:\Documents and Settings\alavrent\TXM\corpora\corptef\cqp\corpte.cqp" -R "C:\Documents and Settings\alavrent\TXM\corpora\corptef\registry\corptef" -c utf8 -xsB -xsB -P ttpos -P ttlemme -P id -P pb -P lb -P orig -P sic -P abbr -S text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project -S front:0+n -S body:0+n -S ab:0+n+rend -S div:0+id+type -S p:0+n -S back:0+n
            System.out.println("Done.")

            return true
        }
576

    
577
        /**
         * Switches the debug mode on; it cannot be switched off again.
         */
        public void setDebug()
        {
            debug = true
        }
584

    
585
        /**
         * Compiles a corpus directory in debug mode.
         *
         * @param args optional; the first argument is the corpus directory,
         *        "C:/TXM/corpora/corptef/" is used when it is absent
         */
        public static void main(String[] args)
        {
            String corpusDir = (args != null && args.length > 0) ? args[0] : "C:/TXM/corpora/corptef/"
            File dir = new File(corpusDir)
            def c = new compiler()
            c.setDebug()
            // No CWB path is set here: this class declares no setCwbPath method (calling it
            // raised a MissingMethodException), and run() checks by itself that the CWB
            // executables are available.
            c.run(dir)
        }
598
}