Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / corptef / compiler.groovy @ 187

History | View | Annotate | Download (15.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.corptef
29

    
30

    
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.*;
34
import org.txm.scripts.*;
35
import org.txm.scripts.teitxm.BuildTTSrc;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38

    
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import java.io.File;
42
import java.util.HashMap;
43
import java.util.List;
44

    
45
// TODO: Auto-generated Javadoc
46
/**
47
 * Converts the TXM files of the CORPTEF corpus into a WTC file and indexes it with CWB (cwb-encode, then cwb-makeall).
48
 */
49
class compiler
50
{
51

    
52
        /** The debug. */
53
        private boolean debug= false;
54

    
55
        /** The input data. */
56
        private def inputData;
57

    
58
        /** The factory. */
59
        private def factory;
60

    
61
        /** The parser. */
62
        private XMLStreamReader parser;
63

    
64
        /** The dir. */
65
        private def dir;
66

    
67
        /** The output. */
68
        private def output;
69

    
70
        /** The url. */
71
        private def url;
72

    
73
        /** The anahash. */
74
        private HashMap<String,String> anahash =new HashMap<String,String>() ;
75

    
76
        /** The text. */
77
        String text="";
78

    
79
        /** The base. */
80
        String base="";
81

    
82
        /** The project. */
83
        String project="";
84

    
85
        /** The cwb loc. */
86
        String cwbLoc;
87

    
88
        /** The lang. */
89
        private String lang ="fr";
90

    
91
        /**
         * Creates a compiler that is not bound to any input document.
         * This is the form used by {@code main} before calling {@code run}.
         */
        public compiler() {
        }
96

    
97
        /**
         * Instantiates a new compiler bound to one input document and opens a
         * StAX parser on it.
         *
         * Failures while opening the stream or creating the parser are logged,
         * not thrown; in that case {@code parser} stays unset.
         *
         * @param url the url of the XML document to read
         * @param text the text identifier written in the {@code id} attribute of the text
         * @param base the value written in the {@code base} attribute of the text
         * @param project the value written in the {@code project} attribute of the text
         */
        public compiler(URL url,String text,String base, String project)
        {
            this.text = text;
            this.base = base;
            this.project = project;
            this.url = url;
            try {
                inputData = url.openStream();
                factory = XMLInputFactory.newInstance();
                parser = factory.createXMLStreamReader(inputData);
            } catch (XMLStreamException ex) {
                System.out.println("XMLStreamException while opening " + url + ": " + ex);
                // The stream was opened but cannot be parsed: release it now
                // instead of leaking the file handle.
                if (inputData != null) {
                    try {
                        inputData.close();
                    } catch (IOException ignored) {
                        // best-effort cleanup, the original failure is already reported
                    }
                    inputData = null;
                }
            } catch (IOException ex) {
                System.out.println("IOException while opening " + url + ": " + ex);
            }
        }
122

    
123
        /**
         * Sets the language of the corpus, written in the {@code lang}
         * attribute of the {@code txmcorpus} element.
         *
         * @param lang the language code
         * @return the value that was assigned (untyped Groovy method)
         */
        public setLang(String lang)
        {
            return (this.lang = lang)
        }
133

    
134
        /**
         * Sets the directory holding the CWB executables.
         * A missing directory is only reported on stderr; the path is stored anyway.
         *
         * @param path the new cwb path
         */
        public void setCwbPath(String path)
        {
            File location = new File(path);
            if (location.exists() == false) {
                System.err.println("CWB Path : " + path + " does not exists");
            }
            this.cwbLoc = path;
        }
145

    
146
        /**
         * Opens the UTF-8 writer stored in {@code output}.
         * When the target file already exists it is opened in append mode,
         * otherwise it is created.
         *
         * @param dirPathName the dir path name
         * @param fileName the file name
         * @return true, if the writer was opened; false after logging the error message
         */
        private boolean createOutput(String dirPathName, String fileName){
            boolean opened;
            try {
                File target = new File(dirPathName, fileName);
                boolean append = target.exists();
                FileOutputStream stream = new FileOutputStream(target, append);
                output = new OutputStreamWriter(stream, "UTF-8");
                opened = true;
            } catch (Exception e) {
                System.out.println(e.getLocalizedMessage());
                opened = false;
            }
            return opened;
        }
163

    
164
        /**
         * Advances the parser until the closing tag of {@code teiHeader} has
         * been consumed, or until the end of the document when there is none.
         */
        private void GoToText()
        {
            int event = parser.next();
            while (event != XMLStreamConstants.END_DOCUMENT) {
                boolean closing = (event == XMLStreamConstants.END_ELEMENT);
                if (closing && parser.getLocalName().equals("teiHeader")) {
                    return;
                }
                event = parser.next();
            }
        }
176

    
177
        /**
         * Converts the document opened by the constructor into CWB input lines
         * and appends them to the given file.
         *
         * The TEI header is skipped, then the structural elements (text, p, ab,
         * q, front, body, back) are written as tags and every word outside a
         * {@code choice} is written as one tab-separated line: form, the content
         * of each {@code ana} element, word id, current page break, current line
         * break, then the {@code orig}, {@code sic} and {@code abbr} readings.
         * For a {@code choice}, one line is written when the element closes,
         * using the {@code corr}, {@code reg} or {@code expan} reading (first
         * non-empty one, in that order) as the form.
         *
         * @param dirPathName the directory of the output file
         * @param fileName the name of the output file (appended to when it exists)
         * @return true if the document was fully converted; false when the
         *         parser or the output could not be opened, or when an XML or
         *         IO error interrupted the conversion
         */
        public boolean transfomFileWtc(String dirPathName, String fileName)
        {
            String vAna = "";
            String vForm = "";
            String wordid = "";

            int p_id = 0;
            int q_id = 0;
            int body_id = 0;
            int front_id = 0;
            int back_id = 0;
            // Kept as an int: a String counter advanced with ++ has its last
            // character incremented by Groovy, so "9" would be followed by ":".
            int ab_id = 0;
            String lb_id = "0";
            int lb_count = 0;
            String pb_id = "0";
            int pb_count = 0;

            boolean captureword = false;

            String vExpan = "";
            String vCorr = "";
            String vReg = "";
            String vOrig = "";
            String vSic = "";
            String vAbbr = "";

            boolean flagForm = false;
            boolean flagAna = false;

            boolean flagchoice = false;
            boolean flagcorr = false;
            boolean flagsic = false;
            boolean flagreg = false;
            boolean flagexpan = false;
            boolean flagorig = false;
            boolean flagabbr = false;
            // NOTE(review): flagfw is cleared on </fw> but nothing ever sets it,
            // so words inside <fw> are currently written like any other word.
            // Confirm whether <fw> content was meant to be skipped.
            boolean flagfw = false;

            boolean success = false;
            try
            {
                // The constructor only logs its failures, so the parser may be missing.
                if (parser == null) {
                    System.out.println("No XML parser available for " + url);
                    return false;
                }
                if (!createOutput(dirPathName, fileName)) {
                    return false;
                }

                File xpathfile = new File(url.getFile());
                String titreId = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='reference']/text()");
                String auteur = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/text()");
                String datecompo = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when");
                String ssiecle = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo_sous_siecle']/@n");
                String domaine = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/tei:domain/@type");
                String genre = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/@n");
                // The first character of the catRef target is dropped
                // (presumably the leading '#' of the pointer - confirm).
                // Guarded so that a missing value no longer throws.
                String formeRef = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textClass/tei:catRef/@target[contains(.,'forme')]");
                String forme = (formeRef != null && formeRef.length() > 0) ? formeRef.substring(1) : "";
                String dialecte = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:region[@type='dialecte_auteur']/text()");

                this.GoToText();
                output.write("<txmcorpus lang=\""+lang+"\">\n");

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                {
                    switch (event)
                    {
                        case XMLStreamConstants.START_ELEMENT:
                            switch (parser.getLocalName())
                            {
                                case "text":
                                    output.write("<text id=\""+text+"\"" +
                                            " titre=\""+titreId+"\"" +
                                            " auteur=\""+auteur+"\"" +
                                            " datecompo=\""+datecompo+"\"" +
                                            " ssiecle=\""+ssiecle+"\"" +
                                            " domaine=\""+domaine+"\"" +
                                            " genre=\""+genre+"\"" +
                                            " forme=\""+forme+"\"" +
                                            " dialecte=\""+dialecte+"\"" +
                                            " base=\""+base+"\"" +
                                            " project=\""+project+"\">\n");
                                    captureword = true;
                                    break;

                                case "div":
                                    // div elements are not written to the output
                                    break;

                                case "p":
                                    output.write("<p n=\""+(p_id++)+"\">\n");
                                    break;
                                case "ab":
                                    output.write("<ab n=\""+(ab_id++)+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
                                    break;
                                case "q":
                                    output.write("<q n=\""+(q_id++)+"\">\n");
                                    break;
                                case "front":
                                    output.write("<front n=\""+(front_id++)+"\">\n");
                                    break;
                                case "body":
                                    output.write("<body n=\""+(body_id++)+"\">\n");
                                    break;
                                case "back":
                                    output.write("<back n=\""+(back_id++)+"\">\n");
                                    break;

                                case "lb":
                                    // use the declared number, else a running counter
                                    String lbNumber = parser.getAttributeValue(null,"n");
                                    lb_id = (lbNumber != null) ? lbNumber : ""+(lb_count++);
                                    break;
                                case "pb":
                                    String pbNumber = parser.getAttributeValue(null,"n");
                                    pb_id = (pbNumber != null) ? pbNumber : ""+(pb_count++);
                                    break;

                                case "choice":
                                    flagchoice = true;
                                    break;
                                case "corr":
                                    flagcorr = true;
                                    vCorr = "";
                                    break;
                                case "reg":
                                    flagreg = true;
                                    vReg = "";
                                    break;
                                case "expan":
                                    flagexpan = true;
                                    vExpan = "";
                                    break;
                                case "orig":
                                    // was flagreg: the orig reading was never captured
                                    flagorig = true;
                                    vOrig = "";
                                    break;
                                case "sic":
                                    flagsic = true;
                                    vSic = "";
                                    break;
                                case "abbr":
                                    // was flagreg: the abbr reading was never captured
                                    flagabbr = true;
                                    vAbbr = "";
                                    break;

                                case "w":
                                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                        if (parser.getAttributeLocalName(i).equals("id")) {
                                            wordid = parser.getAttributeValue(i);
                                        }
                                    }
                                    vAna = "";
                                    vForm = "";
                                    break;
                                case "form":
                                    flagForm = true;
                                    vForm = "";
                                    vAna = "";
                                    break;

                                case "ana":
                                    flagAna = true;
                                    break;
                            }
                            break;

                        case XMLStreamConstants.END_ELEMENT:
                            switch (parser.getLocalName())
                            {
                                case "div":
                                    break;

                                case "text":
                                    output.write("</text>\n");
                                    captureword = false;
                                    break;
                                case "p":
                                    output.write("</p>\n");
                                    break;
                                case "ab":
                                    output.write("</ab>\n");
                                    break;
                                case "q":
                                    output.write("</q>\n");
                                    break;
                                case "front":
                                    output.write("</front>\n");
                                    break;
                                case "body":
                                    output.write("</body>\n");
                                    break;
                                case "back":
                                    output.write("</back>\n");
                                    break;

                                case "fw":
                                    flagfw = false;
                                    break;

                                case "choice":
                                    // One line per choice: corrected, regularised or
                                    // expanded reading as form, original readings as
                                    // extra columns.
                                    if (vCorr != "") {
                                        writeToken(vCorr, vAna, wordid, pb_id, lb_id, vOrig, vSic, vAbbr);
                                    } else if (vReg != "") {
                                        writeToken(vReg, vAna, wordid, pb_id, lb_id, vOrig, vSic, vAbbr);
                                    } else if (vExpan != "") {
                                        writeToken(vExpan, vAna, wordid, pb_id, lb_id, vOrig, vSic, vAbbr);
                                    }
                                    flagchoice = false;
                                    // The readings are cleared here rather than at their
                                    // own end tag, where they were wiped before use.
                                    vCorr = "";
                                    vSic = "";
                                    vReg = "";
                                    vExpan = "";
                                    vOrig = "";
                                    vAbbr = "";
                                    break;
                                case "corr":
                                    flagcorr = false;
                                    break;
                                case "reg":
                                    flagreg = false;
                                    break;
                                case "expan":
                                    flagexpan = false;
                                    break;
                                case "orig":
                                    flagorig = false;
                                    break;
                                case "sic":
                                    flagsic = false;
                                    break;
                                case "abbr":
                                    flagabbr = false;
                                    break;

                                case "w":
                                    // words inside a choice are written when the choice closes
                                    if (captureword && !flagchoice && !flagfw) {
                                        writeToken(vForm, vAna, wordid, pb_id, lb_id, vOrig, vSic, vAbbr);
                                    }
                                    break;

                                case "form":
                                    flagForm = false;
                                    break;

                                case "ana":
                                    // each ana value becomes one tab-terminated column
                                    vAna += "\t";
                                    flagAna = false;
                                    break;
                            }
                            break;

                        case XMLStreamConstants.CHARACTERS:
                            if (flagAna || flagForm) {
                                String chars = parser.getText().trim();
                                if (flagAna) {
                                    vAna += chars;
                                }
                                if (flagForm) {
                                    vForm += chars;
                                    if (flagchoice) {
                                        if (flagsic) vSic += chars;
                                        if (flagorig) vOrig += chars;
                                        if (flagabbr) vAbbr += chars;
                                        if (flagcorr) vCorr += chars;
                                        // reg and expan were tested at </choice> but never filled
                                        if (flagreg) vReg += chars;
                                        if (flagexpan) vExpan += chars;
                                    }
                                }
                            }
                            break;
                    }
                }
                output.write("</txmcorpus>");
                // flush inside the try so that a write failure is reported
                output.flush();
                success = true;
            }
            catch (XMLStreamException ex) {
                System.out.println(ex);
            }
            catch (IOException ex) {
                System.out.println("IOException while parsing " + url + ": " + ex);
            }
            finally {
                // release everything on success and on failure alike
                closeQuietly(output);
                closeQuietly(parser);
                closeQuietly(inputData);
            }

            return success;
        }

        /** Writes one token line: form, annotations, word id, page, line, orig, sic, abbr. */
        private void writeToken(String form, String ana, String id, String pb, String lb, String orig, String sic, String abbr)
        {
            output.write(form + "\t" + ana + id + "\t" + pb + "\t" + lb + "\t" + orig + "\t" + sic + "\t" + abbr + "\n");
        }

        /** Closes a stream, writer or parser if it is set, ignoring any error raised by close(). */
        private void closeQuietly(def resource)
        {
            if (resource == null) {
                return;
            }
            try {
                resource.close();
            } catch (Exception ignored) {
                // best-effort cleanup: the conversion result is already decided
            }
        }
511

    
512

    
513

    
514
        /**
         * Builds the CORPTEF CWB corpus from the files found in the
         * {@code txm} sub-directory of the given directory.
         *
         * The {@code wtc} and {@code data} directories are recreated, every file
         * of {@code txm} is converted (in sorted order) and appended to
         * {@code wtc/corptef.wtc}, then cwb-encode and cwb-makeall are run.
         *
         * @param rootDirFile the binary directory of the corpus
         * @return true, if successful; false when a path is missing, a file
         *         could not be converted or the CWB tools failed
         */
        public boolean run(File rootDirFile)
        {
            String rootDir = rootDirFile.getAbsolutePath();

            if (cwbLoc == null) {
                cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;
            }

            if (!new File(cwbLoc).exists())
            {
                println ("CWB path error: "+cwbLoc)
                return false;
            }
            if (!new File(rootDir).exists())
            {
                println ("binary directory does not exists: "+rootDir)
                return false;
            }
            new File(rootDir+"/wtc/","corptef.wtc").delete();//cleaning&preparing
            new File(rootDir,"/wtc/").deleteDir();
            new File(rootDir,"/wtc/").mkdir();
            new File(rootDir,"/data/").deleteDir();
            new File(rootDir,"/data/").mkdir();
            new File(rootDir,"registry/").mkdir();

            // listFiles() returns null when the directory is missing or unreadable
            File txmDir = new File(rootDirFile,"txm");
            File[] listed = txmDir.listFiles();
            if (listed == null)
            {
                println ("txm directory not found or not readable: "+txmDir)
                return false;
            }
            List<File> files = new ArrayList<File>(Arrays.asList(listed));
            Collections.sort(files);

            //1- Transform into WTC file
            for (File f : files)
            {
                if (!f.isFile())
                {
                    println("skipping "+f+": not a regular file")
                    continue;
                }
                println("process file "+f)
                // text id = file name without its last extension
                String name = f.getName();
                int dot = name.lastIndexOf(".");
                String txtname = (dot > 0) ? name.substring(0, dot) : name;
                def builder = new compiler(f.toURL(), txtname, "corptef", "default");
                builder.setLang lang
                if (!builder.transfomFileWtc(rootDir+"/wtc","corptef.wtc"))
                {
                    // a partially written text would leave the WTC file unbalanced
                    println("Error: could not convert "+f)
                    return false;
                }
            }

            //2- Import into CWB
            def outDir = rootDir;
            CwbEncode cwbEn = new CwbEncode();
            cwbEn.setDebug(debug);
            CwbMakeAll cwbMa = new CwbMakeAll();
            cwbMa.setDebug(debug);
            String[] pAttributes = ["ttpos","ttlemme","id","pb","lb","orig","sic","abbr"];
            // NOTE(review): <q n="..."> tags are written by transfomFileWtc but no
            // "q" structural attribute is declared here - confirm this is intended.
            String[] sAttributes = ["txmcorpus:0+lang", "text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project","front:0+n","body:0+n","ab:0+n+rend","div:0+id+type","p:0+n","back:0+n"];
            try {
                String regPath = new File(outDir , "registry/corptef").getAbsolutePath();
                cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(),
                        new File(outDir, "/data").getAbsolutePath(),
                        new File(outDir, "wtc/"+"corptef.wtc").getAbsolutePath(),
                        regPath,
                        pAttributes, sAttributes);
                if (!new File(regPath).exists()) {
                    println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                    return false;
                }
                cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(),
                        "CORPTEF",
                        new File(outDir, "registry").getAbsolutePath());
            } catch (Exception ex) {
                System.out.println(ex);
                return false;
            }
            // Example invocation recorded by the original author:
            // cwb-encode -d <corpus>\data -f <corpus>\wtc\corpte.wtc -R <corpus>\registry\corptef -c utf8 -xsB -xsB
            //   -P ttpos -P ttlemme -P id -P pb -P lb -P orig -P sic -P abbr
            //   -S text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project
            //   -S front:0+n -S body:0+n -S ab:0+n+rend -S div:0+id+type -S p:0+n -S back:0+n
            System.out.println("Done.")

            return true;
        }
595

    
596
        /**
         * Switches debug mode on. The flag is forwarded to the CWB encode and
         * makeall wrappers by {@code run}; there is no way to switch it off again.
         */
        public void setDebug()
        {
            debug = true
        }
603

    
604
        /**
         * Command-line entry point: compiles the corpus stored in a fixed
         * Windows directory, in debug mode, with a fixed CWB location.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
            compiler corptefCompiler = new compiler();
            corptefCompiler.setDebug();
            corptefCompiler.setCwbPath("C:/Program Files/TXM/cwb/bin");

            File corpusDir = new File("C:/TXM/corpora/corptef/");
            corptefCompiler.run(corpusDir);
        }
617
}