Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bfm / compiler.groovy @ 966

History | View | Annotate | Download (26.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
41
// $LastChangedRevision: 3400 $
42
// $LastChangedBy: mdecorde $
43
//
44
package org.txm.importer.bfm
45

    
46
import org.txm.Toolbox;
47
import org.txm.importer.cwb.*
48
import org.txm.importer.*;
49
import org.txm.scripts.*;
50
import org.txm.importer.xmltxm.BuildTTSrc;
51
import org.txm.importer.xmltxm.*;
52
import org.txm.utils.treetagger.TreeTagger;
53

    
54
import javax.xml.stream.*;
55
import java.net.URL;
56
import java.io.File;
57
import java.util.Comparator;
58
import java.util.HashMap;
59
import java.util.List;
60

    
61
// TODO: Auto-generated Javadoc
62
/**
63
 * Produce CQP files from the TEI-TXM files. <br/>
64
 * - Read texts metadata with XPath queries <br/>
65
 * - Add the following word properties : sic, abbr, orig, lb and pb <br/>
66
 * - Keep &lt;front>, &lt;body> and &lt;back> for each text <br/>
67
 * - Text enclosed in &lt;q> is tokenized <br/>
68
 * 
69
 * @author mdecorde
70
 *
71
 */
72
class compiler {
73
        /** Debug flag, initialised to false; nothing in the visible part of this class reads it. */
74
        private boolean debug= false;
75

    
76
        /** Annotation-done flag, initialised to true and overwritten by setAnnotationDone(boolean). */
77
        private boolean annotate_status=true;
78

    
79
        /** Stream obtained from url.openStream() in the URL constructor; the StAX reader is built on it. */
80
        private def inputData;
81

    
82
        /** XMLInputFactory instance created in the URL constructor and used to build the parser. */
83
        private def factory;
84

    
85
        /** StAX reader over inputData; consumed by GoToText() and transfomFileCqp(). */
86
        private XMLStreamReader parser;
87

    
88
        /** Not assigned or read in the visible part of this class. */
89
        private def dir;
90

    
91
        /** Writer opened by createOutput(File) (UTF-8 OutputStreamWriter) and written by transfomFileCqp(). */
92
        private def output;
93

    
94
        /** The URL received by the URL constructor. */
95
        private def url;
96

    
97
        /**
         * Class-wide flag, reset to true by the no-argument constructor.
         * While it is true, the type of every `ana` element met is appended to anaTypes.
         */
98
        static boolean firstWord = true;
99
        /** Class-wide list of `ana` types collected while firstWord is true; emptied by the no-argument constructor. */
        static private def anaTypes = [];
100
        /** `ana` values of the word being read, keyed by `ana` type; cleared each time a `w` element starts. */
        private HashMap<String,String> anahash = new HashMap<String,String>() ;
101

    
102
        /**
         * Class-wide listener, created on the first URL-constructor call and restarted with the
         * new parser on later calls; notified of element starts/ends once the `text` element is met.
         */
        private static SAttributesListener sattrsListener;
103
        /** Not assigned or read in the visible part of this class. */
        private static HashMap<String,ArrayList<String>> structs;
104
        /** Not assigned or read in the visible part of this class. */
        private static HashMap<String, Integer> structsProf;
105

    
106
        /** Text name given to the constructor; written as the `id` of the output `text` element and used in generated word ids. */
107
        private String text="";
108

    
109
        /** Base name given to the constructor; written as the `base` attribute of the output `text` element. */
110
        private String base="";
111

    
112
        /** Project name given to the constructor; written as the `project` attribute of the output `text` element. */
113
        private String project="";
114

    
115
        /**
         * Language code, "fr" by default, changed by setLang(String).
         * NOTE(review): transfomFileCqp() also assigns this field when it meets a `foreign` element
         * carrying a `lang` attribute, while its local `foreign_lang` stays unused - confirm this is intended.
         */
116
        private String lang ="fr";
117

    
118
        /** Metadata XPath queries organised by name; assigned by the URL constructor. */
121
        Properties metadataXPath;
122

    
123
        /**
         * Creates a compiler that is not bound to any source file.
         * Resets the class-wide (static) word-annotation state: the list of
         * collected {@code ana} types is emptied and {@code firstWord} is set
         * back to {@code true}.
         */
        public compiler() {
                anaTypes = [];
                firstWord = true;
        }
131

    
132
        /**
         * Creates a compiler bound to one source file.
         * <p>
         * Opens a stream on {@code url}, builds a StAX reader over it and hands
         * that reader to the class-wide {@code SAttributesListener} (created on
         * the first call, restarted with the new reader on later calls).
         * Failures are reported on standard output and are NOT rethrown; when
         * the stream cannot be opened, {@code parser} is left unset.
         *
         * @param url the file to process
         * @param text the text's name
         * @param base the base's name
         * @param project the project's name
         * @param metadataXPath the metadata XPath queries, stored as received
         */
        public compiler(URL url, String text, String base, String project, Properties metadataXPath)
        {
                this.metadataXPath = metadataXPath;
                this.text = text;
                this.base = base;
                this.project = project;
                try {
                        this.url = url;
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        if (sattrsListener == null) {
                                // First compiler of the run: create the shared listener.
                                sattrsListener = new SAttributesListener(parser);
                        } else {
                                // Shared listener already exists: point it at the new reader.
                                sattrsListener.start(parser);
                        }
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                } catch (IOException ex) {
                        // Name the file and the cause; the previous message stopped
                        // after "while parsing " and gave neither.
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
        }
162

    
163
        /**
         * List of files exposed by {@link #getOrderedTxmFiles()}.
         * Nothing in the visible part of this class assigns it.
         */
        ArrayList<File> orderedFiles;

        /**
         * @return the current value of {@code orderedFiles}, {@code null} if it was never set
         */
        public ArrayList<File> getOrderedTxmFiles() {
                return this.orderedFiles;
        }
167

    
168
        /**
         * Replaces the language code held by this compiler (initial value {@code "fr"}).
         *
         * @param lang the new language code
         */
        public void setLang(String lang) {
                this.lang = lang;
        }
177

    
178
        /**
         * Records whether the annotation step has been done.
         * The value is stored in {@code annotate_status}, which starts as {@code true}.
         *
         * @param done the new annotation status
         */
        public void setAnnotationDone(boolean done) {
                this.annotate_status = done;
        }
187

    
188
        /**
         * Opens {@code output} as a UTF-8 writer on the given file.
         * The file is opened in append mode when it already exists and is
         * created otherwise. Any failure is reported on standard output
         * (localized message only) and turned into a {@code false} result.
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if an exception occurred
         */
        private boolean createOutput(File f) {
                boolean opened = true;
                try {
                        // Append only when there is already something to append to.
                        def stream = new FileOutputStream(f, f.exists());
                        output = new OutputStreamWriter(stream, "UTF-8");
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        opened = false;
                }
                return opened;
        }
204

    
205
        /**
         * Advances {@code parser} until the closing tag of {@code teiHeader}
         * has just been read, so that the next event belongs to what follows
         * the header. If no such closing tag exists, the reader is consumed
         * up to (and including) the END_DOCUMENT event.
         */
        private void GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean closing = (event == XMLStreamConstants.END_ELEMENT);
                        if (closing && parser.getLocalName().equals("teiHeader")) {
                                return;
                        }
                        event = parser.next();
                }
        }
216

    
217
        /**
         * Computes the next value of a structure counter from the element the
         * reader is currently positioned on.
         * <p>
         * When the element has an attribute whose local name is {@code n} and
         * whose value parses as an integer, that integer is the result. In
         * every other case (no {@code n} attribute, or a non-numeric one) the
         * result is {@code value + 1}.
         *
         * @param parser the reader, positioned on a start element
         * @param value the current counter value
         * @return the counter value to use for this element
         */
        private def increment(XMLStreamReader parser, int value)
        {
                // Look for the first attribute named "n".
                String declared = null;
                for (int idx = 0; idx < parser.getAttributeCount(); idx++) {
                        if ("n".equals(parser.getAttributeLocalName(idx))) {
                                declared = parser.getAttributeValue(idx);
                                break;
                        }
                }

                if (declared == null) {
                        return value + 1;
                }

                try {
                        return Integer.parseInt(declared);
                } catch (Exception e) {
                        // Non-numeric "n": fall back to plain counting.
                        return value + 1;
                }
        }
245

    
246
        /**
247
         * Transfom file cqp.
248
         *
249
         * @param dirPathName the dir path name
250
         * @param fileName the file name
251
         * @return true, if successful
252
         */
253
        private boolean transfomFileCqp(File cqpFile)
254
        {
255
                try {
256
                        if (!createOutput(cqpFile)) return false;
257

    
258
                        String headvalue = ""
259
                        String vAna = "";
260
                        String vForm = "";
261
                        String wordid = "";
262
                        String vHead = "";
263

    
264
                        Integer p_id = 0;
265
                        Integer s_id = 0;
266
                        Integer q_id = 0;
267
                        int sp_id = 0;
268
                        Integer body_id = 0;
269
                        Integer front_id = 0;
270
                        Integer back_id = 0;
271
                        Integer lb_id = 0;
272
                        Integer pb_id = 0;
273
                        Integer ab_id = 0;
274
                        int foreign_id = 0;
275
                        int name_id = 0;
276

    
277
                        boolean captureword = false;
278

    
279
                        String vExpan = "";
280
                        String vCorr = "";
281
                        String vReg = "";
282
                        String vOrig = "";
283
                        String vSic = "";
284
                        String vAbbr = "";
285
                        String givenpos = "";
286
                        String pb_n = "";
287
                        String foreign_lang = "";
288
                        String nameType = "";
289
                        String anaType;
290
                        //String abType = "";
291
                        
292
                        boolean foundtei=false, foundtext=false;
293

    
294
                        boolean flaglg = false;
295
                        int levelq = 0;
296
                        //boolean flagq = false;
297
                        boolean flaghead = false;
298
                        //Added:
299
                        boolean flagSp = false;
300
                        boolean flagAuthor = false;
301
                        boolean flagDate = false;
302
                        boolean flagWord = false;
303
                        boolean flagForm = false;
304
                        boolean flagAna = false;
305

    
306
                        boolean flagchoice = false;
307
                        boolean flagcorr = false;
308
                        boolean flagsic = false;
309
                        boolean flagreg = false;
310
                        boolean flagexpan = false;
311
                        boolean flagorig = false;
312
                        boolean flagabbr = false;
313
                        boolean flagfw = false;
314
                        //boolean flagSupplied = false;
315
                        int levelSupplied = 0;
316
                        //boolean flagSurplus = false;
317
                        boolean flagForeign = false;
318
                        //boolean flagName = false;
319

    
320
                        this.GoToText();
321
                        int missingId= 0
322
                        boolean USEVERSE = false; // switch default reference to verse references
323
                        String titreId; // the title to use in the reference
324
                        
325
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
326
                                switch (event) {
327
                                        case XMLStreamConstants.START_ELEMENT:
328

    
329
                                                String localname = parser.getLocalName();
330
                                                if (foundtext) sattrsListener.startElement(localname);
331
                                                
332
                                                switch (localname) {
333
                                                        case "TEI":
334
                                                                foundtei = true;
335
                                                                break;
336
                                                        case "text":
337
                                                                foundtext = true;
338
                                                                sattrsListener.startElement(localname);
339
                                                                output.write("<text id=\""+text+"\"")
340

    
341
                                                                for (int i = 0; i < parser.getAttributeCount() ; i++) {
342
                                                                        String name = parser.getAttributeLocalName(i);
343
                                                                        if ("id" == name || "base" == name || "project" == name) continue;
344
                                                                        output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
345
                                                                        
346
                                                                        if (name == "forme") {
347
                                                                                USEVERSE = (parser.getAttributeValue(i).contains("vers"))
348
                                                                        } else if (name == "sigle") {
349
                                                                                titreId = parser.getAttributeValue(i)
350
                                                                        }
351
                                                                }
352
                                                                
353
                                                                output.write(" base=\""+base+"\" project=\""+project+"\">\n");
354
                                                                captureword=true;
355
                                                                break;
356

    
357
                                                        case "div":
358
                                                        //output.write("<div type=\""+parser.getAttributeValue(null,"type")+"\">\n");
359
                                                                String divType = "NA";
360
                                                                String divSubtype = "NA";
361
                                                                String divN = "NA";
362
                                                                String divId ="NA";
363
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++) {
364
                                                                        if(parser.getAttributeLocalName(i) == "type") {
365
                                                                                divType = parser.getAttributeValue(i);
366
                                                                        } else if(parser.getAttributeLocalName(i) == "subtype") {
367
                                                                                divSubtype = parser.getAttributeValue(i);
368
                                                                        } else if(parser.getAttributeLocalName(i) == "n") {
369
                                                                                divN = parser.getAttributeValue(i);
370
                                                                        } else if(parser.getAttributeLocalName(i) == "id") {
371
                                                                                divId = parser.getAttributeValue(i);
372
                                                                                break;
373
                                                                        }
374
                                                                }
375
                                                                output.write("<div type=\""+divType+"\" subtype=\""+divSubtype+"\" n=\""+divN+"\" id=\""+divId+"\">\n");
376
                                                                break;
377
                                                        case "p":
378
                                                                p_id = increment(parser, p_id);
379
                                                                output.write("<p n=\""+p_id+"\">\n");
380
                                                                break;
381
                                                        case "ab":
382
                                                                ab_id = increment(parser, ab_id)
383
                                                                output.write("<ab n=\""+(ab_id)+"\" type=\""+parser.getAttributeValue(null,"type")+"\" subtype=\""+parser.getAttributeValue(null,"subtype")+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
384
                                                                break;
385
                                                        case "q":
386
                                                                q_id = increment(parser, q_id)
387
                                                                output.write("<q n=\""+(q_id)+"\">\n");
388
                                                        //flagq=true;
389
                                                                levelq = levelq + 1;
390
                                                                break;
391
                                                        case "sp":
392
                                                                sp_id = increment(parser, sp_id)
393
                                                                output.write("<sp n=\""+(sp_id)+"\">\n");
394
                                                                flagSp = true;
395
                                                                break;
396
                                                        case "front":
397
                                                                front_id = increment(parser, front_id)
398
                                                                output.write("<front n=\""+front_id+"\">\n");
399
                                                                break;
400
                                                        case "body":
401
                                                                body_id= increment(parser, body_id)
402
                                                                output.write("<body n=\""+body_id+"\">\n");
403
                                                                break;
404
                                                        case "back":
405
                                                                back_id = increment(parser, back_id)
406
                                                                output.write("<back n=\""+back_id+"\">\n");
407
                                                                break;
408
                                                        case "lb":
409
                                                                lb_id = increment(parser, lb_id)
410
                                                                break;
411
                                                        case "pb":
412
                                                                pb_id = increment(parser, pb_id)
413
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
414
                                                                        if (parser.getAttributeLocalName(i) == "n") {
415
                                                                                pb_n = parser.getAttributeValue(i);
416
                                                                        }
417
                                                                }
418
                                                                break;
419
                                                        case "s":
420
                                                                s_id = increment(parser, s_id)
421
                                                                output.write("<s n=\""+s_id+"\">\n");
422
                                                                break;
423
                                                        case "choice":
424
                                                                flagchoice = true;
425
                                                                break;
426
                                                        case "corr":
427
                                                                flagcorr = true;
428
                                                                vCorr= "";
429
                                                                break;
430
                                                        case "reg":
431
                                                                flagreg = true;
432
                                                                vReg= "";
433
                                                                break;
434
                                                        case "expan":
435
                                                                flagexpan = true;
436
                                                                vExpan= "";
437
                                                                break;
438
                                                        case "orig":
439
                                                                flagreg = true;
440
                                                                vOrig= "";
441
                                                                break;
442
                                                        case "sic":
443
                                                                flagsic = true;
444
                                                                vSic= "";
445
                                                                break;
446
                                                        case "abbr":
447
                                                                flagreg = true;
448
                                                                vAbbr= "";
449
                                                                break;
450
                                                        case "foreign":
451
                                                                flagForeign = true;
452
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
453
                                                                        if (parser.getAttributeLocalName(i) == "lang") {
454
                                                                                lang = parser.getAttributeValue(i);
455
                                                                                break;
456
                                                                        }
457
                                                                }
458

    
459
                                                                output.write("<foreign n=\""+(foreign_id++)+"\" lang=\""+lang+"\">\n");
460
                                                        //vForeign = "";
461
                                                                break;
462

    
463
                                                        case "name":
464
                                                        //flagName = true;
465
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++)
466
                                                                        if(parser.getAttributeLocalName(i) == "type")
467
                                                                {
468
                                                                        nameType = parser.getAttributeValue(i);
469
                                                                        break;
470
                                                                }
471

    
472
                                                                output.write("<name n=\""+(name_id++)+"\" type=\""+nameType+"\">\n");
473
                                                                break;
474
                                                        case "supplied":
475
                                                        //flagSupplied = true;
476
                                                                levelSupplied = levelSupplied + 1;
477
                                                                break;
478

    
479
                                                        case "surplus":
480
                                                                flagfw = true;
481
                                                                break;
482

    
483
                                                        case "del":
484
                                                                flagfw = true;
485
                                                                break;
486

    
487
                                                        case "w":
488
                                                                givenpos = "";
489
                                                                wordid = "w_"+text+"_m"+missingId++
490
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
491
                                                                        if (parser.getAttributeLocalName(i) == "id") {
492
                                                                                wordid = parser.getAttributeValue(i);
493
                                                                        } else if (parser.getAttributeLocalName(i) == "type") {
494
                                                                                givenpos = parser.getAttributeValue(i);
495
                                                                        }
496
                                                                }
497
                                                                if (wordid.startsWith("w")) {
498
                                                                        if (!wordid.startsWith("w_"))
499
                                                                                wordid = "w_"+wordid.substring(1)
500
                                                                } else {
501
                                                                        wordid = "w_"+wordid;
502
                                                                }
503

    
504
                                                                if (givenpos == null || givenpos == "")
505
                                                                        givenpos = "NA";
506
                                                                vForm = "";
507
                                                                anahash.clear(); // remove previous word ana values
508
                                                                flagWord = true;
509
                                                                break;
510
                                                        case "form":
511
                                                                flagForm = true;
512
                                                                vForm = "";
513
                                                                break;
514

    
515
                                                        case "ana":
516
                                                                flagAna = true;
517
                                                                anaType = parser.getAttributeValue(null, "type")
518
                                                                anahash.put(anaType, "");
519
                                                                if (firstWord) {
520
                                                                        anaTypes << anaType;
521
                                                                }
522
                                                                break;
523
                                                }
524
                                                break;
525

    
526
                                        case XMLStreamConstants.END_ELEMENT:
527
                                                String localname = parser.getLocalName();
528
                                                if (foundtext) sattrsListener.endElement(localname);
529

    
530
                                                switch (localname) {
531
                                                        case "div":
532
                                                                output.write("</div>\n");
533
                                                                break;
534
                                                        case "text":
535
                                                                output.write("</text>\n");
536
                                                                captureword=false;
537
                                                                break;
538
                                                        case "p":
539
                                                                output.write("</p>\n");
540
                                                                break;
541
                                                        case "s":
542
                                                                output.write("</s>\n");
543
                                                                break;
544
                                                        case "ab":
545
                                                                output.write("</ab>\n");
546
                                                                break;
547
                                                        case "q":
548
                                                                output.write("</q>\n");
549
                                                        //flagq= false;
550
                                                                levelq = levelq - 1;
551
                                                                break;
552
                                                        case "sp":
553
                                                                output.write("</sp>\n");
554
                                                                flagSp = false;
555
                                                                break;
556
                                                        case "front":
557
                                                                output.write("</front>\n");
558
                                                                break;
559
                                                        case "body":
560
                                                                output.write("</body>\n");
561
                                                                break;
562
                                                        case "back":
563
                                                                output.write("</back>\n");
564
                                                                break;
565

    
566
                                                        //                                                        case "fw":
567
                                                        //                                                        flagfw = false;
568
                                                        //                                                        break;
569

    
570
                                                        case "choice":
571
                                                                if(vOrig == "")
572
                                                                        vOrig="NA";
573
                                                                if(vSic == "")
574
                                                                        vSic="NA";
575
                                                                if(vAbbr == "")
576
                                                                        vAbbr="NA";
577

    
578
                                                                String ref;
579
                                                                if(USEVERSE)
580
                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
581
                                                                else
582
                                                                        ref = titreId+", p."+pb_n;
583

    
584
                                                                if (flagfw) {
585
                                                                        // on est hors texte
586
                                                                } else {
587
                                                                        String vFormToWrite = vForm;
588
                                                                        if (vCorr != "") {
589
                                                                                vFormToWrite = vCorr;
590
                                                                        } else if(vReg != "") {
591
                                                                                vFormToWrite = vReg;
592
                                                                        } else if(vExpan != "") {
593
                                                                                vFormToWrite = vExpan
594
                                                                        }
595
                                                                        firstWord = false;
596
                                                                        output.write( vFormToWrite +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+
597
                                                                                        "\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
598
                                                                        for(String type : anaTypes) {
599
                                                                                output.write("\t"+anahash.get(type));
600
                                                                        }
601
                                                                        output.write("\n")
602
                                                                }
603
                                                                flagchoice = false;
604
                                                                vCorr= "";
605
                                                                vSic= "";
606
                                                                break;
607
                                                        case "corr":
608
                                                                flagcorr = false;
609

    
610
                                                                break;
611
                                                        case "reg":
612
                                                                flagreg = false;
613
                                                                vReg = "";
614
                                                                break;
615
                                                        case "expan":
616
                                                                flagexpan = false;
617
                                                                vExpan= "";
618
                                                                break;
619
                                                        case "orig":
620
                                                                flagreg = false;
621
                                                                vOrig= "";
622
                                                                break;
623
                                                        case "sic":
624
                                                                flagsic = false;
625

    
626
                                                                break;
627
                                                        case "abbr":
628
                                                                flagreg = false;
629
                                                                vAbbr= "";
630
                                                                break;
631

    
632
                                                        case "foreign":
633
                                                                flagForeign = false;
634
                                                                lang = "";
635
                                                                output.write("</foreign>\n");
636
                                                                break;
637

    
638
                                                        case "name":
639
                                                        //flagName = false;
640
                                                                nameType = "";
641
                                                                output.write("</name>\n");
642
                                                                break;
643

    
644
                                                        case "supplied":
645
                                                        //flagSupplied = false;
646
                                                                levelSupplied = levelSupplied - 1;
647
                                                                break;
648

    
649
                                                        case "surplus":
650
                                                                flagfw = false;
651
                                                                break;
652

    
653
                                                        case "del":
654
                                                                flagfw = false;
655
                                                                break;
656

    
657
                                                        case "w":
658
                                                                if (captureword) {
659
                                                                        if (flagchoice) {
660

    
661
                                                                        } else if(flagfw) {
662

    
663
                                                                        } else {
664
                                                                                if (vOrig == "")
665
                                                                                        vOrig="NA";
666
                                                                                if(vSic == "")
667
                                                                                        vSic="NA";
668
                                                                                if(vAbbr == "")
669
                                                                                        vAbbr="NA";
670
                                                                                if (nameType == "")
671
                                                                                        nameType = "NA";
672
                                                                                if(lang == "")
673
                                                                                        lang="fr"
674

    
675
                                                                                String ref;
676
                                                                                if(USEVERSE)
677
                                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
678
                                                                                else
679
                                                                                        ref = titreId+", p."+pb_n;
680

    
681
                                                                                firstWord = false;
682
                                                                                output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
683
                                                                                for(String type : anaTypes) {
684
                                                                                        output.write("\t"+anahash.get(type));
685
                                                                                }
686
                                                                                output.write("\n")
687
                                                                        }
688

    
689
                                                                        flagWord = false;
690
                                                                }
691
                                                                break;
692

    
693
                                                        case "form":
694
                                                                flagForm = false;
695
                                                                break;
696

    
697
                                                        case "ana":
698
                                                                flagAna = false;
699
                                                                anahash.put(anaType, vAna);
700
                                                                vAna = "";
701
                                                                break;
702
                                                }
703
                                                break; // end elem
704

    
705
                                        case XMLStreamConstants.CHARACTERS:
706
                                                if (flagAna) {
707
                                                        vAna += parser.getText().trim()
708
                                                }
709

    
710
                                                if (flagForm) {
711
                                                        vForm += parser.getText().trim();
712
                                                        if (flagchoice) {
713
                                                                if (flagsic) {
714
                                                                        vSic += parser.getText().trim();
715
                                                                }
716
                                                                if (flagorig) {
717
                                                                        vOrig += parser.getText().trim();
718
                                                                }
719
                                                                if (flagabbr) {
720
                                                                        vAbbr += parser.getText().trim();
721
                                                                }
722
                                                                if (flagcorr) {
723
                                                                        vCorr += parser.getText().trim();
724
                                                                }
725
                                                        }
726
                                                }
727
                                }
728
                        }
729
                        //output.write("</txmcorpus>");
730
                        output.close();
731
                        parser.close();
732
                        inputData.close();
733
                } catch (Exception ex) {
734
                        System.out.println("Exception while parsing " + inputData);
735
                        ex.printStackTrace();
736
                        return false;
737
                }
738

    
739
                return true;
740
        }
741

    
742

    
743
        /**
         * Compiles the TEI-TXM files of a directory into one CQP source file, then
         * encodes and indexes it with the CwbEncode and CwbMakeAll wrappers.
         *
         * Steps:
         * 1. recreate the cqp/ and data/&lt;corpusname&gt; directories (and make sure
         *    registry/ exists) under binDir;
         * 2. order the files of txmDir by composition date, files with the same
         *    date keeping their alphabetical order;
         * 3. write every text inside a single &lt;txmcorpus&gt; element in
         *    cqp/&lt;corpusname&gt;.cqp;
         * 4. declare the positional and structural attributes and run the CWB tools.
         *
         * @param binDir the directory receiving the cqp/, data/ and registry/ sub-directories
         * @param txmDir the directory containing the TEI-TXM files to compile
         * @param corpusname the corpus name, used to name the cqp file, the data directory and the registry file
         * @param metadataXPath text metadata declarations; its keys are declared as text attributes and its
         *        optional "datecompo" entry replaces the default XPath used to read the composition date.
         *        null is tolerated by this method (it is still handed as-is to each per-file compiler)
         * @return true, if successful
         */
        public boolean run(File binDir, File txmDir, String corpusname, Properties metadataXPath)
        {
                sattrsListener = null; // reset SAttribute Listener for each new import
                this.metadataXPath = metadataXPath;

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }
                if (!txmDir.exists()) {
                        println ("binary directory does not exists: "+txmDir)
                        return false;
                }

                // File.listFiles() returns null when the path is not a directory or on I/O error
                File[] listing = txmDir.listFiles();
                if (listing == null) {
                        println ("Error: could not list the files of: "+txmDir)
                        return false;
                }
                List<File> files = new ArrayList<File>(Arrays.asList(listing));

                // start from empty cqp/ and data/<corpusname> directories;
                // mkdirs() also creates a missing parent (e.g. data/)
                File cqpFile = new File(binDir, "cqp/${corpusname}.cqp");
                new File(binDir, "/cqp/").deleteDir();
                new File(binDir, "/cqp/").mkdirs();
                new File(binDir, "/data/${corpusname}").deleteDir();
                new File(binDir, "/data/${corpusname}").mkdirs();
                new File(binDir, "registry/").mkdirs();

                // get the composition date [year, month, day] of each text to be able to sort with it
                HashMap<File,Integer[]> filesiecle = new HashMap<File, Integer[]>()
                for (File f : files) {
                        Integer[] date = [0, 0, 0]; // kept when the date is missing or unreadable
                        String xpath = "//tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when"
                        if (metadataXPath != null && metadataXPath.containsKey("datecompo"))
                                xpath = metadataXPath.get("datecompo")
                        String datecompo = XPathResult.getXpathResponse(f, xpath);
                        if (datecompo != null) {
                                String[] split = datecompo.split("-"); // yyyy-mm-dd
                                try {
                                        if (split.length == 3) {
                                                date = [Integer.parseInt(split[0]), Integer.parseInt(split[1]), Integer.parseInt(split[2])];
                                        } else if (split.length == 1) { // yyyy
                                                date = [Integer.parseInt(split[0]), 1, 1];
                                        }
                                } catch (NumberFormatException e) {
                                        // a malformed date must not abort the whole import: the text is sorted with date 0
                                        println "Warning: could not parse composition date '"+datecompo+"' of "+f
                                }
                        }
                        filesiecle.put(f, date);
                }

                Collections.sort(files); // Alpha order
                // Date order; the sort is stable so equal dates keep the alpha order
                Collections.sort(files, new Comparator<File>() {
                                        @Override
                                        public int compare(File o1, File o2) {
                                                Integer[] date1 = filesiecle.get(o1);
                                                Integer[] date2 = filesiecle.get(o2);
                                                // year first, then month, then day
                                                for (int i = 0 ; i < 3 ; i++) {
                                                        int cmp = date1[i].compareTo(date2[i]);
                                                        if (cmp != 0) return cmp;
                                                }
                                                return 0;
                                        }
                                });

                this.orderedFiles = files;
                println("process "+files.size()+" files ")

                // open txmcorpus
                if (!createOutput(cqpFile)) {
                        println "Error: could not write cqp file"
                        return false;
                }
                output.write("<txmcorpus lang=\""+lang+"\">\n");
                output.close();

                //1- Transform into CQP file
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                                continue;
                        }
                        print "."
                        // drops the last 4 characters of the file name — presumably a ".xml" extension; verify against callers
                        String txtname = f.getName().substring(0, f.getName().length()-4);
                        def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath);
                        builder.setLang(lang)
                        if (!builder.transfomFileCqp(cqpFile)) {
                                println "Failed to compile "+f
                        }
                        builder.setAnnotationDone(this.annotate_status);
                }

                //close txmcorpus
                if (!createOutput(cqpFile)) {
                        println "Error: could not write cqp file"
                        return false;
                }
                output.write("</txmcorpus>\n");
                output.close();
                println ""

                //2- Import into CWB
                def outDir = binDir.getAbsolutePath();
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                def pAttrs = ["id","q","sp","pb","lb","orig","sic","abbr","ref","pos","supplied","lang","nametype"];
                for (String type : anaTypes)
                        pAttrs.add(type.substring(1)); // remove #

                // the listener was reset at the top of this method; it is still null
                // when no file went through the compilation step
                if (sattrsListener == null) {
                        println "Error: no structure was collected from the source files"
                        return false;
                }
                structs = sattrsListener.getStructs();
                structsProf = sattrsListener.getProfs();
                if (debug) {
                        println structs
                        println structsProf
                }

                // true when the "name:depth+attr1+attr2" declaration already lists exactly this attribute
                // (a plain contains("+n") would be fooled by "+name", "+nb"...)
                def declares = { String decl, String attr ->
                        String[] parts = decl.split("\\+");
                        for (int i = 1 ; i < parts.length ; i++) // parts[0] is "name:depth"
                                if (parts[i].equals(attr)) return true;
                        return false;
                }
                // appends, in order, every attribute the declaration does not list yet
                def withAttributes = { String decl, List<String> attrs ->
                        for (String attr : attrs)
                                if (!declares(decl, attr)) decl += "+"+attr;
                        return decl;
                }
                // attributes always declared for these structures, found in the sources or not
                Map<String, List<String>> requiredAttrs = [
                        "p": ["n"],
                        "body": ["n"],
                        "back": ["n"],
                        "front": ["n"],
                        "q": ["n", "lang"],
                        "foreign": ["n", "lang"],
                        "ab": ["n", "subtype", "rend"],
                        "sp": ["n", "subtype", "rend"],
                        "div": ["n", "id", "type", "subtype"],
                        "name": ["n", "type"]
                ]

                // add structures+properties found in sources
                List<String> sargs = new ArrayList<String>();
                for (String name : structs.keySet()) {
                        if (name == "text") continue; // added after
                        String concat = name+":"+structsProf.get(name); // append the depth
                        for (String value : structs.get(name)) // append the attributes
                                concat += "+"+value;
                        List<String> required = requiredAttrs.get(name);
                        if (required != null)
                                concat = withAttributes(concat, required);
                        sargs.add(concat);
                }

                String textSAttributes = "text:0+id+base+project";
                if (metadataXPath != null) {
                        for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
                                textSAttributes+="+"+meta;
                }
                if (metadataXPath == null || !metadataXPath.keySet().contains("sigle"))
                        textSAttributes+="+sigle";

                sargs.add(textSAttributes)
                sargs.add("txmcorpus:0+lang")
                Collections.sort(sargs);

                String[] sAttributes = sargs;
                String[] pAttributes = pAttrs;
                println "P-attributes: "+pAttributes
                println "S-attributes: "+sargs

                try {
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lower case registry files
                        cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname, outDir + "/registry");
                } catch (Exception ex) {System.out.println(ex);return false;}

                return true;
        }
958

    
959
        /**
         * Switches debug mode on: run() then prints the collected structures and
         * hands the flag to the CWB wrappers through their setDebug(debug) calls.
         */
        public void setDebug()
        {
                debug = true;
        }
966

    
967
        /**
         * Manual test entry point: runs the compiler in debug mode on the
         * xml/bfm directory of the user's home, with the CWB binaries taken from
         * TXM/cwb/bin in the same home directory.
         *
         * @param args the arguments (not read)
         */
        public static void main(String[] args)
        {
                // java.io.File does no shell-style "~" expansion: resolve the home directory explicitly
                String home = System.getProperty("user.home");
                File dir = new File(home, "xml/bfm");
                def c = new compiler();
                c.setDebug();
                // absolute path for the same reason; it stays valid even if setCwbPath expands "~" itself
                c.setCwbPath(new File(home, "TXM/cwb/bin").getPath());
                // NOTE(review): the run method visible nearby takes (binDir, txmDir, corpusname, metadataXPath);
                // confirm that a one-argument overload exists before relying on this call
                c.run(dir);
        }
980
}