Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / bfm / compiler.groovy @ 1688

History | View | Annotate | Download (27 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
41
// $LastChangedRevision: 3400 $
42
// $LastChangedBy: mdecorde $
43
//
44
package org.txm.scripts.importer.bfm
45

    
46
import org.txm.Toolbox;
47
import org.txm.importer.cwb.*
48
import org.txm.scripts.importer.*;
49
import org.txm.scripts.*;
50
import org.txm.importer.scripts.xmltxm.*;
51
import org.txm.utils.treetagger.TreeTagger;
52
import org.txm.objects.*
53
import javax.xml.stream.*;
54
import java.net.URL;
55
import java.io.File;
56
import java.util.Comparator;
57
import java.util.HashMap;
58
import java.util.List;
59
import org.txm.searchengine.cqp.corpus.*
60

    
61
/**
62
 * Produce CQP files from the TEI-TXM files. <br/>
63
 * - Read texts metadata with XPath queries <br/>
64
 * - Add the following word properties : sic, abbr, orig, lb and pb <br/>
65
 * - Keep &lt;front>, &lt;body> and &lt;back> for each text <br/>
66
 * - Text enclosed in &lt;q> is tokenized <br/>
67
 * 
68
 * @author mdecorde
69
 *
70
 */
71
class compiler {
72
        /** The debug. */
73
        private boolean debug= false;
74

    
75
        /** The annotate_status. */
76
        private boolean annotate_status=true;
77

    
78
        /** The input data. */
79
        private def inputData;
80

    
81
        /** The factory. */
82
        private def factory;
83

    
84
        /** The parser. */
85
        private XMLStreamReader parser;
86

    
87
        /** The dir. */
88
        private def dir;
89

    
90
        /** The output. */
91
        private def output;
92

    
93
        /** The url. */
94
        private def url;
95

    
96
        /** The anahash. */
97
        static boolean firstWord = true;
98
        static private def anaTypes = [];
99
        private HashMap<String,String> anahash = new HashMap<String,String>() ;
100

    
101
        private static SAttributesListener sattrsListener;
102
        private static HashMap<String,ArrayList<String>> structs;
103
        private static HashMap<String, Integer> structsProf;
104

    
105
        /** The text. */
106
        private String text="";
107

    
108
        /** The base. */
109
        private String base="";
110

    
111
        /** The lang. */
112
        private String lang ="fr";
113

    
114
        /**
115
         * Contains the metadata XPath queries, organized by name.
116
         */
117
        Properties metadataXPath;
118

    
119
        /**
120
         * initialize.
121
         *
122
         */
123
        public compiler(){
124
                firstWord = true;
125
                anaTypes = [];
126
        }
127

    
128
        /**
129
         * initialize the compiler.
130
         *
131
         * @param url the file to process
132
         * @param text the text's name
133
         * @param base the base's name
134
         * @param projectName the project's name
         * @param metadataXPath the metadata XPath queries, organized by name
135
         */
136
        public compiler(URL url,String text,String base, String projectName, Properties metadataXPath)
137
        {
138
                this.metadataXPath = metadataXPath;
139
                this.text = text
140
                this.base = base;
141
                try {
142
                        this.url = url;
143
                        inputData = url.openStream();
144

    
145
                        factory = XMLInputFactory.newInstance();
146
                        parser = factory.createXMLStreamReader(inputData);
147
                        if (sattrsListener == null)
148
                                sattrsListener = new SAttributesListener(parser);
149
                        else
150
                                sattrsListener.start(parser)
151
                } catch (XMLStreamException ex) {
152
                        System.out.println(ex);
153
                }catch (IOException ex) {
154
                        System.out.println("IOException while parsing ");
155
                }
156
        }
157

    
158
        ArrayList<File> orderedFiles;
159
        public ArrayList<File> getOrderedTxmFiles() {
160
                return orderedFiles;
161
        }
162

    
163
        /**
164
         * Sets the lang.
165
         *
166
         * @param lang the new lang
167
         */
168
        public void setLang(String lang)
169
        {
170
                this.lang = lang;
171
        }
172

    
173
        /**
174
         * Sets the annotation done.
175
         *
176
         * @param done the new annotation done
177
         */
178
        public void setAnnotationDone(boolean done)
179
        {
180
                this.annotate_status = done;
181
        }
182

    
183
        /**
184
         * Creates the output.
185
         *
186
         * @param f the file to write to; content is appended when the file already exists
188
         * @return true, if successful
189
         */
190
        private boolean createOutput(File f) {
191
                try {
192
                        output = new OutputStreamWriter(new FileOutputStream(f,f.exists()) , "UTF-8");
193
                } catch (Exception e) {
194
                        System.out.println(e.getLocalizedMessage());
195
                        return false;
196
                }
197
                return true;
198
        }
199

    
200
        /**
201
         * Go to text.
202
         */
203
        private void GoToText()
204
        {
205
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
206
                        if (event == XMLStreamConstants.END_ELEMENT)
207
                                if (parser.getLocalName().equals("teiHeader"))
208
                                        return;
209
                }
210
        }
211

    
212
        /**
213
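         * Computes the next counter value for the current element: the integer value of its "n" attribute when present and parseable, otherwise value + 1.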
214
         *
215
         * @param parser the parser
216
         * @param value the value
217
         * @return the new counter value
218
         */
219
        private def increment(XMLStreamReader parser, int value)
220
        {
221
                String n=null;
222
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
223
                        //System.out.println("attr name "+parser.getAttributeLocalName(i));
224
                        if (parser.getAttributeLocalName(i) == "n") {
225
                                n = parser.getAttributeValue(i);
226
                                break;
227
                        }
228
                }
229
                //System.out.println("inc n "+n);
230
                if (n != null)
231
                        try {
232
                                value = Integer.parseInt(n);
233
                                return value;
234
                        }
235
                        catch (Exception e) {return value+1;}
236

    
237
                value = value+1;
238
                return value;
239
        }
240

    
241
        /**
242
         * Transform file to CQP.
243
         *
244
         * @param project the project; its name is written on the text element
         * @param cqpFile the CQP file to write
246
         * @return true, if successful
247
         */
248
        private boolean transfomFileCqp(Project project, File cqpFile)
249
        {
250
                try {
251
                        if (!createOutput(cqpFile)) return false;
252

    
253
                        String headvalue = ""
254
                        String vAna = "";
255
                        String vForm = "";
256
                        String wordid = "";
257
                        String vHead = "";
258

    
259
                        Integer p_id = 0;
260
                        Integer s_id = 0;
261
                        Integer q_id = 0;
262
                        int sp_id = 0;
263
                        Integer body_id = 0;
264
                        Integer front_id = 0;
265
                        Integer back_id = 0;
266
                        Integer lb_id = 0;
267
                        Integer pb_id = 0;
268
                        Integer ab_id = 0;
269
                        int foreign_id = 0;
270
                        int name_id = 0;
271

    
272
                        boolean captureword = false;
273

    
274
                        String vExpan = "";
275
                        String vCorr = "";
276
                        String vReg = "";
277
                        String vOrig = "";
278
                        String vSic = "";
279
                        String vAbbr = "";
280
                        String givenpos = "";
281
                        String pb_n = "";
282
                        String foreign_lang = "";
283
                        String nameType = "";
284
                        String anaType;
285
                        //String abType = "";
286
                        
287
                        boolean foundtei=false, foundtext=false;
288

    
289
                        boolean flaglg = false;
290
                        int levelq = 0;
291
                        //boolean flagq = false;
292
                        boolean flaghead = false;
293
                        //Added:
294
                        boolean flagSp = false;
295
                        boolean flagAuthor = false;
296
                        boolean flagDate = false;
297
                        boolean flagWord = false;
298
                        boolean flagForm = false;
299
                        boolean flagAna = false;
300

    
301
                        boolean flagchoice = false;
302
                        boolean flagcorr = false;
303
                        boolean flagsic = false;
304
                        boolean flagreg = false;
305
                        boolean flagexpan = false;
306
                        boolean flagorig = false;
307
                        boolean flagabbr = false;
308
                        boolean flagfw = false;
309
                        //boolean flagSupplied = false;
310
                        int levelSupplied = 0;
311
                        //boolean flagSurplus = false;
312
                        boolean flagForeign = false;
313
                        //boolean flagName = false;
314

    
315
                        this.GoToText();
316
                        int missingId= 0
317
                        boolean USEVERSE = false; // switch default reference to verse references
318
                        String titreId; // the title to use in the reference
319
                        
320
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
321
                                switch (event) {
322
                                        case XMLStreamConstants.START_ELEMENT:
323

    
324
                                                String localname = parser.getLocalName();
325
                                                if (foundtext) sattrsListener.startElement(localname);
326
                                                
327
                                                switch (localname) {
328
                                                        case "TEI":
329
                                                                foundtei = true;
330
                                                                break;
331
                                                        case "text":
332
                                                                foundtext = true;
333
                                                                sattrsListener.startElement(localname);
334
                                                                output.write("<text id=\""+text+"\"")
335

    
336
                                                                for (int i = 0; i < parser.getAttributeCount() ; i++) {
337
                                                                        String name = parser.getAttributeLocalName(i);
338
                                                                        if ("id" == name || "base" == name || "project" == name) continue;
339
                                                                        output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
340
                                                                        
341
                                                                        if (name == "forme") {
342
                                                                                USEVERSE = (parser.getAttributeValue(i).contains("vers"))
343
                                                                        } else if (name == "sigle") {
344
                                                                                titreId = parser.getAttributeValue(i)
345
                                                                        }
346
                                                                }
347
                                                                
348
                                                                output.write(" base=\""+base+"\" project=\""+project.getName()+"\">\n");
349
                                                                captureword=true;
350
                                                                break;
351

    
352
                                                        case "div":
353
                                                        //output.write("<div type=\""+parser.getAttributeValue(null,"type")+"\">\n");
354
                                                                String divType = "NA";
355
                                                                String divSubtype = "NA";
356
                                                                String divN = "NA";
357
                                                                String divId ="NA";
358
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++) {
359
                                                                        if(parser.getAttributeLocalName(i) == "type") {
360
                                                                                divType = parser.getAttributeValue(i);
361
                                                                        } else if(parser.getAttributeLocalName(i) == "subtype") {
362
                                                                                divSubtype = parser.getAttributeValue(i);
363
                                                                        } else if(parser.getAttributeLocalName(i) == "n") {
364
                                                                                divN = parser.getAttributeValue(i);
365
                                                                        } else if(parser.getAttributeLocalName(i) == "id") {
366
                                                                                divId = parser.getAttributeValue(i);
367
                                                                                break;
368
                                                                        }
369
                                                                }
370
                                                                output.write("<div type=\""+divType+"\" subtype=\""+divSubtype+"\" n=\""+divN+"\" id=\""+divId+"\">\n");
371
                                                                break;
372
                                                        case "p":
373
                                                                p_id = increment(parser, p_id);
374
                                                                output.write("<p n=\""+p_id+"\">\n");
375
                                                                break;
376
                                                        case "ab":
377
                                                                ab_id = increment(parser, ab_id)
378
                                                                output.write("<ab n=\""+(ab_id)+"\" type=\""+parser.getAttributeValue(null,"type")+"\" subtype=\""+parser.getAttributeValue(null,"subtype")+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
379
                                                                break;
380
                                                        case "q":
381
                                                                q_id = increment(parser, q_id)
382
                                                                output.write("<q n=\""+(q_id)+"\">\n");
383
                                                        //flagq=true;
384
                                                                levelq = levelq + 1;
385
                                                                break;
386
                                                        case "sp":
387
                                                                sp_id = increment(parser, sp_id)
388
                                                                output.write("<sp n=\""+(sp_id)+"\">\n");
389
                                                                flagSp = true;
390
                                                                break;
391
                                                        case "front":
392
                                                                front_id = increment(parser, front_id)
393
                                                                output.write("<front n=\""+front_id+"\">\n");
394
                                                                break;
395
                                                        case "body":
396
                                                                body_id= increment(parser, body_id)
397
                                                                output.write("<body n=\""+body_id+"\">\n");
398
                                                                break;
399
                                                        case "back":
400
                                                                back_id = increment(parser, back_id)
401
                                                                output.write("<back n=\""+back_id+"\">\n");
402
                                                                break;
403
                                                        case "lb":
404
                                                                lb_id = increment(parser, lb_id)
405
                                                                break;
406
                                                        case "pb":
407
                                                                pb_id = increment(parser, pb_id)
408
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
409
                                                                        if (parser.getAttributeLocalName(i) == "n") {
410
                                                                                pb_n = parser.getAttributeValue(i);
411
                                                                        }
412
                                                                }
413
                                                                break;
414
                                                        case "s":
415
                                                                s_id = increment(parser, s_id)
416
                                                                output.write("<s n=\""+s_id+"\">\n");
417
                                                                break;
418
                                                        case "choice":
419
                                                                flagchoice = true;
420
                                                                break;
421
                                                        case "corr":
422
                                                                flagcorr = true;
423
                                                                vCorr= "";
424
                                                                break;
425
                                                        case "reg":
426
                                                                flagreg = true;
427
                                                                vReg= "";
428
                                                                break;
429
                                                        case "expan":
430
                                                                flagexpan = true;
431
                                                                vExpan= "";
432
                                                                break;
433
                                                        case "orig":
434
                                                                flagreg = true;
435
                                                                vOrig= "";
436
                                                                break;
437
                                                        case "sic":
438
                                                                flagsic = true;
439
                                                                vSic= "";
440
                                                                break;
441
                                                        case "abbr":
442
                                                                flagreg = true;
443
                                                                vAbbr= "";
444
                                                                break;
445
                                                        case "foreign":
446
                                                                flagForeign = true;
447
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
448
                                                                        if (parser.getAttributeLocalName(i) == "lang") {
449
                                                                                lang = parser.getAttributeValue(i);
450
                                                                                break;
451
                                                                        }
452
                                                                }
453

    
454
                                                                output.write("<foreign n=\""+(foreign_id++)+"\" lang=\""+lang+"\">\n");
455
                                                        //vForeign = "";
456
                                                                break;
457

    
458
                                                        case "name":
459
                                                        //flagName = true;
460
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++)
461
                                                                        if(parser.getAttributeLocalName(i) == "type")
462
                                                                {
463
                                                                        nameType = parser.getAttributeValue(i);
464
                                                                        break;
465
                                                                }
466

    
467
                                                                output.write("<name n=\""+(name_id++)+"\" type=\""+nameType+"\">\n");
468
                                                                break;
469
                                                        case "supplied":
470
                                                        //flagSupplied = true;
471
                                                                levelSupplied = levelSupplied + 1;
472
                                                                break;
473

    
474
                                                        case "surplus":
475
                                                                flagfw = true;
476
                                                                break;
477

    
478
                                                        case "del":
479
                                                                flagfw = true;
480
                                                                break;
481

    
482
                                                        case "w":
483
                                                                givenpos = "";
484
                                                                wordid = "w_"+text+"_m"+missingId++
485
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
486
                                                                        if (parser.getAttributeLocalName(i) == "id") {
487
                                                                                wordid = parser.getAttributeValue(i);
488
                                                                        } else if (parser.getAttributeLocalName(i) == "type") {
489
                                                                                givenpos = parser.getAttributeValue(i);
490
                                                                        }
491
                                                                }
492
                                                                if (wordid.startsWith("w")) {
493
                                                                        if (!wordid.startsWith("w_"))
494
                                                                                wordid = "w_"+wordid.substring(1)
495
                                                                } else {
496
                                                                        wordid = "w_"+wordid;
497
                                                                }
498

    
499
                                                                if (givenpos == null || givenpos == "")
500
                                                                        givenpos = "NA";
501
                                                                vForm = "";
502
                                                                anahash.clear(); // remove previous word ana values
503
                                                                flagWord = true;
504
                                                                break;
505
                                                        case "form":
506
                                                                flagForm = true;
507
                                                                vForm = "";
508
                                                                break;
509

    
510
                                                        case "ana":
511
                                                                flagAna = true;
512
                                                                anaType = parser.getAttributeValue(null, "type")
513
                                                                anahash.put(anaType, "");
514
                                                                if (firstWord) {
515
                                                                        anaTypes << anaType;
516
                                                                }
517
                                                                break;
518
                                                }
519
                                                break;
520

    
521
                                        case XMLStreamConstants.END_ELEMENT:
522
                                                String localname = parser.getLocalName();
523
                                                if (foundtext) sattrsListener.endElement(localname);
524

    
525
                                                switch (localname) {
526
                                                        case "div":
527
                                                                output.write("</div>\n");
528
                                                                break;
529
                                                        case "text":
530
                                                                output.write("</text>\n");
531
                                                                captureword=false;
532
                                                                break;
533
                                                        case "p":
534
                                                                output.write("</p>\n");
535
                                                                break;
536
                                                        case "s":
537
                                                                output.write("</s>\n");
538
                                                                break;
539
                                                        case "ab":
540
                                                                output.write("</ab>\n");
541
                                                                break;
542
                                                        case "q":
543
                                                                output.write("</q>\n");
544
                                                        //flagq= false;
545
                                                                levelq = levelq - 1;
546
                                                                break;
547
                                                        case "sp":
548
                                                                output.write("</sp>\n");
549
                                                                flagSp = false;
550
                                                                break;
551
                                                        case "front":
552
                                                                output.write("</front>\n");
553
                                                                break;
554
                                                        case "body":
555
                                                                output.write("</body>\n");
556
                                                                break;
557
                                                        case "back":
558
                                                                output.write("</back>\n");
559
                                                                break;
560

    
561
                                                        //                                                        case "fw":
562
                                                        //                                                        flagfw = false;
563
                                                        //                                                        break;
564

    
565
                                                        case "choice":
566
                                                                if(vOrig == "")
567
                                                                        vOrig="NA";
568
                                                                if(vSic == "")
569
                                                                        vSic="NA";
570
                                                                if(vAbbr == "")
571
                                                                        vAbbr="NA";
572

    
573
                                                                String ref;
574
                                                                if(USEVERSE)
575
                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
576
                                                                else
577
                                                                        ref = titreId+", p."+pb_n;
578

    
579
                                                                if (flagfw) {
580
                                                                        // on est hors texte
581
                                                                } else {
582
                                                                        String vFormToWrite = vForm;
583
                                                                        if (vCorr != "") {
584
                                                                                vFormToWrite = vCorr;
585
                                                                        } else if(vReg != "") {
586
                                                                                vFormToWrite = vReg;
587
                                                                        } else if(vExpan != "") {
588
                                                                                vFormToWrite = vExpan
589
                                                                        }
590
                                                                        firstWord = false;
591
                                                                        output.write( vFormToWrite +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+
592
                                                                                        "\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
593
                                                                        for(String type : anaTypes) {
594
                                                                                output.write("\t"+anahash.get(type));
595
                                                                        }
596
                                                                        output.write("\n")
597
                                                                }
598
                                                                flagchoice = false;
599
                                                                vCorr= "";
600
                                                                vSic= "";
601
                                                                break;
602
                                                        case "corr":
603
                                                                flagcorr = false;
604

    
605
                                                                break;
606
                                                        case "reg":
607
                                                                flagreg = false;
608
                                                                vReg = "";
609
                                                                break;
610
                                                        case "expan":
611
                                                                flagexpan = false;
612
                                                                vExpan= "";
613
                                                                break;
614
                                                        case "orig":
615
                                                                flagreg = false;
616
                                                                vOrig= "";
617
                                                                break;
618
                                                        case "sic":
619
                                                                flagsic = false;
620

    
621
                                                                break;
622
                                                        case "abbr":
623
                                                                flagreg = false;
624
                                                                vAbbr= "";
625
                                                                break;
626

    
627
                                                        case "foreign":
628
                                                                flagForeign = false;
629
                                                                lang = "";
630
                                                                output.write("</foreign>\n");
631
                                                                break;
632

    
633
                                                        case "name":
634
                                                        //flagName = false;
635
                                                                nameType = "";
636
                                                                output.write("</name>\n");
637
                                                                break;
638

    
639
                                                        case "supplied":
640
                                                        //flagSupplied = false;
641
                                                                levelSupplied = levelSupplied - 1;
642
                                                                break;
643

    
644
                                                        case "surplus":
645
                                                                flagfw = false;
646
                                                                break;
647

    
648
                                                        case "del":
649
                                                                flagfw = false;
650
                                                                break;
651

    
652
                                                        case "w":
653
                                                                if (captureword) {
654
                                                                        if (flagchoice) {
655

    
656
                                                                        } else if(flagfw) {
657

    
658
                                                                        } else {
659
                                                                                if (vOrig == "")
660
                                                                                        vOrig="NA";
661
                                                                                if(vSic == "")
662
                                                                                        vSic="NA";
663
                                                                                if(vAbbr == "")
664
                                                                                        vAbbr="NA";
665
                                                                                if (nameType == "")
666
                                                                                        nameType = "NA";
667
                                                                                if(lang == "")
668
                                                                                        lang="fr"
669

    
670
                                                                                String ref;
671
                                                                                if(USEVERSE)
672
                                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
673
                                                                                else
674
                                                                                        ref = titreId+", p."+pb_n;
675

    
676
                                                                                firstWord = false;
677
                                                                                output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
678
                                                                                for(String type : anaTypes) {
679
                                                                                        output.write("\t"+anahash.get(type));
680
                                                                                }
681
                                                                                output.write("\n")
682
                                                                        }
683

    
684
                                                                        flagWord = false;
685
                                                                }
686
                                                                break;
687

    
688
                                                        case "form":
689
                                                                flagForm = false;
690
                                                                break;
691

    
692
                                                        case "ana":
693
                                                                flagAna = false;
694
                                                                anahash.put(anaType, vAna);
695
                                                                vAna = "";
696
                                                                break;
697
                                                }
698
                                                break; // end elem
699

    
700
                                        case XMLStreamConstants.CHARACTERS:
701
                                                if (flagAna) {
702
                                                        vAna += parser.getText().trim()
703
                                                }
704

    
705
                                                if (flagForm) {
706
                                                        vForm += parser.getText().trim();
707
                                                        if (flagchoice) {
708
                                                                if (flagsic) {
709
                                                                        vSic += parser.getText().trim();
710
                                                                }
711
                                                                if (flagorig) {
712
                                                                        vOrig += parser.getText().trim();
713
                                                                }
714
                                                                if (flagabbr) {
715
                                                                        vAbbr += parser.getText().trim();
716
                                                                }
717
                                                                if (flagcorr) {
718
                                                                        vCorr += parser.getText().trim();
719
                                                                }
720
                                                        }
721
                                                }
722
                                }
723
                        }
724
                        //output.write("</txmcorpus>");
725
                        output.close();
726
                        parser.close();
727
                        inputData.close();
728
                } catch (Exception ex) {
729
                        System.out.println("Exception while parsing " + inputData);
730
                        ex.printStackTrace();
731
                        if (output != null) output.close();
732
                        if (parser != null) parser.close();
733
                        if (inputData != null) inputData.close();
734
                        return false;
735
                }
736

    
737
                return true;
738
        }
739

    
740

    
741
        /**
         * Builds the CWB corpus of a project from the TEI-TXM files of a directory.
         *
         * The method (1) replaces any previous corpus build of the project,
         * (2) orders the files of txmDir by composition date (ties keep the
         * alphabetical file order), (3) writes every file into a single CQP
         * file under binDir/cqp, and (4) runs cwb-encode and cwb-makeall with
         * the positional and structural attributes collected during step 3.
         *
         * @param project the project whose corpus is (re)built
         * @param binDir the output directory; receives the "cqp", "data" and "registry" sub-directories
         * @param txmDir the directory containing the TEI-TXM files
         * @param corpusname the corpus name, used for the CQP file, the data directory and the registry file
         * @param metadataXPath text metadata declarations; its keys are declared as "text" attributes and
         *        its optional "datecompo" entry overrides the XPath used to read the composition date. May be null.
         * @return true, if successful
         */
        public boolean run(Project project, File binDir, File txmDir, String corpusname, Properties metadataXPath)
        {
                sattrsListener = null; // reset SAttribute Listener for each new import
                this.metadataXPath = metadataXPath;

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }

                CorpusBuild corpus = project.getCorpusBuild(project.getName());
                if (corpus != null) {
                        corpus.delete(); // remove old files
                }

                // make new one
                corpus = new MainCorpus(project);
                corpus.setID(project.getName());
                corpus.setName(project.getName());
                corpus.setDescription("Built with the BFM import module");

                File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
                cqpFile.delete()
                new File(binDir,"cqp").mkdirs()
                new File(binDir,"data").mkdirs()
                new File(binDir,"registry").mkdirs()

                // File.listFiles() returns null when txmDir is not a readable directory
                File[] listed = txmDir.listFiles();
                if (listed == null) {
                        println "Error: could not list the files of "+txmDir
                        return false;
                }
                List<File> files = new ArrayList<File>(Arrays.asList(listed));

                // Appends to a cwb-encode structure declaration ("name:depth+attr1+attr2...")
                // every attribute of 'names' that is not declared yet. The attribute name is
                // matched exactly, so that "+name" is not mistaken for "+n".
                def withAttributes = { String declaration, List<String> names ->
                        String result = declaration;
                        for (String attr : names) {
                                boolean declared = result.endsWith("+"+attr) || result.contains("+"+attr+"+");
                                if (!declared) result += "+"+attr;
                        }
                        return result;
                }

                // get text siecles to be able to sort with it
                HashMap<File,Integer[]> filesiecle = new HashMap<File, Integer[]>()
                for (File f : files) {
                        Integer[] date = new Integer[3];
                        date[0] = date[1] = date[2] = 0; // unknown dates sort first
                        String xpath = "//tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when"
                        if (metadataXPath != null && metadataXPath.containsKey("datecompo"))
                                xpath = metadataXPath.get("datecompo")
                        String datecompo = XPathResult.getXpathResponse(f, xpath);
                        if (datecompo != null && datecompo.trim().length() > 0) {
                                String[] split = datecompo.split("-"); // yyyy-mm-dd
                                try {
                                        if (split.length == 3) {
                                                date[0] = Integer.parseInt(split[0]);
                                                date[1] = Integer.parseInt(split[1]);
                                                date[2] = Integer.parseInt(split[2]);
                                        }
                                        else if (split.length == 1) { // yyyy
                                                date[0] = Integer.parseInt(split[0]);
                                                date[1] = 1;
                                                date[2] = 1;
                                        }
                                } catch (NumberFormatException e) {
                                        // a malformed date must not abort the whole import
                                        date[0] = date[1] = date[2] = 0;
                                        println "Warning: could not parse the composition date '"+datecompo+"' of "+f
                                }
                        }
                        filesiecle.put(f, date);
                }

                Collections.sort(files); // Alpha order
                // Date order; the sort is stable, so files with the same date stay in alpha order
                Collections.sort(files, new Comparator<File>() {
                                        @Override
                                        public int compare(File o1, File o2) {
                                                Integer[] date1 = filesiecle.get(o1);
                                                Integer[] date2 = filesiecle.get(o2);
                                                for (int i = 0 ; i < 3 ; i++) { // year, then month, then day
                                                        int c = date1[i].compareTo(date2[i]);
                                                        if (c != 0) return c;
                                                }
                                                return 0;
                                        }
                                });

                this.orderedFiles = files;
                println("process "+files.size()+" files ")

                // write txmcorpus
                if (!createOutput(cqpFile)) {
                        println "Error: could not write cqp file"
                        return false;
                } else {
                        output.write("<txmcorpus lang=\""+lang+"\">\n");
                        output.close();
                }

                //1- Transform into CQP file
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                        } else {
                                print "."
                                // drops the last 4 characters of the file name (presumably the ".xml" extension)
                                String txtname = f.getName().substring(0, f.getName().length()-4);
                                def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath);
                                builder.setLang(lang)
                                if (!builder.transfomFileCqp(project, cqpFile)) {
                                        println "Failed to compile "+f
                                }
                                // NOTE(review): set after the transformation, as in the original code - confirm this order is intended
                                builder.setAnnotationDone(this.annotate_status);
                        }
                }

                // close txmcorpus
                if (!createOutput(cqpFile)) {
                        println "Error: could not write cqp file"
                        return false;
                } else {
                        output.write("</txmcorpus>\n");
                        output.close();
                }
                println ""

                //2- Import into CWB
                def outDir = binDir.getAbsolutePath();
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                def pAttrs = ["id","q","sp","pb","lb","orig","sic","abbr","ref","pos","supplied","lang","nametype"];
                for(String type : anaTypes)
                        pAttrs.add(type.substring(1)); // remove #

                // the listener was reset at the start of this method; it is still null if no text was processed
                if (sattrsListener == null) {
                        println "Error: no structure was read from the files of "+txmDir
                        return false;
                }
                structs = sattrsListener.getStructs();
                structsProf = sattrsListener.getProfs();
                if (debug) {
                        println structs
                        println structsProf
                }

                // add structures+properties found in sources
                List<String> sargs = new ArrayList<String>();
                for (String name : structs.keySet()) {
                        if ( name == "text") continue; // added after
                        String concat = name+":"+structsProf.get(name); // append the depth
                        for (String value : structs.get(name)) // append the attributes
                                concat += "+"+value;
                        if (name == "p" || name == "body" || name == "back" || name == "front")
                                concat = withAttributes(concat, ["n"]);
                        sargs.add(concat);
                }

                String textSAttributes = "text:0+id+base+project";
                if (metadataXPath != null) {
                        for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
                                textSAttributes+="+"+meta;
                }
                if (metadataXPath == null || !metadataXPath.keySet().contains("sigle"))
                        textSAttributes+="+sigle";

                sargs.add(textSAttributes)
                sargs.add("txmcorpus:0+lang")

                // attributes that must always be declared for some structures, keyed by declaration prefix
                Map<String, List<String>> requiredAttributes = [
                        "q:"       : ["n", "lang"],
                        "foreign:" : ["n", "lang"],
                        "ab:"      : ["n", "subtype", "rend"],
                        "sp:"      : ["n", "subtype", "rend"],
                        "div:"     : ["n", "id", "type", "subtype"],
                        "name:"    : ["n", "type"]
                ];
                for (int c = 0 ; c < sargs.size() ; c++) {
                        String sarg = sargs.get(c);
                        for (String prefix : requiredAttributes.keySet()) {
                                if (sarg.startsWith(prefix)) {
                                        sargs.set(c, withAttributes(sarg, requiredAttributes.get(prefix)));
                                        break;
                                }
                        }
                }
                sargs.sort();

                String[] sAttributes = sargs;
                String[] pAttributes = pAttrs;
                println "P-attributes: "+pAttributes
                println "S-attributes: "+sargs

                try {
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lower case registry files
                        cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname, outDir + "/registry");
                } catch (Exception ex) {System.out.println(ex);return false;}

                return true;
        }
963

    
964
        /**
         * Switches the debug mode on: the flag is handed to the CWB
         * utilities so that they show their messages.
         */
        public void setDebug()
        {
                debug = true;
        }
971

    
972
        /**
         * test purpose.
         *
         * Runs the compiler in debug mode on the "xml/bfm" directory of the
         * user home, with the CWB binaries taken from "TXM/cwb/bin" of the
         * user home.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                // java.io.File does not expand "~": resolve the paths against the user home explicitly
                File home = new File(System.getProperty("user.home"));
                File dir = new File(home, "xml/bfm");
                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getPath());
                // NOTE(review): the run method visible in this file takes
                // (Project, File, File, String, Properties); confirm that a
                // one-argument overload exists, otherwise this call fails at runtime.
                c.run(dir);
        }
985
}