Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / bfm / compiler.groovy @ 1804

History | View | Annotate | Download (27.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
41
// $LastChangedRevision: 3400 $
42
// $LastChangedBy: mdecorde $
43
//
44
package org.txm.scripts.importer.bfm
45

    
46
import org.txm.Toolbox;
47
import org.txm.importer.cwb.*
48
import org.txm.scripts.importer.*;
49
import org.txm.scripts.*;
50
import org.txm.importer.scripts.xmltxm.*;
51
import org.txm.utils.treetagger.TreeTagger;
52
import org.txm.objects.*
53
import javax.xml.stream.*;
54
import java.net.URL;
55
import java.io.File;
56
import java.util.Comparator;
57
import java.util.HashMap;
58
import java.util.List;
59
import org.txm.searchengine.cqp.corpus.*
60

    
61
/**
62
 * Produce CQP files from the TEI-TXM files. <br/>
63
 * - Read texts metadata with XPath queries <br/>
64
 * - Add the following word properties : sic, abbr, orig, lb and pb <br/>
65
 * - Keep &lt;front>, &lt;body> and &lt;back> for each text <br/>
66
 * - Text enclosed in &lt;q> is tokenized <br/>
67
 * 
68
 * @author mdecorde
69
 *
70
 */
71
class compiler {
72
        /** The debug. */
73
        private boolean debug= false;
74

    
75
        /** The annotate_status. */
76
        private boolean annotate_status=true;
77

    
78
        /** The input data. */
79
        private def inputData;
80

    
81
        /** The factory. */
82
        private def factory;
83

    
84
        /** The parser. */
85
        private XMLStreamReader parser;
86

    
87
        /** The dir. */
88
        private def dir;
89

    
90
        /** The output. */
91
        private def output;
92

    
93
        /** The url. */
94
        private def url;
95

    
96
        /** While true, the type of each &lt;ana> element encountered is appended to anaTypes. */
97
        static boolean firstWord = true;
98
        static private def anaTypes = [];
99
        private HashMap<String,String> anahash = new HashMap<String,String>() ;
100

    
101
        private static SAttributesListener sattrsListener;
102
        private static HashMap<String,ArrayList<String>> structs;
103
        private static HashMap<String, Integer> structsProf;
104

    
105
        /** The text. */
106
        private String text="";
107

    
108
        /** The base. */
109
        private String base="";
110

    
111
        /** The lang. */
112
        private String lang ="fr";
113

    
114
        /**
115
         * Contains the metadata XPath queries, organized by name.
116
         */
117
        Properties metadataXPath;
118

    
119
        /**
         * Creates a compiler without any input attached.
         *
         * Resets the class-wide (static) state: the list of recorded
         * &lt;ana> types is emptied and the firstWord flag is raised again.
         */
        public compiler()
        {
                anaTypes = [];
                firstWord = true;
        }
127

    
128
        /**
         * Initializes the compiler on one input file.
         *
         * Opens a stream on the URL, builds a StAX parser on it and attaches
         * the shared (static) SAttributesListener to that parser, creating the
         * listener on first use. Failures are reported on standard output and
         * are not rethrown, so the parser may be left unset.
         *
         * @param url the file to process
         * @param text the text's name
         * @param base the base's name
         * @param projectName the project's name (not used by this constructor)
         * @param metadataXPath the metadata XPath queries, organized by name
         */
        public compiler(URL url,String text,String base, String projectName, Properties metadataXPath)
        {
                this.metadataXPath = metadataXPath;
                this.text = text
                this.base = base;
                try {
                        this.url = url;
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        if (sattrsListener == null)
                                sattrsListener = new SAttributesListener(parser);
                        else
                                sattrsListener.start(parser)
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                } catch (IOException ex) {
                        // Report which resource failed and why, instead of a bare fixed message.
                        System.out.println("IOException while parsing "+url+": "+ex);
                }
        }
157

    
158
        /** Files exposed through getOrderedTxmFiles(); not assigned in the visible part of this class. */
        ArrayList<File> orderedFiles;

        /**
         * Gets the ordered TXM files.
         *
         * @return the current value of orderedFiles (NOTE(review): may be null if it was never assigned - confirm)
         */
        public ArrayList<File> getOrderedTxmFiles()
        {
                return this.orderedFiles;
        }
162

    
163
        /**
         * Replaces the language stored in the lang field.
         *
         * @param lang the language value to store
         */
        public void setLang(String lang) {
                this.lang = lang
        }
172

    
173
        /**
         * Stores the given flag in the annotate_status field.
         *
         * @param done the new value of annotate_status
         */
        public void setAnnotationDone(boolean done) {
                this.annotate_status = done
        }
182

    
183
        /**
         * Opens the output writer on the given file, encoded in UTF-8.
         *
         * When the file already exists the stream is opened in append mode,
         * otherwise the file is created. Any exception is reported on
         * standard output (localized message only) and turned into a
         * false result.
         *
         * @param f the file to write to
         * @return true, if successful
         */
        private boolean createOutput(File f)
        {
                try {
                        // append to an existing file, start a new one otherwise
                        boolean append = f.exists();
                        FileOutputStream stream = new FileOutputStream(f, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
199

    
200
        /**
         * Skips the TEI header.
         *
         * Advances the parser event by event and returns right after the end
         * tag of the teiHeader element has been read, or when the end of the
         * document is reached.
         */
        private void GoToText() {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean closing = (event == XMLStreamConstants.END_ELEMENT);
                        if (closing && parser.getLocalName().equals("teiHeader")) {
                                return;
                        }
                        event = parser.next();
                }
        }
211

    
212
        /**
         * Computes the next value of an element counter.
         *
         * If the current element carries an attribute whose local name is "n"
         * and whose value parses as an integer, that integer is the result.
         * In every other case (no such attribute, or a non-integer value) the
         * result is the given value plus one.
         *
         * @param parser the parser, positioned on the element to inspect
         * @param value the current counter value
         * @return the new counter value
         */
        private def increment(XMLStreamReader parser, int value) {
                // find the first attribute named "n", if any
                String declared = null;
                int idx = 0;
                while (idx < parser.getAttributeCount()) {
                        if (parser.getAttributeLocalName(idx) == "n") {
                                declared = parser.getAttributeValue(idx);
                                break;
                        }
                        idx++;
                }

                if (declared == null) {
                        return value + 1;
                }

                try {
                        return Integer.parseInt(declared);
                } catch (Exception e) {
                        // the declared number is not an integer: just count one more
                        return value + 1;
                }
        }
240

    
241
        /**
242
         * Transfom file cqp.
243
         *
244
         * @param dirPathName the dir path name
245
         * @param fileName the file name
246
         * @return true, if successful
247
         */
248
        private boolean transfomFileCqp(Project project, File cqpFile)
249
        {
250
                try {
251
                        if (!createOutput(cqpFile)) return false;
252

    
253
                        String headvalue = ""
254
                        String vAna = "";
255
                        String vForm = "";
256
                        String wordid = "";
257
                        String vHead = "";
258

    
259
                        Integer p_id = 0;
260
                        Integer s_id = 0;
261
                        Integer q_id = 0;
262
                        int sp_id = 0;
263
                        Integer body_id = 0;
264
                        Integer front_id = 0;
265
                        Integer back_id = 0;
266
                        Integer lb_id = 0;
267
                        Integer pb_id = 0;
268
                        Integer ab_id = 0;
269
                        int foreign_id = 0;
270
                        int name_id = 0;
271

    
272
                        boolean captureword = false;
273

    
274
                        String vExpan = "";
275
                        String vCorr = "";
276
                        String vReg = "";
277
                        String vOrig = "";
278
                        String vSic = "";
279
                        String vAbbr = "";
280
                        String givenpos = "";
281
                        String pb_n = "";
282
                        String foreign_lang = "";
283
                        String nameType = "";
284
                        String anaType;
285
                        //String abType = "";
286
                        
287
                        boolean foundtei=false, foundtext=false;
288

    
289
                        boolean flaglg = false;
290
                        int levelq = 0;
291
                        //boolean flagq = false;
292
                        boolean flaghead = false;
293
                        //Added:
294
                        boolean flagSp = false;
295
                        boolean flagAuthor = false;
296
                        boolean flagDate = false;
297
                        boolean flagWord = false;
298
                        boolean flagForm = false;
299
                        boolean flagAna = false;
300

    
301
                        boolean flagchoice = false;
302
                        boolean flagcorr = false;
303
                        boolean flagsic = false;
304
                        boolean flagreg = false;
305
                        boolean flagexpan = false;
306
                        boolean flagorig = false;
307
                        boolean flagabbr = false;
308
                        boolean flagfw = false;
309
                        //boolean flagSupplied = false;
310
                        int levelSupplied = 0;
311
                        //boolean flagSurplus = false;
312
                        boolean flagForeign = false;
313
                        //boolean flagName = false;
314

    
315
                        this.GoToText();
316
                        int missingId= 0
317
                        boolean USEVERSE = false; // switch default reference to verse references
318
                        String titreId; // the title to use in the reference
319
                        
320
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
321
                                switch (event) {
322
                                        case XMLStreamConstants.START_ELEMENT:
323

    
324
                                                String localname = parser.getLocalName();
325
                                                if (foundtext) sattrsListener.startElement(localname);
326
                                                
327
                                                switch (localname) {
328
                                                        case "TEI":
329
                                                                foundtei = true;
330
                                                                break;
331
                                                        case "text":
332
                                                                foundtext = true;
333
                                                                sattrsListener.startElement(localname);
334
                                                                output.write("<text id=\""+text+"\"")
335

    
336
                                                                for (int i = 0; i < parser.getAttributeCount() ; i++) {
337
                                                                        String name = parser.getAttributeLocalName(i);
338
                                                                        if ("id" == name || "base" == name || "project" == name) continue;
339
                                                                        output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
340
                                                                        
341
                                                                        if (name == "forme") {
342
                                                                                USEVERSE = (parser.getAttributeValue(i).contains("vers"))
343
                                                                        } else if (name == "sigle") {
344
                                                                                titreId = parser.getAttributeValue(i)
345
                                                                        }
346
                                                                }
347
                                                                
348
                                                                output.write(" base=\""+base+"\" project=\""+project.getName()+"\">\n");
349
                                                                captureword=true;
350
                                                                break;
351

    
352
                                                        case "div":
353
                                                        //output.write("<div type=\""+parser.getAttributeValue(null,"type")+"\">\n");
354
                                                                String divType = "NA";
355
                                                                String divSubtype = "NA";
356
                                                                String divN = "NA";
357
                                                                String divId ="NA";
358
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++) {
359
                                                                        if(parser.getAttributeLocalName(i) == "type") {
360
                                                                                divType = parser.getAttributeValue(i);
361
                                                                        } else if(parser.getAttributeLocalName(i) == "subtype") {
362
                                                                                divSubtype = parser.getAttributeValue(i);
363
                                                                        } else if(parser.getAttributeLocalName(i) == "n") {
364
                                                                                divN = parser.getAttributeValue(i);
365
                                                                        } else if(parser.getAttributeLocalName(i) == "id") {
366
                                                                                divId = parser.getAttributeValue(i);
367
                                                                                break;
368
                                                                        }
369
                                                                }
370
                                                                output.write("<div type=\""+divType+"\" subtype=\""+divSubtype+"\" n=\""+divN+"\" id=\""+divId+"\">\n");
371
                                                                break;
372
                                                        case "p":
373
                                                                p_id = increment(parser, p_id);
374
                                                                output.write("<p n=\""+p_id+"\">\n");
375
                                                                break;
376
                                                        case "ab":
377
                                                                ab_id = increment(parser, ab_id)
378
                                                                output.write("<ab n=\""+(ab_id)+"\" type=\""+parser.getAttributeValue(null,"type")+"\" subtype=\""+parser.getAttributeValue(null,"subtype")+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
379
                                                                break;
380
                                                        case "q":
381
                                                                q_id = increment(parser, q_id)
382
                                                                output.write("<q n=\""+(q_id)+"\">\n");
383
                                                        //flagq=true;
384
                                                                levelq = levelq + 1;
385
                                                                break;
386
                                                        case "sp":
387
                                                                sp_id = increment(parser, sp_id)
388
                                                                output.write("<sp n=\""+(sp_id)+"\">\n");
389
                                                                flagSp = true;
390
                                                                break;
391
                                                        case "front":
392
                                                                front_id = increment(parser, front_id)
393
                                                                output.write("<front n=\""+front_id+"\">\n");
394
                                                                break;
395
                                                        case "body":
396
                                                                body_id= increment(parser, body_id)
397
                                                                output.write("<body n=\""+body_id+"\">\n");
398
                                                                break;
399
                                                        case "back":
400
                                                                back_id = increment(parser, back_id)
401
                                                                output.write("<back n=\""+back_id+"\">\n");
402
                                                                break;
403
                                                        case "lb":
404
                                                                lb_id = increment(parser, lb_id)
405
                                                                break;
406
                                                        case "pb":
407
                                                                pb_id = increment(parser, pb_id)
408
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
409
                                                                        if (parser.getAttributeLocalName(i) == "n") {
410
                                                                                pb_n = parser.getAttributeValue(i);
411
                                                                        }
412
                                                                }
413
                                                                break;
414
                                                        case "s":
415
                                                                s_id = increment(parser, s_id)
416
                                                                output.write("<s n=\""+s_id+"\">\n");
417
                                                                break;
418
                                                        case "choice":
419
                                                                flagchoice = true;
420
                                                                break;
421
                                                        case "corr":
422
                                                                flagcorr = true;
423
                                                                vCorr= "";
424
                                                                break;
425
                                                        case "reg":
426
                                                                flagreg = true;
427
                                                                vReg= "";
428
                                                                break;
429
                                                        case "expan":
430
                                                                flagexpan = true;
431
                                                                vExpan= "";
432
                                                                break;
433
                                                        case "orig":
434
                                                                flagreg = true;
435
                                                                vOrig= "";
436
                                                                break;
437
                                                        case "sic":
438
                                                                flagsic = true;
439
                                                                vSic= "";
440
                                                                break;
441
                                                        case "abbr":
442
                                                                flagreg = true;
443
                                                                vAbbr= "";
444
                                                                break;
445
                                                        case "foreign":
446
                                                                flagForeign = true;
447
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
448
                                                                        if (parser.getAttributeLocalName(i) == "lang") {
449
                                                                                lang = parser.getAttributeValue(i);
450
                                                                                break;
451
                                                                        }
452
                                                                }
453

    
454
                                                                output.write("<foreign n=\""+(foreign_id++)+"\" lang=\""+lang+"\">\n");
455
                                                        //vForeign = "";
456
                                                                break;
457

    
458
                                                        case "name":
459
                                                        //flagName = true;
460
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++)
461
                                                                        if(parser.getAttributeLocalName(i) == "type")
462
                                                                {
463
                                                                        nameType = parser.getAttributeValue(i);
464
                                                                        break;
465
                                                                }
466

    
467
                                                                output.write("<name n=\""+(name_id++)+"\" type=\""+nameType+"\">\n");
468
                                                                break;
469
                                                        case "supplied":
470
                                                        //flagSupplied = true;
471
                                                                levelSupplied = levelSupplied + 1;
472
                                                                break;
473

    
474
                                                        case "surplus":
475
                                                                flagfw = true;
476
                                                                break;
477

    
478
                                                        case "del":
479
                                                                flagfw = true;
480
                                                                break;
481

    
482
                                                        case "w":
483
                                                                givenpos = "";
484
                                                                wordid = "w_"+text+"_m"+missingId++
485
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
486
                                                                        if (parser.getAttributeLocalName(i) == "id") {
487
                                                                                wordid = parser.getAttributeValue(i);
488
                                                                        } else if (parser.getAttributeLocalName(i) == "type") {
489
                                                                                givenpos = parser.getAttributeValue(i);
490
                                                                        }
491
                                                                }
492
                                                                if (wordid.startsWith("w")) {
493
                                                                        if (!wordid.startsWith("w_"))
494
                                                                                wordid = "w_"+wordid.substring(1)
495
                                                                } else {
496
                                                                        wordid = "w_"+wordid;
497
                                                                }
498

    
499
                                                                if (givenpos == null || givenpos == "")
500
                                                                        givenpos = "NA";
501
                                                                vForm = "";
502
                                                                anahash.clear(); // remove previous word ana values
503
                                                                flagWord = true;
504
                                                                break;
505
                                                        case "form":
506
                                                                flagForm = true;
507
                                                                vForm = "";
508
                                                                break;
509

    
510
                                                        case "ana":
511
                                                                flagAna = true;
512
                                                                anaType = parser.getAttributeValue(null, "type")
513
                                                                anahash.put(anaType, "");
514
                                                                if (firstWord) {
515
                                                                        anaTypes << anaType;
516
                                                                }
517
                                                                break;
518
                                                }
519
                                                break;
520

    
521
                                        case XMLStreamConstants.END_ELEMENT:
522
                                                String localname = parser.getLocalName();
523
                                                if (foundtext) sattrsListener.endElement(localname);
524

    
525
                                                switch (localname) {
526
                                                        case "div":
527
                                                                output.write("</div>\n");
528
                                                                break;
529
                                                        case "text":
530
                                                                output.write("</text>\n");
531
                                                                captureword=false;
532
                                                                break;
533
                                                        case "p":
534
                                                                output.write("</p>\n");
535
                                                                break;
536
                                                        case "s":
537
                                                                output.write("</s>\n");
538
                                                                break;
539
                                                        case "ab":
540
                                                                output.write("</ab>\n");
541
                                                                break;
542
                                                        case "q":
543
                                                                output.write("</q>\n");
544
                                                        //flagq= false;
545
                                                                levelq = levelq - 1;
546
                                                                break;
547
                                                        case "sp":
548
                                                                output.write("</sp>\n");
549
                                                                flagSp = false;
550
                                                                break;
551
                                                        case "front":
552
                                                                output.write("</front>\n");
553
                                                                break;
554
                                                        case "body":
555
                                                                output.write("</body>\n");
556
                                                                break;
557
                                                        case "back":
558
                                                                output.write("</back>\n");
559
                                                                break;
560

    
561
                                                        //                                                        case "fw":
562
                                                        //                                                        flagfw = false;
563
                                                        //                                                        break;
564

    
565
                                                        case "choice":
566
                                                                if(vOrig == "")
567
                                                                        vOrig="NA";
568
                                                                if(vSic == "")
569
                                                                        vSic="NA";
570
                                                                if(vAbbr == "")
571
                                                                        vAbbr="NA";
572

    
573
                                                                String ref;
574
                                                                if(USEVERSE)
575
                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
576
                                                                else
577
                                                                        ref = titreId+", p."+pb_n;
578

    
579
                                                                if (flagfw) {
580
                                                                        // on est hors texte
581
                                                                } else {
582
                                                                        String vFormToWrite = vForm;
583
                                                                        if (vCorr != "") {
584
                                                                                vFormToWrite = vCorr;
585
                                                                        } else if(vReg != "") {
586
                                                                                vFormToWrite = vReg;
587
                                                                        } else if(vExpan != "") {
588
                                                                                vFormToWrite = vExpan
589
                                                                        }
590
                                                                        firstWord = false;
591
                                                                        output.write( vFormToWrite +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+
592
                                                                                        "\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
593
                                                                        for(String type : anaTypes) {
594
                                                                                output.write("\t"+anahash.get(type));
595
                                                                        }
596
                                                                        output.write("\n")
597
                                                                }
598
                                                                flagchoice = false;
599
                                                                vCorr= "";
600
                                                                vSic= "";
601
                                                                break;
602
                                                        case "corr":
603
                                                                flagcorr = false;
604

    
605
                                                                break;
606
                                                        case "reg":
607
                                                                flagreg = false;
608
                                                                vReg = "";
609
                                                                break;
610
                                                        case "expan":
611
                                                                flagexpan = false;
612
                                                                vExpan= "";
613
                                                                break;
614
                                                        case "orig":
615
                                                                flagreg = false;
616
                                                                vOrig= "";
617
                                                                break;
618
                                                        case "sic":
619
                                                                flagsic = false;
620

    
621
                                                                break;
622
                                                        case "abbr":
623
                                                                flagreg = false;
624
                                                                vAbbr= "";
625
                                                                break;
626

    
627
                                                        case "foreign":
628
                                                                flagForeign = false;
629
                                                                lang = "";
630
                                                                output.write("</foreign>\n");
631
                                                                break;
632

    
633
                                                        case "name":
634
                                                        //flagName = false;
635
                                                                nameType = "";
636
                                                                output.write("</name>\n");
637
                                                                break;
638

    
639
                                                        case "supplied":
640
                                                        //flagSupplied = false;
641
                                                                levelSupplied = levelSupplied - 1;
642
                                                                break;
643

    
644
                                                        case "surplus":
645
                                                                flagfw = false;
646
                                                                break;
647

    
648
                                                        case "del":
649
                                                                flagfw = false;
650
                                                                break;
651

    
652
                                                        case "w":
653
                                                                if (captureword) {
654
                                                                        if (flagchoice) {
655

    
656
                                                                        } else if(flagfw) {
657

    
658
                                                                        } else {
659
                                                                                if (vOrig == "")
660
                                                                                        vOrig="NA";
661
                                                                                if(vSic == "")
662
                                                                                        vSic="NA";
663
                                                                                if(vAbbr == "")
664
                                                                                        vAbbr="NA";
665
                                                                                if (nameType == "")
666
                                                                                        nameType = "NA";
667
                                                                                if(lang == "")
668
                                                                                        lang="fr"
669

    
670
                                                                                String ref;
671
                                                                                if(USEVERSE)
672
                                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
673
                                                                                else
674
                                                                                        ref = titreId+", p."+pb_n;
675

    
676
                                                                                firstWord = false;
677
                                                                                output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
678
                                                                                for(String type : anaTypes) {
679
                                                                                        output.write("\t"+anahash.get(type));
680
                                                                                }
681
                                                                                output.write("\n")
682
                                                                        }
683

    
684
                                                                        flagWord = false;
685
                                                                }
686
                                                                break;
687

    
688
                                                        case "form":
689
                                                                flagForm = false;
690
                                                                break;
691

    
692
                                                        case "ana":
693
                                                                flagAna = false;
694
                                                                anahash.put(anaType, vAna);
695
                                                                vAna = "";
696
                                                                break;
697
                                                }
698
                                                break; // end elem
699

    
700
                                        case XMLStreamConstants.CHARACTERS:
701
                                                if (flagAna) {
702
                                                        vAna += parser.getText().trim()
703
                                                }
704

    
705
                                                if (flagForm) {
706
                                                        vForm += parser.getText().trim();
707
                                                        if (flagchoice) {
708
                                                                if (flagsic) {
709
                                                                        vSic += parser.getText().trim();
710
                                                                }
711
                                                                if (flagorig) {
712
                                                                        vOrig += parser.getText().trim();
713
                                                                }
714
                                                                if (flagabbr) {
715
                                                                        vAbbr += parser.getText().trim();
716
                                                                }
717
                                                                if (flagcorr) {
718
                                                                        vCorr += parser.getText().trim();
719
                                                                }
720
                                                        }
721
                                                }
722
                                }
723
                        }
724
                        //output.write("</txmcorpus>");
725
                        output.close();
726
                        parser.close();
727
                        inputData.close();
728
                } catch (Exception ex) {
729
                        System.out.println("Exception while parsing " + inputData);
730
                        ex.printStackTrace();
731
                        if (output != null) output.close();
732
                        if (parser != null) parser.close();
733
                        if (inputData != null) inputData.close();
734
                        return false;
735
                }
736

    
737
                return true;
738
        }
739

    
740

    
741
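        /**
         * Runs the compilation step of the BFM import: transforms the TEI-TXM files
         * into one CQP file, then encodes this file into a CWB corpus.
         *
         * @param project the project whose main corpus is built
         * @param binDir the output directory; its "cqp", "data" and "registry" sub-directories are created if needed
         * @param txmDir the directory which contains the TEI-TXM files
         * @param corpusname the corpus name, used to name the CQP file, the data directory and the registry file
         * @param metadataXPath the metadata declarations; each key is declared as a property of the "text" structure
         * @return true, if successful
         */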
748
        public boolean run(Project project, File binDir, File txmDir, String corpusname, Properties metadataXPath)
        {
                // Steps:
                //  0- prepare the corpus object and the output directories
                //  1- order the source files by composition date and write them into one CQP file
                //  2- declare the p-attributes and s-attributes, then call cwb-encode and cwb-makeall
                // Returns false as soon as one of the steps cannot be completed.

                sattrsListener = null; // reset SAttribute Listener for each new import
                this.metadataXPath = metadataXPath;

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }

                CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
                if (corpus != null) {
                        if (project.getDoUpdate()) {
                                corpus.clean(); // remove old files
                        } else {
                                corpus.delete(); // remove old files and TXMResult children
                        }
                } else {
                        corpus = new MainCorpus(project);
                        corpus.setID(project.getName());
                        corpus.setName(project.getName());
                }
                corpus.setDescription("Built with the BFM import module");

                File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
                cqpFile.delete()
                new File(binDir,"cqp").mkdirs()
                new File(binDir,"data").mkdirs()
                new File(binDir,"registry").mkdirs()

                // File.listFiles() returns null when txmDir is not a directory or cannot be read
                List<File> files = txmDir.listFiles();
                if (files == null) {
                        println "Error: could not list the files of "+txmDir
                        return false;
                }

                // get the composition date (year, month, day) of each text to be able to sort with it;
                // a text without a usable date keeps [0, 0, 0] and is therefore sorted first
                HashMap<File,Integer[]> filesiecle = new HashMap<File, Integer[]>()
                for (File f : files) {
                        Integer[] date = new Integer[3];
                        date[0] = date[1] = date[2] = 0;
                        String xpath = "//tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when"
                        // metadataXPath may be null (see the null test done before declaring the text properties)
                        if (metadataXPath != null && metadataXPath.containsKey("datecompo"))
                                xpath = metadataXPath.get("datecompo")
                        String datecompo = XPathResult.getXpathResponse(f, xpath);
                        if (datecompo != null) {
                                String[] split = datecompo.split("-"); // yyyy-mm-dd
                                try {
                                        // parse everything before storing, so that a half-parsed date is never kept
                                        if (split.length == 3) {
                                                int year = Integer.parseInt(split[0]);
                                                int month = Integer.parseInt(split[1]);
                                                int day = Integer.parseInt(split[2]);
                                                date[0] = year;
                                                date[1] = month;
                                                date[2] = day;
                                        } else if (split.length == 1) { // yyyy
                                                int year = Integer.parseInt(split[0]);
                                                date[0] = year;
                                                date[1] = 1;
                                                date[2] = 1;
                                        }
                                } catch (NumberFormatException e) {
                                        // a malformed date must not abort the whole import
                                        if (datecompo.length() > 0)
                                                println "Warning: could not parse the composition date '"+datecompo+"' of "+f.getName()
                                }
                        }
                        filesiecle.put(f, date);
                }

                Collections.sort(files); // Alpha order
                // Date order; the sort is stable, so the alpha order is kept between texts of the same date
                Collections.sort(files, new Comparator<File>() {
                                        @Override
                                        public int compare(File o1, File o2) {
                                                Integer[] date1 = filesiecle.get(o1);
                                                Integer[] date2 = filesiecle.get(o2);
                                                for (int i = 0 ; i < 3 ; i++) { // year, then month, then day
                                                        int cmp = date1[i].compareTo(date2[i]);
                                                        if (cmp != 0) return cmp;
                                                }
                                                return 0;
                                        }
                                });

                this.orderedFiles = files;
                println("process "+files.size()+" files ")

                //write txmcorpus
                if (!createOutput(cqpFile)) {
                        println "Error: could not write cqp file"
                        return false;
                } else {
                        output.write("<txmcorpus lang=\""+lang+"\">\n");
                        output.close();
                }

                //1- Transform into CQP file
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                        } else {
                                print "."
                                String txtname = f.getName().substring(0, f.getName().length()-4);
                                def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath);
                                builder.setLang(lang)
                                if (!builder.transfomFileCqp(project, cqpFile)) {
                                        println "Failed to compile "+f
                                }
                                builder.setAnnotationDone(this.annotate_status);
                        }
                }

                //close txmcorpus
                if (!createOutput(cqpFile)) {
                        println "Error: could not write cqp file"
                        return false;
                } else {
                        output.write("</txmcorpus>\n");
                        output.close();
                }
                println ""

                //2- Import into CWB
                def outDir = binDir.getAbsolutePath();
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                def pAttrs = ["id","q","sp","pb","lb","orig","sic","abbr","ref","pos","supplied","lang","nametype"];
                for(String type : anaTypes)
                        pAttrs.add(type.substring(1)); // remove #

                // the listener was reset at the beginning of this method: it is still null if nothing has set it since
                if (sattrsListener == null) {
                        println "Error: no structure was collected from the source files"
                        return false;
                }
                structs = sattrsListener.getStructs();
                structsProf = sattrsListener.getProfs();
                if (debug) {
                        println structs
                        println structsProf
                }

                // true if the declaration "name:depth+a+b" declares exactly the attribute attr;
                // a plain contains("+n") would also match "+name"
                def declares = { String decl, String attr ->
                        return decl.endsWith("+"+attr) || decl.contains("+"+attr+"+")
                }

                // add structures+properties found in sources
                List<String> sargs = new ArrayList<String>();
                for (String name : structs.keySet()) {
                        if ( name == "text") continue; // added after
                        String concat = name+":"+structsProf.get(name); // append the depth
                        for (String value : structs.get(name)) // append the attributes
                                concat += "+"+value;
                        if ((name == "p" || name == "body" || name == "back" || name == "front") && !declares(concat, "n"))
                                concat += "+n"
                        sargs.add(concat);
                }

                String textSAttributes = "text:0+id+base+project";
                if (metadataXPath != null) {
                        for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
                                textSAttributes+="+"+meta;
                }
                if (metadataXPath == null || !metadataXPath.containsKey("sigle"))
                        textSAttributes+="+sigle";

                sargs.add(textSAttributes)
                sargs.add("txmcorpus:0+lang")

                // attributes which must always be declared for some structures, whatever was found in the sources
                def requiredAttrs = [
                        "q:"       : ["n", "lang"],
                        "foreign:" : ["n", "lang"],
                        "ab:"      : ["n", "subtype", "rend"],
                        "sp:"      : ["n", "subtype", "rend"],
                        "div:"     : ["n", "id", "type", "subtype"],
                        "name:"    : ["n", "type"]
                ]
                for (int c = 0 ; c < sargs.size() ; c++) {
                        String sarg = sargs.get(c);
                        for (String prefix : requiredAttrs.keySet()) {
                                if (!sarg.startsWith(prefix)) continue;
                                for (String attr : requiredAttrs.get(prefix))
                                        if (!declares(sarg, attr)) sarg += "+"+attr
                                sargs.set(c, sarg);
                                break; // a declaration starts with only one structure name
                        }
                }
                sargs.sort();

                String[] sAttributes = sargs;
                String[] pAttributes = pAttrs;
                println "P-attributes: "+pAttributes
                println "S-attributes: "+sargs

                try {
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lower case registry files
                        cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname, outDir + "/registry");
                } catch (Exception ex) {System.out.println(ex);return false;}

                return true;
        }
965

    
966
        /**
967
         * show cwb utils messages.
968
         */
969
        public void setDebug() {
                // switch the debug flag on; it cannot be switched off through this method
                debug = true
        }
973

    
974
        /**
975
         * test purpose.
976
         *
977
         * @param args the arguments
978
         */
979
        public static void main(String[] args)
        {
                // java.io.File does not expand "~" (this is done by the shell only):
                // the paths are resolved against the home directory of the user instead
                String home = System.getProperty("user.home");
                File dir = new File(home, "xml/bfm");
                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getPath());
                // NOTE(review): the run() method of this class takes (Project, File, File, String, Properties);
                // confirm that a run(File) overload and a no-argument constructor still exist
                c.run(dir);
        }
987
}