Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / bfm / compiler.groovy @ 187

History | View | Annotate | Download (27.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
41
// $LastChangedRevision: 3400 $
42
// $LastChangedBy: mdecorde $
43
//
44
package org.txm.importer.bfm
45

    
46
import org.txm.Toolbox;
47
import org.txm.importer.cwb.*
48
import org.txm.importer.*;
49
import org.txm.scripts.*;
50
import org.txm.scripts.teitxm.BuildTTSrc;
51
import org.txm.scripts.teitxm.*;
52
import org.txm.utils.treetagger.TreeTagger;
53

    
54
import javax.xml.stream.*;
55
import java.net.URL;
56
import java.io.File;
57
import java.util.Comparator;
58
import java.util.HashMap;
59
import java.util.List;
60

    
61
// TODO: Auto-generated Javadoc
62
/**
63
 * Produce WTC files from the TEI-TXM files. <br/>
64
 * - Read texts metadata with XPath queries <br/>
65
 * - Add the following word properties : sic, abbr, orig, lb and pb <br/>
66
 * - Keep &lt;front>, &lt;body> and &lt;back> for each text <br/>
67
 * - Text enclosed in &lt;q> is tokenized <br/>
68
 * 
69
 * @author mdecorde
70
 *
71
 */
72
class compiler {
73
        /** The debug. */
74
        private boolean debug= false;
75

    
76
        /** The annotate_status. */
77
        private boolean annotate_status=true;
78

    
79
        /** The input data. */
80
        private def inputData;
81

    
82
        /** The factory. */
83
        private def factory;
84

    
85
        /** The parser. */
86
        private XMLStreamReader parser;
87

    
88
        /** The dir. */
89
        private def dir;
90

    
91
        /** The output. */
92
        private def output;
93

    
94
        /** The url. */
95
        private def url;
96

    
97
        /** The anahash. */
98
        static boolean firstWord = true;
99
        static private def anaTypes = [];
100
        private HashMap<String,String> anahash = new HashMap<String,String>() ;
101

    
102
        private static SAttributesListener sattrsListener;
103
        private static HashMap<String,ArrayList<String>> structs;
104
        private static HashMap<String, Integer> structsProf;
105

    
106
        /** The text. */
107
        private String text="";
108

    
109
        /** The base. */
110
        private String base="";
111

    
112
        /** The project. */
113
        private String project="";
114

    
115
        /** The cwb loc. */
116
        private String cwbLoc;
117

    
118
        /** The lang. */
119
        private String lang ="fr";
120

    
121
        /**
122
         * Contains the metadata XPath expressions, organized by name.
123
         */
124
        Properties metadataXPath;
125

    
126
        /**
         * Creates a compiler with no input attached.
         *
         * Resets the state shared by every instance: the list of collected
         * ana types is emptied and the first-word flag is set back to true.
         */
        public compiler() {
                anaTypes = new ArrayList();
                firstWord = true;
        }
134

    
135
        /**
         * Initializes the compiler on one source file.
         *
         * Opens the URL, creates a StAX reader on it and attaches the shared
         * structure-attributes listener to that reader (the listener is created
         * on first use and restarted on the new reader afterwards).
         * Failures are reported on standard output and are not rethrown, so
         * the instance may be left without a usable parser.
         *
         * @param url the file to process
         * @param text the text's name
         * @param base the base's name
         * @param project the project's name
         * @param metadataXPath the metadata XPath expressions, stored as given
         */
        public compiler(URL url,String text,String base, String project, Properties metadataXPath)
        {
                this.metadataXPath = metadataXPath;
                this.text = text;
                this.base = base;
                this.project = project;
                try {
                        this.url = url;
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        if (sattrsListener == null)
                                sattrsListener = new SAttributesListener(parser);
                        else
                                sattrsListener.start(parser)
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // The reader was never created, so nothing will ever consume
                        // or close the stream opened above: release it here.
                        if (parser == null && inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                        // best effort: the original failure was already reported
                                }
                        }
                } catch (IOException ex) {
                        // Say which resource failed and why, instead of a truncated message.
                        System.out.println("IOException while parsing "+url+": "+ex);
                }
        }
165

    
166
        /**
         * Records the location of the CWB utilities.
         *
         * A warning is printed on standard error when the path does not exist
         * on disk, but the value is stored in either case.
         *
         * @param path the path to cwb utils binaries
         */
        public void setCwbPath(String path) {
                File cwbDirectory = new File(path);
                boolean missing = !cwbDirectory.exists();
                if (missing) {
                        System.err.println("CWB Path : "+path+" does not exists");
                }
                this.cwbLoc = path;
        }
176

    
177
        ArrayList<File> orderedFiles;
178
        /**
         * Gives access to the {@code orderedFiles} list held by this compiler.
         *
         * @return the current value of {@code orderedFiles}; presumably null
         *         until it has been assigned elsewhere in the class (TODO confirm)
         */
        public ArrayList<File> getOrderedTxmFiles() {
                return this.orderedFiles;
        }
181

    
182
        /**
         * Replaces the language stored by this compiler (initially "fr").
         *
         * @param lang the new lang
         */
        public void setLang(String lang) {
                this.lang = lang;
        }
191

    
192
        /**
         * Stores the annotation status flag (initially true).
         *
         * @param done the new value of the annotation status
         */
        public void setAnnotationDone(boolean done) {
                this.annotate_status = done;
        }
201

    
202
        /**
         * Opens the UTF-8 writer used as output.
         *
         * When the target file already exists it is opened in append mode,
         * otherwise it is created. Any failure is reported on standard output
         * and signalled through the return value.
         *
         * @param f the file to write to
         * @return true, if successful
         */
        private boolean createOutput(File f) {
                boolean opened = false;
                try {
                        // append to an existing file, start a fresh one otherwise
                        boolean append = f.exists();
                        def stream = new FileOutputStream(f, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                        opened = true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                }
                return opened;
        }
218

    
219
        /**
         * Advances the parser past the TEI header.
         *
         * Events are consumed until the closing tag of {@code teiHeader} has
         * been read, or until the end of the document is reached.
         */
        private void GoToText() {
                while (true) {
                        int eventType = parser.next();
                        if (eventType == XMLStreamConstants.END_DOCUMENT) {
                                return;
                        }
                        boolean closingTag = (eventType == XMLStreamConstants.END_ELEMENT);
                        if (closingTag && "teiHeader".equals(parser.getLocalName())) {
                                return;
                        }
                }
        }
230

    
231
        /**
         * Computes the next value of a structure counter.
         *
         * If the element the parser is positioned on carries an attribute whose
         * local name is {@code n} and whose value is an integer, that integer is
         * returned. In every other case (no such attribute, or a value that is
         * not an integer) the counter is advanced by one.
         *
         * @param parser the parser, positioned on a start element
         * @param value the current value of the counter
         * @return the integer given by {@code n}, or {@code value + 1}
         */
        private def increment(XMLStreamReader parser, int value) {
                // index of the first attribute named "n", or null when there is none
                Integer position = (0..<parser.getAttributeCount()).find { int idx ->
                        parser.getAttributeLocalName(idx) == "n"
                };
                if (position == null) {
                        return value + 1;
                }

                try {
                        return Integer.parseInt(parser.getAttributeValue(position));
                } catch (Exception ignored) {
                        // the explicit number is unusable: fall back to plain counting
                        return value + 1;
                }
        }
259

    
260
        /**
261
         * Transfom file wtc.
262
         *
263
         * @param dirPathName the dir path name
264
         * @param fileName the file name
265
         * @return true, if successful
266
         */
267
        private boolean transfomFileWtc(File wtcFile)
268
        {
269
                try {
270
                        if (!createOutput(wtcFile)) return false;
271

    
272
                        String headvalue = ""
273
                        String vAna = "";
274
                        String vForm = "";
275
                        String wordid = "";
276
                        String vHead = "";
277

    
278
                        Integer p_id = 0;
279
                        Integer s_id = 0;
280
                        Integer q_id = 0;
281
                        int sp_id = 0;
282
                        Integer body_id = 0;
283
                        Integer front_id = 0;
284
                        Integer back_id = 0;
285
                        Integer lb_id = 0;
286
                        Integer pb_id = 0;
287
                        Integer ab_id = 0;
288
                        int foreign_id = 0;
289
                        int name_id = 0;
290

    
291
                        boolean captureword = false;
292

    
293
                        String vExpan = "";
294
                        String vCorr = "";
295
                        String vReg = "";
296
                        String vOrig = "";
297
                        String vSic = "";
298
                        String vAbbr = "";
299
                        String givenpos = "";
300
                        String pb_n = "";
301
                        String foreign_lang = "";
302
                        String nameType = "";
303
                        String anaType;
304
                        //String abType = "";
305
                        
306
                        boolean foundtei=false, foundtext=false;
307

    
308
                        boolean flaglg = false;
309
                        int levelq = 0;
310
                        //boolean flagq = false;
311
                        boolean flaghead = false;
312
                        //Added:
313
                        boolean flagSp = false;
314
                        boolean flagAuthor = false;
315
                        boolean flagDate = false;
316
                        boolean flagWord = false;
317
                        boolean flagForm = false;
318
                        boolean flagAna = false;
319

    
320
                        boolean flagchoice = false;
321
                        boolean flagcorr = false;
322
                        boolean flagsic = false;
323
                        boolean flagreg = false;
324
                        boolean flagexpan = false;
325
                        boolean flagorig = false;
326
                        boolean flagabbr = false;
327
                        boolean flagfw = false;
328
                        //boolean flagSupplied = false;
329
                        int levelSupplied = 0;
330
                        //boolean flagSurplus = false;
331
                        boolean flagForeign = false;
332
                        //boolean flagName = false;
333

    
334
                        this.GoToText();
335
                        int missingId= 0
336
                        boolean USEVERSE = false; // switch default reference to verse references
337
                        String titreId; // the title to use in the reference
338
                        
339
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
340
                                switch (event) {
341
                                        case XMLStreamConstants.START_ELEMENT:
342

    
343
                                                String localname = parser.getLocalName();
344
                                                if (foundtext) sattrsListener.startElement(localname);
345
                                                
346
                                                switch (localname) {
347
                                                        case "TEI":
348
                                                                foundtei = true;
349
                                                                break;
350
                                                        case "text":
351
                                                                foundtext = true;
352
                                                                sattrsListener.startElement(localname);
353
                                                                output.write("<text id=\""+text+"\"")
354

    
355
                                                                for (int i = 0; i < parser.getAttributeCount() ; i++) {
356
                                                                        String name = parser.getAttributeLocalName(i);
357
                                                                        if ("id" == name || "base" == name || "project" == name) continue;
358
                                                                        output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
359
                                                                        
360
                                                                        if (name == "forme") {
361
                                                                                USEVERSE = (parser.getAttributeValue(i).contains("vers"))
362
                                                                        } else if (name == "sigle") {
363
                                                                                titreId = parser.getAttributeValue(i)
364
                                                                        }
365
                                                                }
366
                                                                
367
                                                                output.write(" base=\""+base+"\" project=\""+project+"\">\n");
368
                                                                captureword=true;
369
                                                                break;
370

    
371
                                                        case "div":
372
                                                        //output.write("<div type=\""+parser.getAttributeValue(null,"type")+"\">\n");
373
                                                                String divType = "NA";
374
                                                                String divSubtype = "NA";
375
                                                                String divN = "NA";
376
                                                                String divId ="NA";
377
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++) {
378
                                                                        if(parser.getAttributeLocalName(i) == "type") {
379
                                                                                divType = parser.getAttributeValue(i);
380
                                                                        } else if(parser.getAttributeLocalName(i) == "subtype") {
381
                                                                                divSubtype = parser.getAttributeValue(i);
382
                                                                        } else if(parser.getAttributeLocalName(i) == "n") {
383
                                                                                divN = parser.getAttributeValue(i);
384
                                                                        } else if(parser.getAttributeLocalName(i) == "id") {
385
                                                                                divId = parser.getAttributeValue(i);
386
                                                                                break;
387
                                                                        }
388
                                                                }
389
                                                                output.write("<div type=\""+divType+"\" subtype=\""+divSubtype+"\" n=\""+divN+"\" id=\""+divId+"\">\n");
390
                                                                break;
391
                                                        case "p":
392
                                                                p_id = increment(parser, p_id);
393
                                                                output.write("<p n=\""+p_id+"\">\n");
394
                                                                break;
395
                                                        case "ab":
396
                                                                ab_id = increment(parser, ab_id)
397
                                                                output.write("<ab n=\""+(ab_id)+"\" type=\""+parser.getAttributeValue(null,"type")+"\" subtype=\""+parser.getAttributeValue(null,"subtype")+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
398
                                                                break;
399
                                                        case "q":
400
                                                                q_id = increment(parser, q_id)
401
                                                                output.write("<q n=\""+(q_id)+"\">\n");
402
                                                        //flagq=true;
403
                                                                levelq = levelq + 1;
404
                                                                break;
405
                                                        case "sp":
406
                                                                sp_id = increment(parser, sp_id)
407
                                                                output.write("<sp n=\""+(sp_id)+"\">\n");
408
                                                                flagSp = true;
409
                                                                break;
410
                                                        case "front":
411
                                                                front_id = increment(parser, front_id)
412
                                                                output.write("<front n=\""+front_id+"\">\n");
413
                                                                break;
414
                                                        case "body":
415
                                                                body_id= increment(parser, body_id)
416
                                                                output.write("<body n=\""+body_id+"\">\n");
417
                                                                break;
418
                                                        case "back":
419
                                                                back_id = increment(parser, back_id)
420
                                                                output.write("<back n=\""+back_id+"\">\n");
421
                                                                break;
422
                                                        case "lb":
423
                                                                lb_id = increment(parser, lb_id)
424
                                                                break;
425
                                                        case "pb":
426
                                                                pb_id = increment(parser, pb_id)
427
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
428
                                                                        if (parser.getAttributeLocalName(i) == "n") {
429
                                                                                pb_n = parser.getAttributeValue(i);
430
                                                                        }
431
                                                                }
432
                                                                break;
433
                                                        case "s":
434
                                                                s_id = increment(parser, s_id)
435
                                                                output.write("<s n=\""+s_id+"\">\n");
436
                                                                break;
437
                                                        case "choice":
438
                                                                flagchoice = true;
439
                                                                break;
440
                                                        case "corr":
441
                                                                flagcorr = true;
442
                                                                vCorr= "";
443
                                                                break;
444
                                                        case "reg":
445
                                                                flagreg = true;
446
                                                                vReg= "";
447
                                                                break;
448
                                                        case "expan":
449
                                                                flagexpan = true;
450
                                                                vExpan= "";
451
                                                                break;
452
                                                        case "orig":
453
                                                                flagreg = true;
454
                                                                vOrig= "";
455
                                                                break;
456
                                                        case "sic":
457
                                                                flagsic = true;
458
                                                                vSic= "";
459
                                                                break;
460
                                                        case "abbr":
461
                                                                flagreg = true;
462
                                                                vAbbr= "";
463
                                                                break;
464
                                                        case "foreign":
465
                                                                flagForeign = true;
466
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
467
                                                                        if (parser.getAttributeLocalName(i) == "lang") {
468
                                                                                lang = parser.getAttributeValue(i);
469
                                                                                break;
470
                                                                        }
471
                                                                }
472

    
473
                                                                output.write("<foreign n=\""+(foreign_id++)+"\" lang=\""+lang+"\">\n");
474
                                                        //vForeign = "";
475
                                                                break;
476

    
477
                                                        case "name":
478
                                                        //flagName = true;
479
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++)
480
                                                                        if(parser.getAttributeLocalName(i) == "type")
481
                                                                {
482
                                                                        nameType = parser.getAttributeValue(i);
483
                                                                        break;
484
                                                                }
485

    
486
                                                                output.write("<name n=\""+(name_id++)+"\" type=\""+nameType+"\">\n");
487
                                                                break;
488
                                                        case "supplied":
489
                                                        //flagSupplied = true;
490
                                                                levelSupplied = levelSupplied + 1;
491
                                                                break;
492

    
493
                                                        case "surplus":
494
                                                                flagfw = true;
495
                                                                break;
496

    
497
                                                        case "del":
498
                                                                flagfw = true;
499
                                                                break;
500

    
501
                                                        case "w":
502
                                                                givenpos = "";
503
                                                                wordid = "w_"+text+"_m"+missingId++
504
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
505
                                                                        if (parser.getAttributeLocalName(i) == "id") {
506
                                                                                wordid = parser.getAttributeValue(i);
507
                                                                        } else if (parser.getAttributeLocalName(i) == "type") {
508
                                                                                givenpos = parser.getAttributeValue(i);
509
                                                                        }
510
                                                                }
511
                                                                if (wordid.startsWith("w")) {
512
                                                                        if (!wordid.startsWith("w_"))
513
                                                                                wordid = "w_"+wordid.substring(1)
514
                                                                } else {
515
                                                                        wordid = "w_"+wordid;
516
                                                                }
517

    
518
                                                                if (givenpos == null || givenpos == "")
519
                                                                        givenpos = "NA";
520
                                                                vForm = "";
521
                                                                anahash.clear(); // remove previous word ana values
522
                                                                flagWord = true;
523
                                                                break;
524
                                                        case "form":
525
                                                                flagForm = true;
526
                                                                vForm = "";
527
                                                                break;
528

    
529
                                                        case "ana":
530
                                                                flagAna = true;
531
                                                                anaType = parser.getAttributeValue(null, "type")
532
                                                                anahash.put(anaType, "");
533
                                                                if (firstWord) {
534
                                                                        anaTypes << anaType;
535
                                                                }
536
                                                                break;
537
                                                }
538
                                                break;
539

    
540
                                        case XMLStreamConstants.END_ELEMENT:
541
                                                String localname = parser.getLocalName();
542
                                                if (foundtext) sattrsListener.endElement(localname);
543

    
544
                                                switch (localname) {
545
                                                        case "div":
546
                                                                output.write("</div>\n");
547
                                                                break;
548
                                                        case "text":
549
                                                                output.write("</text>\n");
550
                                                                captureword=false;
551
                                                                break;
552
                                                        case "p":
553
                                                                output.write("</p>\n");
554
                                                                break;
555
                                                        case "s":
556
                                                                output.write("</s>\n");
557
                                                                break;
558
                                                        case "ab":
559
                                                                output.write("</ab>\n");
560
                                                                break;
561
                                                        case "q":
562
                                                                output.write("</q>\n");
563
                                                        //flagq= false;
564
                                                                levelq = levelq - 1;
565
                                                                break;
566
                                                        case "sp":
567
                                                                output.write("</sp>\n");
568
                                                                flagSp = false;
569
                                                                break;
570
                                                        case "front":
571
                                                                output.write("</front>\n");
572
                                                                break;
573
                                                        case "body":
574
                                                                output.write("</body>\n");
575
                                                                break;
576
                                                        case "back":
577
                                                                output.write("</back>\n");
578
                                                                break;
579

    
580
                                                        //                                                        case "fw":
581
                                                        //                                                        flagfw = false;
582
                                                        //                                                        break;
583

    
584
                                                        case "choice":
585
                                                                if(vOrig == "")
586
                                                                        vOrig="NA";
587
                                                                if(vSic == "")
588
                                                                        vSic="NA";
589
                                                                if(vAbbr == "")
590
                                                                        vAbbr="NA";
591

    
592
                                                                String ref;
593
                                                                if(USEVERSE)
594
                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
595
                                                                else
596
                                                                        ref = titreId+", p."+pb_n;
597

    
598
                                                                if (flagfw) {
599
                                                                        // on est hors texte
600
                                                                } else {
601
                                                                        String vFormToWrite = vForm;
602
                                                                        if (vCorr != "") {
603
                                                                                vFormToWrite = vCorr;
604
                                                                        } else if(vReg != "") {
605
                                                                                vFormToWrite = vReg;
606
                                                                        } else if(vExpan != "") {
607
                                                                                vFormToWrite = vExpan
608
                                                                        }
609
                                                                        firstWord = false;
610
                                                                        output.write( vFormToWrite +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+
611
                                                                                        "\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
612
                                                                        for(String type : anaTypes) {
613
                                                                                output.write("\t"+anahash.get(type));
614
                                                                        }
615
                                                                        output.write("\n")
616
                                                                }
617
                                                                flagchoice = false;
618
                                                                vCorr= "";
619
                                                                vSic= "";
620
                                                                break;
621
                                                        case "corr":
622
                                                                flagcorr = false;
623

    
624
                                                                break;
625
                                                        case "reg":
626
                                                                flagreg = false;
627
                                                                vReg = "";
628
                                                                break;
629
                                                        case "expan":
630
                                                                flagexpan = false;
631
                                                                vExpan= "";
632
                                                                break;
633
                                                        case "orig":
634
                                                                flagreg = false;
635
                                                                vOrig= "";
636
                                                                break;
637
                                                        case "sic":
638
                                                                flagsic = false;
639

    
640
                                                                break;
641
                                                        case "abbr":
642
                                                                flagreg = false;
643
                                                                vAbbr= "";
644
                                                                break;
645

    
646
                                                        case "foreign":
647
                                                                flagForeign = false;
648
                                                                lang = "";
649
                                                                output.write("</foreign>\n");
650
                                                                break;
651

    
652
                                                        case "name":
653
                                                        //flagName = false;
654
                                                                nameType = "";
655
                                                                output.write("</name>\n");
656
                                                                break;
657

    
658
                                                        case "supplied":
659
                                                        //flagSupplied = false;
660
                                                                levelSupplied = levelSupplied - 1;
661
                                                                break;
662

    
663
                                                        case "surplus":
664
                                                                flagfw = false;
665
                                                                break;
666

    
667
                                                        case "del":
668
                                                                flagfw = false;
669
                                                                break;
670

    
671
                                                        case "w":
672
                                                                if (captureword) {
673
                                                                        if (flagchoice) {
674

    
675
                                                                        } else if(flagfw) {
676

    
677
                                                                        } else {
678
                                                                                if (vOrig == "")
679
                                                                                        vOrig="NA";
680
                                                                                if(vSic == "")
681
                                                                                        vSic="NA";
682
                                                                                if(vAbbr == "")
683
                                                                                        vAbbr="NA";
684
                                                                                if (nameType == "")
685
                                                                                        nameType = "NA";
686
                                                                                if(lang == "")
687
                                                                                        lang="fr"
688

    
689
                                                                                String ref;
690
                                                                                if(USEVERSE)
691
                                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
692
                                                                                else
693
                                                                                        ref = titreId+", p."+pb_n;
694

    
695
                                                                                firstWord = false;
696
                                                                                output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
697
                                                                                for(String type : anaTypes) {
698
                                                                                        output.write("\t"+anahash.get(type));
699
                                                                                }
700
                                                                                output.write("\n")
701
                                                                        }
702

    
703
                                                                        flagWord = false;
704
                                                                }
705
                                                                break;
706

    
707
                                                        case "form":
708
                                                                flagForm = false;
709
                                                                break;
710

    
711
                                                        case "ana":
712
                                                                flagAna = false;
713
                                                                anahash.put(anaType, vAna);
714
                                                                vAna = "";
715
                                                                break;
716
                                                }
717
                                                break; // end elem
718

    
719
                                        case XMLStreamConstants.CHARACTERS:
720
                                                if (flagAna) {
721
                                                        vAna += parser.getText().trim()
722
                                                }
723

    
724
                                                if (flagForm) {
725
                                                        vForm += parser.getText().trim();
726
                                                        if (flagchoice) {
727
                                                                if (flagsic) {
728
                                                                        vSic += parser.getText().trim();
729
                                                                }
730
                                                                if (flagorig) {
731
                                                                        vOrig += parser.getText().trim();
732
                                                                }
733
                                                                if (flagabbr) {
734
                                                                        vAbbr += parser.getText().trim();
735
                                                                }
736
                                                                if (flagcorr) {
737
                                                                        vCorr += parser.getText().trim();
738
                                                                }
739
                                                        }
740
                                                }
741
                                }
742
                        }
743
                        //output.write("</txmcorpus>");
744
                        output.close();
745
                        parser.close();
746
                        inputData.close();
747
                } catch (Exception ex) {
748
                        System.out.println("Exception while parsing " + inputData);
749
                        ex.printStackTrace();
750
                        return false;
751
                }
752

    
753
                return true;
754
        }
755

    
756

    
757
        /**
         * Builds the CWB indexes of a corpus from its TEI-TXM files.
         *
         * The source files are ordered by composition date (file name order
         * breaks ties), written one after the other into a single "wtc" file
         * wrapped in a txmcorpus element, then encoded with cwb-encode and
         * indexed with cwb-makeall.
         *
         * @param binDir the directory receiving the "wtc", "data" and "registry" directories
         * @param txmDir the directory containing the TEI-TXM files to compile
         * @param corpusname the corpus name, used to name the wtc file, the data directory and the registry file
         * @param metadataXPath the metadata declarations: each key is declared as a property of the
         *        "text" structure and the optional "datecompo" entry replaces the default XPath of
         *        the composition date; may be null
         * @return true, if successful
         */
        public boolean run(File binDir, File txmDir, String corpusname, Properties metadataXPath)
        {
                sattrsListener = null; // reset SAttribute Listener for each new import
                this.metadataXPath = metadataXPath;
                if (cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if (!new File(cwbLoc).exists()) {
                        println ("CWB path error: "+cwbLoc)
                        return false;
                }
                if (!txmDir.exists()) {
                        println ("TXM directory does not exist: "+txmDir)
                        return false;
                }

                // start from empty wtc and data directories; mkdirs() also creates
                // the parent directories when they are missing
                File wtcFile = new File(binDir, "wtc/${corpusname}.wtc");
                new File(binDir, "/wtc/").deleteDir();
                new File(binDir, "/wtc/").mkdirs();
                new File(binDir, "/data/${corpusname}").deleteDir();
                new File(binDir, "/data/${corpusname}").mkdirs();
                new File(binDir, "registry/").mkdirs();

                // listFiles() returns null when txmDir is not a directory or cannot be read
                File[] listed = txmDir.listFiles();
                if (listed == null) {
                        println ("Error: could not list the files of "+txmDir)
                        return false;
                }
                List<File> files = new ArrayList<File>(Arrays.asList(listed));

                // get the composition date of each text to be able to sort with it
                String xpath = "//tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when"
                if (metadataXPath != null && metadataXPath.containsKey("datecompo"))
                        xpath = metadataXPath.get("datecompo")

                final HashMap<File,Integer[]> filesiecle = new HashMap<File, Integer[]>()
                for (File f : files) {
                        // a missing or unreadable date is stored as 0/0/0 and therefore sorts first
                        Integer[] date = [0, 0, 0] as Integer[];
                        String datecompo = XPathResult.getXpathResponse(f, xpath);
                        if (datecompo != null && datecompo.trim().length() > 0) {
                                String[] split = datecompo.trim().split("-"); // yyyy[-mm[-dd]]
                                if (split.length <= 3) {
                                        try {
                                                // month and day default to 1 when the date is not complete
                                                Integer[] parsed = [0, 1, 1] as Integer[];
                                                for (int i = 0 ; i < split.length ; i++)
                                                        parsed[i] = Integer.parseInt(split[i].trim());
                                                date = parsed;
                                        } catch (NumberFormatException e) {
                                                // one malformed date must not abort the whole import
                                                println ("Warning: could not parse composition date '"+datecompo+"' of "+f.getName())
                                        }
                                }
                        }
                        filesiecle.put(f, date);
                }

                // Collections.sort is stable: the alphabetical order is kept between texts of the same date
                Collections.sort(files); // Alpha order
                Collections.sort(files, new Comparator<File>() { // Date order
                                        @Override
                                        public int compare(File o1, File o2) {
                                                Integer[] date1 = filesiecle.get(o1);
                                                Integer[] date2 = filesiecle.get(o2);
                                                for (int i = 0 ; i < 3 ; i++) { // year, then month, then day
                                                        int c = date1[i].compareTo(date2[i]);
                                                        if (c != 0) return c;
                                                }
                                                return 0;
                                        }
                                });

                this.orderedFiles = files;
                println("process "+files.size()+" files ")

                // open the txmcorpus element
                if (!createOutput(wtcFile)) {
                        println "Error: could not write wtc file"
                        return false;
                } else {
                        output.write("<txmcorpus lang=\""+lang+"\">\n");
                        output.close();
                }

                //1- Transform into WTC file
                for (File f : files) {
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                        } else {
                                print "."
                                // drop the last 4 characters of the file name (presumably the ".xml" extension)
                                String txtname = f.getName().substring(0, f.getName().length()-4);
                                def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath);
                                builder.setLang(lang)
                                if (!builder.transfomFileWtc(wtcFile)) {
                                        println "Failed to compile "+f
                                }
                                // NOTE(review): the annotation status is set after the file was compiled - confirm this order is intended
                                builder.setAnnotationDone(this.annotate_status);
                        }
                }

                // close the txmcorpus element
                if (!createOutput(wtcFile)) {
                        println "Error: could not write wtc file"
                        return false;
                } else {
                        output.write("</txmcorpus>\n");
                        output.close();
                }
                println ""

                //2- Import into CWB
                if (sattrsListener == null) {
                        // nothing was compiled: the structure declarations needed by cwb-encode are not available
                        println "Error: no structure was registered while compiling the files of "+txmDir
                        return false;
                }

                def outDir = binDir.getAbsolutePath();
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                def pAttrs = ["id","q","sp","pb","lb","orig","sic","abbr","ref","pos","supplied","lang","nametype"];
                for(String type : anaTypes)
                        pAttrs.add(type.substring(1)); // remove #

                structs = sattrsListener.getStructs();
                structsProf = sattrsListener.getProfs();
                if (debug) {
                        println structs
                        println structsProf
                }

                // properties that are always declared for a structure, even when the
                // sources did not use them
                Map<String, List<String>> requiredAttributes = [
                        "p": ["n"], "body": ["n"], "back": ["n"], "front": ["n"],
                        "q": ["n", "lang"],
                        "foreign": ["n", "lang"],
                        "ab": ["n", "subtype", "rend"],
                        "sp": ["n", "subtype", "rend"],
                        "div": ["n", "id", "type", "subtype"],
                        "name": ["n", "type"]];

                // add structures+properties found in sources
                List<String> sargs = new ArrayList<String>();
                for (String name : structs.keySet()) {
                        if (name == "text") continue; // added after
                        List<String> attributes = new ArrayList<String>();
                        for (String value : structs.get(name))
                                attributes.add(value);
                        // compare whole attribute names: a substring test would take "name" or "note" for "n"
                        for (String attr : (requiredAttributes.get(name) ?: []))
                                if (!attributes.contains(attr)) attributes.add(attr);

                        String concat = name+":"+structsProf.get(name); // append the depth
                        for (String attr : attributes) // append the attributes
                                concat += "+"+attr;
                        sargs.add(concat);
                }

                String textSAttributes = "text:0+id+base+project";
                if (metadataXPath != null) {
                        for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
                                textSAttributes+="+"+meta;
                }
                if (metadataXPath == null || !metadataXPath.keySet().contains("sigle"))
                        textSAttributes+="+sigle";

                sargs.add(textSAttributes)
                sargs.add("txmcorpus:0+lang")
                sargs.sort();

                String[] sAttributes = sargs;
                String[] pAttributes = pAttrs;
                println "P-attributes: "+pAttributes
                println "S-attributes: "+sargs

                try {
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lower case registry files
                        cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(), outDir + "/data/${corpusname}", outDir + "/wtc/"+corpusname+".wtc", regPath,pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), corpusname, outDir + "/registry");
                } catch (Exception ex) {
                        System.out.println(ex);
                        return false;
                }

                return true;
        }
974

    
975
        /**
         * Switches the debug flag on, so that the messages of the CWB
         * utilities are shown.
         */
        public void setDebug()
        {
                debug = true;
        }
982

    
983
        /**
         * Manual test entry point: compiles the files of the "xml/bfm"
         * directory of the user home with the CWB binaries of "TXM/cwb/bin".
         *
         * @param args the command line arguments (not used)
         */
        public static void main(String[] args)
        {
                // java.io.File does not expand "~": resolve the user home explicitly,
                // otherwise the paths point to a "~" directory of the working directory
                File home = new File(System.getProperty("user.home"));
                File dir = new File(home, "xml/bfm");
                // NOTE(review): assumes the class still declares a no-arg constructor - confirm
                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getPath());
                // NOTE(review): the only run(...) visible in this part of the file takes
                // (binDir, txmDir, corpusname, metadataXPath) - confirm a one-argument
                // overload exists, or update this call
                c.run(dir);
        }
996
}