Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bfm / compiler.groovy @ 966

History | View | Annotate | Download (26.7 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
22 321 mdecorde
//
23 321 mdecorde
// This file is part of the TXM platform.
24 321 mdecorde
//
25 321 mdecorde
// The TXM platform is free software: you can redistribute it and/or modify
26 321 mdecorde
// it under the terms of the GNU General Public License as published by
27 321 mdecorde
// the Free Software Foundation, either version 3 of the License, or
28 321 mdecorde
// (at your option) any later version.
29 321 mdecorde
//
30 321 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
31 321 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32 321 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 321 mdecorde
// GNU General Public License for more details.
34 321 mdecorde
//
35 321 mdecorde
// You should have received a copy of the GNU General Public License
36 321 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37 321 mdecorde
//
38 321 mdecorde
//
39 321 mdecorde
//
40 479 mdecorde
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
41 321 mdecorde
// $LastChangedRevision: 3400 $
42 321 mdecorde
// $LastChangedBy: mdecorde $
43 321 mdecorde
//
44 321 mdecorde
package org.txm.importer.bfm
45 321 mdecorde
46 321 mdecorde
import org.txm.Toolbox;
47 321 mdecorde
import org.txm.importer.cwb.*
48 321 mdecorde
import org.txm.importer.*;
49 321 mdecorde
import org.txm.scripts.*;
50 927 mdecorde
import org.txm.importer.xmltxm.BuildTTSrc;
51 927 mdecorde
import org.txm.importer.xmltxm.*;
52 321 mdecorde
import org.txm.utils.treetagger.TreeTagger;
53 321 mdecorde
54 321 mdecorde
import javax.xml.stream.*;
55 321 mdecorde
import java.net.URL;
56 321 mdecorde
import java.io.File;
57 321 mdecorde
import java.util.Comparator;
58 321 mdecorde
import java.util.HashMap;
59 321 mdecorde
import java.util.List;
60 321 mdecorde
61 321 mdecorde
// TODO: Auto-generated Javadoc
62 321 mdecorde
/**
63 804 mdecorde
 * Produce CQP files from the TEI-TXM files. <br/>
64 321 mdecorde
 * - Read texts metadata with XPath queries <br/>
65 321 mdecorde
 * - Add the following word properties : sic, abbr, orig, lb and pb <br/>
66 321 mdecorde
 * - Keep &lt;front>, &lt;body> and &lt;back> for each text <br/>
67 321 mdecorde
 * - Text enclosed in &lt;q> is tokenized <br/>
68 321 mdecorde
 *
69 321 mdecorde
 * @author mdecorde
70 321 mdecorde
 *
71 321 mdecorde
 */
72 321 mdecorde
class compiler {
73 321 mdecorde
        /** The debug. */
74 321 mdecorde
        private boolean debug= false;
75 321 mdecorde
76 321 mdecorde
        /** The annotate_status. */
77 321 mdecorde
        private boolean annotate_status=true;
78 321 mdecorde
79 321 mdecorde
        /** The input data. */
80 321 mdecorde
        private def inputData;
81 321 mdecorde
82 321 mdecorde
        /** The factory. */
83 321 mdecorde
        private def factory;
84 321 mdecorde
85 321 mdecorde
        /** The parser. */
86 321 mdecorde
        private XMLStreamReader parser;
87 321 mdecorde
88 321 mdecorde
        /** The dir. */
89 321 mdecorde
        private def dir;
90 321 mdecorde
91 321 mdecorde
        /** The output. */
92 321 mdecorde
        private def output;
93 321 mdecorde
94 321 mdecorde
        /** The url. */
95 321 mdecorde
        private def url;
96 321 mdecorde
97 321 mdecorde
        /** Static state reset by the no-argument constructor (firstWord, anaTypes), followed by the anahash map. */
98 321 mdecorde
        static boolean firstWord = true;
99 321 mdecorde
        static private def anaTypes = [];
100 321 mdecorde
        private HashMap<String,String> anahash = new HashMap<String,String>() ;
101 321 mdecorde
102 321 mdecorde
        private static SAttributesListener sattrsListener;
103 321 mdecorde
        private static HashMap<String,ArrayList<String>> structs;
104 321 mdecorde
        private static HashMap<String, Integer> structsProf;
105 321 mdecorde
106 321 mdecorde
        /** The text. */
107 321 mdecorde
        private String text="";
108 321 mdecorde
109 321 mdecorde
        /** The base. */
110 321 mdecorde
        private String base="";
111 321 mdecorde
112 321 mdecorde
        /** The project. */
113 321 mdecorde
        private String project="";
114 321 mdecorde
115 321 mdecorde
        /** The lang. */
116 321 mdecorde
        private String lang ="fr";
117 321 mdecorde
118 321 mdecorde
        /**
119 321 mdecorde
         * Contains the metadata XPath expressions, organized by name.
120 321 mdecorde
         */
121 321 mdecorde
        Properties metadataXPath;
122 321 mdecorde
123 321 mdecorde
        /**
124 321 mdecorde
         * initialize.
125 321 mdecorde
         *
126 321 mdecorde
         */
127 321 mdecorde
        public compiler() {
                // Both fields are static, so building an instance through this
                // constructor resets them for the whole class, not just this object.
                anaTypes = [];
                firstWord = true;
        }
131 321 mdecorde
132 321 mdecorde
        /**
133 321 mdecorde
         * initialize the compiler.
134 321 mdecorde
         *
135 321 mdecorde
         * @param url the file to process
136 321 mdecorde
         * @param text the text's name (written as the id attribute of the text structure)
137 321 mdecorde
         * @param base the base's name
138 321 mdecorde
         * @param project the project's name
         * @param metadataXPath the metadata XPath expressions, organized by name
139 321 mdecorde
         */
140 321 mdecorde
        public compiler(URL url,String text,String base, String project, Properties metadataXPath)
        {
                // Opens a StAX reader on the given URL and binds the shared
                // structural-attributes listener to it. Failures are reported on
                // standard output and leave the instance without a usable parser;
                // nothing is thrown for stream or XML errors.
                this.metadataXPath = metadataXPath;
                this.text = text;
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        // The listener is static: create it for the first parser,
                        // then re-bind the same instance to every later parser.
                        if (sattrsListener == null) {
                                sattrsListener = new SAttributesListener(parser);
                        } else {
                                sattrsListener.start(parser);
                        }
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while opening "+url+": "+ex);
                        // No reader was created, so nothing will ever consume the
                        // stream: release it instead of leaking the handle.
                        if (parser == null && inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                        // best effort: the original failure was already reported
                                }
                        }
                } catch (IOException ex) {
                        // Report which resource failed and why; the message used to
                        // carry neither the URL nor the exception.
                        System.out.println("IOException while parsing "+url+": "+ex);
                }
        }
162 321 mdecorde
163 321 mdecorde
        ArrayList<File> orderedFiles;
164 321 mdecorde
        public ArrayList<File> getOrderedTxmFiles()
        {
                // Hands back the orderedFiles field itself (no copy); it has no
                // initializer, so this is null until something assigns it.
                return this.orderedFiles;
        }
167 321 mdecorde
168 321 mdecorde
        /**
169 321 mdecorde
         * Sets the lang.
170 321 mdecorde
         *
171 321 mdecorde
         * @param lang the new lang
172 321 mdecorde
         */
173 321 mdecorde
        public void setLang(String lang) {
                // Overrides the language code held by this instance ("fr" by default).
                this.lang = lang;
        }
177 321 mdecorde
178 321 mdecorde
        /**
179 321 mdecorde
         * Sets the annotation done.
180 321 mdecorde
         *
181 321 mdecorde
         * @param done the new annotation done
182 321 mdecorde
         */
183 321 mdecorde
        public void setAnnotationDone(boolean done) {
                // Stored in annotate_status, which is true unless this setter is called.
                this.annotate_status = done;
        }
187 321 mdecorde
188 321 mdecorde
        /**
189 321 mdecorde
         * Creates the output.
190 321 mdecorde
         *
191 321 mdecorde
         * @param f the file to write to, encoded in UTF-8
192 321 mdecorde
         *          (opened in append mode when it already exists)
193 321 mdecorde
         * @return true, if successful
194 321 mdecorde
         */
195 321 mdecorde
        private boolean createOutput(File f)
        {
                // Opens the UTF-8 writer stored in the output field. Any failure is
                // printed and reported as false rather than thrown.
                boolean opened;
                try {
                        // Append when the file is already there, otherwise start a new one.
                        FileOutputStream stream = new FileOutputStream(f, f.exists());
                        output = new OutputStreamWriter(stream, "UTF-8");
                        opened = true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        opened = false;
                }
                return opened;
        }
204 321 mdecorde
205 321 mdecorde
        /**
206 321 mdecorde
         * Go to text.
207 321 mdecorde
         */
208 321 mdecorde
        private void GoToText() {
                // Consume parser events until the closing teiHeader tag has been
                // read, or until the document ends if there is none.
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean headerClosed = event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader");
                        if (headerClosed) {
                                return;
                        }
                        event = parser.next();
                }
        }
216 321 mdecorde
217 321 mdecorde
        /**
218 321 mdecorde
         * Increment.
219 321 mdecorde
         *
220 321 mdecorde
         * @param parser the parser
221 321 mdecorde
         * @param value the value
222 321 mdecorde
         * @return the element's n attribute when it parses as an integer, otherwise value + 1
223 321 mdecorde
         */
224 321 mdecorde
        private def increment(XMLStreamReader parser, int value) {
                // Computes the next counter value for the element the parser is on:
                // an integer n attribute wins, anything else means "previous + 1".
                String declared = null;
                int attributeCount = parser.getAttributeCount();
                for (int idx = 0; idx < attributeCount; idx++) {
                        // Only the first attribute whose local name is "n" is considered.
                        if ("n".equals(parser.getAttributeLocalName(idx))) {
                                declared = parser.getAttributeValue(idx);
                                break;
                        }
                }

                if (declared == null) {
                        return value + 1;
                }

                try {
                        return Integer.parseInt(declared);
                } catch (NumberFormatException ignored) {
                        // A non-numeric n (e.g. a roman numeral) falls back to plain counting.
                        return value + 1;
                }
        }
245 321 mdecorde
246 321 mdecorde
        /**
247 803 mdecorde
         * Transform the parsed file to CQP.
248 321 mdecorde
         *
249 321 mdecorde
         * @param cqpFile the CQP file that createOutput() opens for writing;
250 321 mdecorde
         *          processing stops with false when it cannot be opened
251 321 mdecorde
         * @return true, if successful
252 321 mdecorde
         */
253 804 mdecorde
        private boolean transfomFileCqp(File cqpFile)
254 321 mdecorde
        {
255 321 mdecorde
                try {
256 803 mdecorde
                        if (!createOutput(cqpFile)) return false;
257 321 mdecorde
258 321 mdecorde
                        String headvalue = ""
259 321 mdecorde
                        String vAna = "";
260 321 mdecorde
                        String vForm = "";
261 321 mdecorde
                        String wordid = "";
262 321 mdecorde
                        String vHead = "";
263 321 mdecorde
264 321 mdecorde
                        Integer p_id = 0;
265 321 mdecorde
                        Integer s_id = 0;
266 321 mdecorde
                        Integer q_id = 0;
267 321 mdecorde
                        int sp_id = 0;
268 321 mdecorde
                        Integer body_id = 0;
269 321 mdecorde
                        Integer front_id = 0;
270 321 mdecorde
                        Integer back_id = 0;
271 321 mdecorde
                        Integer lb_id = 0;
272 321 mdecorde
                        Integer pb_id = 0;
273 321 mdecorde
                        Integer ab_id = 0;
274 321 mdecorde
                        int foreign_id = 0;
275 321 mdecorde
                        int name_id = 0;
276 321 mdecorde
277 321 mdecorde
                        boolean captureword = false;
278 321 mdecorde
279 321 mdecorde
                        String vExpan = "";
280 321 mdecorde
                        String vCorr = "";
281 321 mdecorde
                        String vReg = "";
282 321 mdecorde
                        String vOrig = "";
283 321 mdecorde
                        String vSic = "";
284 321 mdecorde
                        String vAbbr = "";
285 321 mdecorde
                        String givenpos = "";
286 321 mdecorde
                        String pb_n = "";
287 321 mdecorde
                        String foreign_lang = "";
288 321 mdecorde
                        String nameType = "";
289 321 mdecorde
                        String anaType;
290 321 mdecorde
                        //String abType = "";
291 321 mdecorde
292 321 mdecorde
                        boolean foundtei=false, foundtext=false;
293 321 mdecorde
294 321 mdecorde
                        boolean flaglg = false;
295 321 mdecorde
                        int levelq = 0;
296 321 mdecorde
                        //boolean flagq = false;
297 321 mdecorde
                        boolean flaghead = false;
298 321 mdecorde
                        //Added:
299 321 mdecorde
                        boolean flagSp = false;
300 321 mdecorde
                        boolean flagAuthor = false;
301 321 mdecorde
                        boolean flagDate = false;
302 321 mdecorde
                        boolean flagWord = false;
303 321 mdecorde
                        boolean flagForm = false;
304 321 mdecorde
                        boolean flagAna = false;
305 321 mdecorde
306 321 mdecorde
                        boolean flagchoice = false;
307 321 mdecorde
                        boolean flagcorr = false;
308 321 mdecorde
                        boolean flagsic = false;
309 321 mdecorde
                        boolean flagreg = false;
310 321 mdecorde
                        boolean flagexpan = false;
311 321 mdecorde
                        boolean flagorig = false;
312 321 mdecorde
                        boolean flagabbr = false;
313 321 mdecorde
                        boolean flagfw = false;
314 321 mdecorde
                        //boolean flagSupplied = false;
315 321 mdecorde
                        int levelSupplied = 0;
316 321 mdecorde
                        //boolean flagSurplus = false;
317 321 mdecorde
                        boolean flagForeign = false;
318 321 mdecorde
                        //boolean flagName = false;
319 321 mdecorde
320 321 mdecorde
                        this.GoToText();
321 321 mdecorde
                        int missingId= 0
322 321 mdecorde
                        boolean USEVERSE = false; // switch default reference to verse references
323 321 mdecorde
                        String titreId; // the title to use in the reference
324 321 mdecorde
325 321 mdecorde
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
326 321 mdecorde
                                switch (event) {
327 321 mdecorde
                                        case XMLStreamConstants.START_ELEMENT:
328 321 mdecorde
329 321 mdecorde
                                                String localname = parser.getLocalName();
330 321 mdecorde
                                                if (foundtext) sattrsListener.startElement(localname);
331 321 mdecorde
332 321 mdecorde
                                                switch (localname) {
333 321 mdecorde
                                                        case "TEI":
334 321 mdecorde
                                                                foundtei = true;
335 321 mdecorde
                                                                break;
336 321 mdecorde
                                                        case "text":
337 321 mdecorde
                                                                foundtext = true;
338 321 mdecorde
                                                                sattrsListener.startElement(localname);
339 321 mdecorde
                                                                output.write("<text id=\""+text+"\"")
340 321 mdecorde
341 321 mdecorde
                                                                for (int i = 0; i < parser.getAttributeCount() ; i++) {
342 321 mdecorde
                                                                        String name = parser.getAttributeLocalName(i);
343 321 mdecorde
                                                                        if ("id" == name || "base" == name || "project" == name) continue;
344 321 mdecorde
                                                                        output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");
345 321 mdecorde
346 321 mdecorde
                                                                        if (name == "forme") {
347 321 mdecorde
                                                                                USEVERSE = (parser.getAttributeValue(i).contains("vers"))
348 321 mdecorde
                                                                        } else if (name == "sigle") {
349 321 mdecorde
                                                                                titreId = parser.getAttributeValue(i)
350 321 mdecorde
                                                                        }
351 321 mdecorde
                                                                }
352 321 mdecorde
353 321 mdecorde
                                                                output.write(" base=\""+base+"\" project=\""+project+"\">\n");
354 321 mdecorde
                                                                captureword=true;
355 321 mdecorde
                                                                break;
356 321 mdecorde
357 321 mdecorde
                                                        case "div":
358 321 mdecorde
                                                        //output.write("<div type=\""+parser.getAttributeValue(null,"type")+"\">\n");
359 321 mdecorde
                                                                String divType = "NA";
360 321 mdecorde
                                                                String divSubtype = "NA";
361 321 mdecorde
                                                                String divN = "NA";
362 321 mdecorde
                                                                String divId ="NA";
363 321 mdecorde
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++) {
364 321 mdecorde
                                                                        if(parser.getAttributeLocalName(i) == "type") {
365 321 mdecorde
                                                                                divType = parser.getAttributeValue(i);
366 321 mdecorde
                                                                        } else if(parser.getAttributeLocalName(i) == "subtype") {
367 321 mdecorde
                                                                                divSubtype = parser.getAttributeValue(i);
368 321 mdecorde
                                                                        } else if(parser.getAttributeLocalName(i) == "n") {
369 321 mdecorde
                                                                                divN = parser.getAttributeValue(i);
370 321 mdecorde
                                                                        } else if(parser.getAttributeLocalName(i) == "id") {
371 321 mdecorde
                                                                                divId = parser.getAttributeValue(i);
372 321 mdecorde
                                                                                break;
373 321 mdecorde
                                                                        }
374 321 mdecorde
                                                                }
375 321 mdecorde
                                                                output.write("<div type=\""+divType+"\" subtype=\""+divSubtype+"\" n=\""+divN+"\" id=\""+divId+"\">\n");
376 321 mdecorde
                                                                break;
377 321 mdecorde
                                                        case "p":
378 321 mdecorde
                                                                p_id = increment(parser, p_id);
379 321 mdecorde
                                                                output.write("<p n=\""+p_id+"\">\n");
380 321 mdecorde
                                                                break;
381 321 mdecorde
                                                        case "ab":
382 321 mdecorde
                                                                ab_id = increment(parser, ab_id)
383 321 mdecorde
                                                                output.write("<ab n=\""+(ab_id)+"\" type=\""+parser.getAttributeValue(null,"type")+"\" subtype=\""+parser.getAttributeValue(null,"subtype")+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
384 321 mdecorde
                                                                break;
385 321 mdecorde
                                                        case "q":
386 321 mdecorde
                                                                q_id = increment(parser, q_id)
387 321 mdecorde
                                                                output.write("<q n=\""+(q_id)+"\">\n");
388 321 mdecorde
                                                        //flagq=true;
389 321 mdecorde
                                                                levelq = levelq + 1;
390 321 mdecorde
                                                                break;
391 321 mdecorde
                                                        case "sp":
392 321 mdecorde
                                                                sp_id = increment(parser, sp_id)
393 321 mdecorde
                                                                output.write("<sp n=\""+(sp_id)+"\">\n");
394 321 mdecorde
                                                                flagSp = true;
395 321 mdecorde
                                                                break;
396 321 mdecorde
                                                        case "front":
397 321 mdecorde
                                                                front_id = increment(parser, front_id)
398 321 mdecorde
                                                                output.write("<front n=\""+front_id+"\">\n");
399 321 mdecorde
                                                                break;
400 321 mdecorde
                                                        case "body":
401 321 mdecorde
                                                                body_id= increment(parser, body_id)
402 321 mdecorde
                                                                output.write("<body n=\""+body_id+"\">\n");
403 321 mdecorde
                                                                break;
404 321 mdecorde
                                                        case "back":
405 321 mdecorde
                                                                back_id = increment(parser, back_id)
406 321 mdecorde
                                                                output.write("<back n=\""+back_id+"\">\n");
407 321 mdecorde
                                                                break;
408 321 mdecorde
                                                        case "lb":
409 321 mdecorde
                                                                lb_id = increment(parser, lb_id)
410 321 mdecorde
                                                                break;
411 321 mdecorde
                                                        case "pb":
412 321 mdecorde
                                                                pb_id = increment(parser, pb_id)
413 321 mdecorde
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
414 321 mdecorde
                                                                        if (parser.getAttributeLocalName(i) == "n") {
415 321 mdecorde
                                                                                pb_n = parser.getAttributeValue(i);
416 321 mdecorde
                                                                        }
417 321 mdecorde
                                                                }
418 321 mdecorde
                                                                break;
419 321 mdecorde
                                                        case "s":
420 321 mdecorde
                                                                s_id = increment(parser, s_id)
421 321 mdecorde
                                                                output.write("<s n=\""+s_id+"\">\n");
422 321 mdecorde
                                                                break;
423 321 mdecorde
                                                        case "choice":
424 321 mdecorde
                                                                flagchoice = true;
425 321 mdecorde
                                                                break;
426 321 mdecorde
                                                        case "corr":
427 321 mdecorde
                                                                flagcorr = true;
428 321 mdecorde
                                                                vCorr= "";
429 321 mdecorde
                                                                break;
430 321 mdecorde
                                                        case "reg":
431 321 mdecorde
                                                                flagreg = true;
432 321 mdecorde
                                                                vReg= "";
433 321 mdecorde
                                                                break;
434 321 mdecorde
                                                        case "expan":
435 321 mdecorde
                                                                flagexpan = true;
436 321 mdecorde
                                                                vExpan= "";
437 321 mdecorde
                                                                break;
438 321 mdecorde
                                                        case "orig":
439 321 mdecorde
                                                                flagreg = true;
440 321 mdecorde
                                                                vOrig= "";
441 321 mdecorde
                                                                break;
442 321 mdecorde
                                                        case "sic":
443 321 mdecorde
                                                                flagsic = true;
444 321 mdecorde
                                                                vSic= "";
445 321 mdecorde
                                                                break;
446 321 mdecorde
                                                        case "abbr":
447 321 mdecorde
                                                                flagreg = true;
448 321 mdecorde
                                                                vAbbr= "";
449 321 mdecorde
                                                                break;
450 321 mdecorde
                                                        case "foreign":
451 321 mdecorde
                                                                flagForeign = true;
452 321 mdecorde
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
453 321 mdecorde
                                                                        if (parser.getAttributeLocalName(i) == "lang") {
454 321 mdecorde
                                                                                lang = parser.getAttributeValue(i);
455 321 mdecorde
                                                                                break;
456 321 mdecorde
                                                                        }
457 321 mdecorde
                                                                }
458 321 mdecorde
459 321 mdecorde
                                                                output.write("<foreign n=\""+(foreign_id++)+"\" lang=\""+lang+"\">\n");
460 321 mdecorde
                                                        //vForeign = "";
461 321 mdecorde
                                                                break;
462 321 mdecorde
463 321 mdecorde
                                                        case "name":
464 321 mdecorde
                                                        //flagName = true;
465 321 mdecorde
                                                                for(int i = 0 ; i < parser.getAttributeCount(); i++)
466 321 mdecorde
                                                                        if(parser.getAttributeLocalName(i) == "type")
467 321 mdecorde
                                                                {
468 321 mdecorde
                                                                        nameType = parser.getAttributeValue(i);
469 321 mdecorde
                                                                        break;
470 321 mdecorde
                                                                }
471 321 mdecorde
472 321 mdecorde
                                                                output.write("<name n=\""+(name_id++)+"\" type=\""+nameType+"\">\n");
473 321 mdecorde
                                                                break;
474 321 mdecorde
                                                        case "supplied":
475 321 mdecorde
                                                        //flagSupplied = true;
476 321 mdecorde
                                                                levelSupplied = levelSupplied + 1;
477 321 mdecorde
                                                                break;
478 321 mdecorde
479 321 mdecorde
                                                        case "surplus":
480 321 mdecorde
                                                                flagfw = true;
481 321 mdecorde
                                                                break;
482 321 mdecorde
483 321 mdecorde
                                                        case "del":
484 321 mdecorde
                                                                flagfw = true;
485 321 mdecorde
                                                                break;
486 321 mdecorde
487 321 mdecorde
                                                        case "w":
488 321 mdecorde
                                                                givenpos = "";
489 321 mdecorde
                                                                wordid = "w_"+text+"_m"+missingId++
490 321 mdecorde
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
491 321 mdecorde
                                                                        if (parser.getAttributeLocalName(i) == "id") {
492 321 mdecorde
                                                                                wordid = parser.getAttributeValue(i);
493 321 mdecorde
                                                                        } else if (parser.getAttributeLocalName(i) == "type") {
494 321 mdecorde
                                                                                givenpos = parser.getAttributeValue(i);
495 321 mdecorde
                                                                        }
496 321 mdecorde
                                                                }
497 321 mdecorde
                                                                if (wordid.startsWith("w")) {
498 321 mdecorde
                                                                        if (!wordid.startsWith("w_"))
499 321 mdecorde
                                                                                wordid = "w_"+wordid.substring(1)
500 321 mdecorde
                                                                } else {
501 321 mdecorde
                                                                        wordid = "w_"+wordid;
502 321 mdecorde
                                                                }
503 321 mdecorde
504 321 mdecorde
                                                                if (givenpos == null || givenpos == "")
505 321 mdecorde
                                                                        givenpos = "NA";
506 321 mdecorde
                                                                vForm = "";
507 321 mdecorde
                                                                anahash.clear(); // remove previous word ana values
508 321 mdecorde
                                                                flagWord = true;
509 321 mdecorde
                                                                break;
510 321 mdecorde
                                                        case "form":
511 321 mdecorde
                                                                flagForm = true;
512 321 mdecorde
                                                                vForm = "";
513 321 mdecorde
                                                                break;
514 321 mdecorde
515 321 mdecorde
                                                        case "ana":
516 321 mdecorde
                                                                flagAna = true;
517 321 mdecorde
                                                                anaType = parser.getAttributeValue(null, "type")
518 321 mdecorde
                                                                anahash.put(anaType, "");
519 321 mdecorde
                                                                if (firstWord) {
520 321 mdecorde
                                                                        anaTypes << anaType;
521 321 mdecorde
                                                                }
522 321 mdecorde
                                                                break;
523 321 mdecorde
                                                }
524 321 mdecorde
                                                break;
525 321 mdecorde
526 321 mdecorde
                                        case XMLStreamConstants.END_ELEMENT:
527 321 mdecorde
                                                String localname = parser.getLocalName();
528 321 mdecorde
                                                if (foundtext) sattrsListener.endElement(localname);
529 321 mdecorde
530 321 mdecorde
                                                switch (localname) {
531 321 mdecorde
                                                        case "div":
532 321 mdecorde
                                                                output.write("</div>\n");
533 321 mdecorde
                                                                break;
534 321 mdecorde
                                                        case "text":
535 321 mdecorde
                                                                output.write("</text>\n");
536 321 mdecorde
                                                                captureword=false;
537 321 mdecorde
                                                                break;
538 321 mdecorde
                                                        case "p":
539 321 mdecorde
                                                                output.write("</p>\n");
540 321 mdecorde
                                                                break;
541 321 mdecorde
                                                        case "s":
542 321 mdecorde
                                                                output.write("</s>\n");
543 321 mdecorde
                                                                break;
544 321 mdecorde
                                                        case "ab":
545 321 mdecorde
                                                                output.write("</ab>\n");
546 321 mdecorde
                                                                break;
547 321 mdecorde
                                                        case "q":
548 321 mdecorde
                                                                output.write("</q>\n");
549 321 mdecorde
                                                        //flagq= false;
550 321 mdecorde
                                                                levelq = levelq - 1;
551 321 mdecorde
                                                                break;
552 321 mdecorde
                                                        case "sp":
553 321 mdecorde
                                                                output.write("</sp>\n");
554 321 mdecorde
                                                                flagSp = false;
555 321 mdecorde
                                                                break;
556 321 mdecorde
                                                        case "front":
557 321 mdecorde
                                                                output.write("</front>\n");
558 321 mdecorde
                                                                break;
559 321 mdecorde
                                                        case "body":
560 321 mdecorde
                                                                output.write("</body>\n");
561 321 mdecorde
                                                                break;
562 321 mdecorde
                                                        case "back":
563 321 mdecorde
                                                                output.write("</back>\n");
564 321 mdecorde
                                                                break;
565 321 mdecorde
566 321 mdecorde
                                                        //                                                        case "fw":
567 321 mdecorde
                                                        //                                                        flagfw = false;
568 321 mdecorde
                                                        //                                                        break;
569 321 mdecorde
570 321 mdecorde
                                                        case "choice":
571 321 mdecorde
                                                                if(vOrig == "")
572 321 mdecorde
                                                                        vOrig="NA";
573 321 mdecorde
                                                                if(vSic == "")
574 321 mdecorde
                                                                        vSic="NA";
575 321 mdecorde
                                                                if(vAbbr == "")
576 321 mdecorde
                                                                        vAbbr="NA";
577 321 mdecorde
578 321 mdecorde
                                                                String ref;
579 321 mdecorde
                                                                if(USEVERSE)
580 321 mdecorde
                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
581 321 mdecorde
                                                                else
582 321 mdecorde
                                                                        ref = titreId+", p."+pb_n;
583 321 mdecorde
584 321 mdecorde
                                                                if (flagfw) {
585 321 mdecorde
                                                                        // outside the text (flagfw is set): nothing is written
586 321 mdecorde
                                                                } else {
587 321 mdecorde
                                                                        String vFormToWrite = vForm;
588 321 mdecorde
                                                                        if (vCorr != "") {
589 321 mdecorde
                                                                                vFormToWrite = vCorr;
590 321 mdecorde
                                                                        } else if(vReg != "") {
591 321 mdecorde
                                                                                vFormToWrite = vReg;
592 321 mdecorde
                                                                        } else if(vExpan != "") {
593 321 mdecorde
                                                                                vFormToWrite = vExpan
594 321 mdecorde
                                                                        }
595 321 mdecorde
                                                                        firstWord = false;
596 321 mdecorde
                                                                        output.write( vFormToWrite +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+
597 321 mdecorde
                                                                                        "\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
598 321 mdecorde
                                                                        for(String type : anaTypes) {
599 321 mdecorde
                                                                                output.write("\t"+anahash.get(type));
600 321 mdecorde
                                                                        }
601 321 mdecorde
                                                                        output.write("\n")
602 321 mdecorde
                                                                }
603 321 mdecorde
                                                                flagchoice = false;
604 321 mdecorde
                                                                vCorr= "";
605 321 mdecorde
                                                                vSic= "";
606 321 mdecorde
                                                                break;
607 321 mdecorde
                                                        case "corr":
608 321 mdecorde
                                                                flagcorr = false;
609 321 mdecorde
610 321 mdecorde
                                                                break;
611 321 mdecorde
                                                        case "reg":
612 321 mdecorde
                                                                flagreg = false;
613 321 mdecorde
                                                                vReg = "";
614 321 mdecorde
                                                                break;
615 321 mdecorde
                                                        case "expan":
616 321 mdecorde
                                                                flagexpan = false;
617 321 mdecorde
                                                                vExpan= "";
618 321 mdecorde
                                                                break;
619 321 mdecorde
                                                        case "orig":
620 321 mdecorde
                                                                flagreg = false;
621 321 mdecorde
                                                                vOrig= "";
622 321 mdecorde
                                                                break;
623 321 mdecorde
                                                        case "sic":
624 321 mdecorde
                                                                flagsic = false;
625 321 mdecorde
626 321 mdecorde
                                                                break;
627 321 mdecorde
                                                        case "abbr":
628 321 mdecorde
                                                                flagreg = false;
629 321 mdecorde
                                                                vAbbr= "";
630 321 mdecorde
                                                                break;
631 321 mdecorde
632 321 mdecorde
                                                        case "foreign":
633 321 mdecorde
                                                                flagForeign = false;
634 321 mdecorde
                                                                lang = "";
635 321 mdecorde
                                                                output.write("</foreign>\n");
636 321 mdecorde
                                                                break;
637 321 mdecorde
638 321 mdecorde
                                                        case "name":
639 321 mdecorde
                                                        //flagName = false;
640 321 mdecorde
                                                                nameType = "";
641 321 mdecorde
                                                                output.write("</name>\n");
642 321 mdecorde
                                                                break;
643 321 mdecorde
644 321 mdecorde
                                                        case "supplied":
645 321 mdecorde
                                                        //flagSupplied = false;
646 321 mdecorde
                                                                levelSupplied = levelSupplied - 1;
647 321 mdecorde
                                                                break;
648 321 mdecorde
649 321 mdecorde
                                                        case "surplus":
650 321 mdecorde
                                                                flagfw = false;
651 321 mdecorde
                                                                break;
652 321 mdecorde
653 321 mdecorde
                                                        case "del":
654 321 mdecorde
                                                                flagfw = false;
655 321 mdecorde
                                                                break;
656 321 mdecorde
657 321 mdecorde
                                                        case "w":
658 321 mdecorde
                                                                if (captureword) {
659 321 mdecorde
                                                                        if (flagchoice) {
660 321 mdecorde
661 321 mdecorde
                                                                        } else if(flagfw) {
662 321 mdecorde
663 321 mdecorde
                                                                        } else {
664 321 mdecorde
                                                                                if (vOrig == "")
665 321 mdecorde
                                                                                        vOrig="NA";
666 321 mdecorde
                                                                                if(vSic == "")
667 321 mdecorde
                                                                                        vSic="NA";
668 321 mdecorde
                                                                                if(vAbbr == "")
669 321 mdecorde
                                                                                        vAbbr="NA";
670 321 mdecorde
                                                                                if (nameType == "")
671 321 mdecorde
                                                                                        nameType = "NA";
672 321 mdecorde
                                                                                if(lang == "")
673 321 mdecorde
                                                                                        lang="fr"
674 321 mdecorde
675 321 mdecorde
                                                                                String ref;
676 321 mdecorde
                                                                                if(USEVERSE)
677 321 mdecorde
                                                                                        ref = titreId+", p."+pb_n+", v."+lb_id;
678 321 mdecorde
                                                                                else
679 321 mdecorde
                                                                                        ref = titreId+", p."+pb_n;
680 321 mdecorde
681 321 mdecorde
                                                                                firstWord = false;
682 321 mdecorde
                                                                                output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType);
683 321 mdecorde
                                                                                for(String type : anaTypes) {
684 321 mdecorde
                                                                                        output.write("\t"+anahash.get(type));
685 321 mdecorde
                                                                                }
686 321 mdecorde
                                                                                output.write("\n")
687 321 mdecorde
                                                                        }
688 321 mdecorde
689 321 mdecorde
                                                                        flagWord = false;
690 321 mdecorde
                                                                }
691 321 mdecorde
                                                                break;
692 321 mdecorde
693 321 mdecorde
                                                        case "form":
694 321 mdecorde
                                                                flagForm = false;
695 321 mdecorde
                                                                break;
696 321 mdecorde
697 321 mdecorde
                                                        case "ana":
698 321 mdecorde
                                                                flagAna = false;
699 321 mdecorde
                                                                anahash.put(anaType, vAna);
700 321 mdecorde
                                                                vAna = "";
701 321 mdecorde
                                                                break;
702 321 mdecorde
                                                }
703 321 mdecorde
                                                break; // end elem
704 321 mdecorde
705 321 mdecorde
                                        case XMLStreamConstants.CHARACTERS:
706 321 mdecorde
                                                if (flagAna) {
707 321 mdecorde
                                                        vAna += parser.getText().trim()
708 321 mdecorde
                                                }
709 321 mdecorde
710 321 mdecorde
                                                if (flagForm) {
711 321 mdecorde
                                                        vForm += parser.getText().trim();
712 321 mdecorde
                                                        if (flagchoice) {
713 321 mdecorde
                                                                if (flagsic) {
714 321 mdecorde
                                                                        vSic += parser.getText().trim();
715 321 mdecorde
                                                                }
716 321 mdecorde
                                                                if (flagorig) {
717 321 mdecorde
                                                                        vOrig += parser.getText().trim();
718 321 mdecorde
                                                                }
719 321 mdecorde
                                                                if (flagabbr) {
720 321 mdecorde
                                                                        vAbbr += parser.getText().trim();
721 321 mdecorde
                                                                }
722 321 mdecorde
                                                                if (flagcorr) {
723 321 mdecorde
                                                                        vCorr += parser.getText().trim();
724 321 mdecorde
                                                                }
725 321 mdecorde
                                                        }
726 321 mdecorde
                                                }
727 321 mdecorde
                                }
728 321 mdecorde
                        }
729 321 mdecorde
                        //output.write("</txmcorpus>");
730 321 mdecorde
                        output.close();
731 321 mdecorde
                        parser.close();
732 321 mdecorde
                        inputData.close();
733 321 mdecorde
                } catch (Exception ex) {
734 321 mdecorde
                        System.out.println("Exception while parsing " + inputData);
735 321 mdecorde
                        ex.printStackTrace();
736 321 mdecorde
                        return false;
737 321 mdecorde
                }
738 321 mdecorde
739 321 mdecorde
                return true;
740 321 mdecorde
        }
741 321 mdecorde
742 321 mdecorde
743 321 mdecorde
        /**
744 321 mdecorde
         * Run.
745 321 mdecorde
         *
746 321 mdecorde
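         * @param binDir the directory under which the cqp, data/&lt;corpusname&gt; and registry directories are created
746 321 mdecorde
         * @param txmDir the directory that contains the TEI-TXM files
         * @param corpusname the corpus name, used to name the CQP file and the data directory
         * @param metadataXPath the metadata XPath properties; its "datecompo" entry, if present, replaces the default composition date XPath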
748 321 mdecorde
         * @return true, if successful
749 321 mdecorde
         */
750 321 mdecorde
        public boolean run(File binDir, File txmDir, String corpusname, Properties metadataXPath)
751 321 mdecorde
        {
752 321 mdecorde
                sattrsListener = null; // reset SAttribute Listener for each new import
753 321 mdecorde
                this.metadataXPath = metadataXPath;
754 321 mdecorde
755 714 mdecorde
                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
756 714 mdecorde
                        println ("Error: CWB executables not well set.")
757 321 mdecorde
                        return false;
758 321 mdecorde
                }
759 321 mdecorde
                if (!txmDir.exists()) {
760 321 mdecorde
                        println ("binary directory does not exists: "+txmDir)
761 321 mdecorde
                        return false;
762 321 mdecorde
                }
763 321 mdecorde
764 803 mdecorde
                File cqpFile = new File(binDir, "cqp/${corpusname}.cqp");
765 803 mdecorde
                new File(binDir, "/cqp/").deleteDir();
766 803 mdecorde
                new File(binDir, "/cqp/").mkdir();
767 321 mdecorde
                new File(binDir, "/data/${corpusname}").deleteDir();
768 321 mdecorde
                new File(binDir, "/data/${corpusname}").mkdir();
769 321 mdecorde
                new File(binDir, "registry/").mkdir();
770 321 mdecorde
771 321 mdecorde
                String textid = "";
772 321 mdecorde
                int counttext = 0;
773 321 mdecorde
                List<File> files = txmDir.listFiles();
774 321 mdecorde
775 321 mdecorde
                // read the composition date of each text (0-0-0 when not found) so that the files can be sorted by date
776 321 mdecorde
                HashMap<File,Integer[]> filesiecle = new HashMap<File, Integer[]>()
777 321 mdecorde
                for (File f : files) {
778 321 mdecorde
                        Integer[] date = new Integer[3];
779 321 mdecorde
                        date[0] = date[1] = date[2] = 0;
780 321 mdecorde
                        String xpath = "//tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when"
781 321 mdecorde
                        if (metadataXPath.containsKey("datecompo"))
782 321 mdecorde
                                xpath = metadataXPath.get("datecompo")
783 321 mdecorde
                        String datecompo = XPathResult.getXpathResponse(f, xpath);
784 321 mdecorde
                        if (datecompo != null) {
785 321 mdecorde
                                //println f.getName()+" > "+datecompo
786 321 mdecorde
                                String[] split = datecompo.split("-"); // yyyy-mm-dd
787 321 mdecorde
                                if (split.length == 3) {
788 321 mdecorde
                                        date[0] = Integer.parseInt(split[0]);
789 321 mdecorde
                                        date[1] = Integer.parseInt(split[1]);
790 321 mdecorde
                                        date[2] = Integer.parseInt(split[2]);
791 321 mdecorde
                                }
792 321 mdecorde
                                else if (split.length == 1) { // yyyy
793 321 mdecorde
                                        date[0] = Integer.parseInt(split[0]);
794 321 mdecorde
                                        date[1] = 1;
795 321 mdecorde
                                        date[2] = 1;
796 321 mdecorde
                                }
797 321 mdecorde
                        }
798 321 mdecorde
                        filesiecle.put(f, date);
799 321 mdecorde
                }
800 321 mdecorde
                //println "date compos: "+filesiecle
801 321 mdecorde
                Collections.sort(files); // Alpha order
802 321 mdecorde
                Collections.sort(files, new Comparator<File>() { // Date order
803 321 mdecorde
                                        @Override
804 321 mdecorde
                                        public int compare(File o1, File o2) {
805 321 mdecorde
                                                Integer[] date1 = filesiecle.get(o1);
806 321 mdecorde
                                                Integer[] date2 = filesiecle.get(o2);
807 321 mdecorde
                                                if (date1[0] < date2[0]) {
808 321 mdecorde
                                                        return -1;
809 321 mdecorde
                                                } else if(date1[0] > date2[0]) {
810 321 mdecorde
                                                        return 1;
811 321 mdecorde
                                                }
812 321 mdecorde
813 321 mdecorde
                                                if (date1[1] < date2[1]) {
814 321 mdecorde
                                                        return -1;
815 321 mdecorde
                                                } else if(date1[1] > date2[1]) {
816 321 mdecorde
                                                        return 1;
817 321 mdecorde
                                                }
818 321 mdecorde
819 321 mdecorde
                                                if (date1[2] < date2[2]) {
820 321 mdecorde
                                                        return -1;
821 321 mdecorde
                                                } else if(date1[2] > date2[2]) {
822 321 mdecorde
                                                        return 1;
823 321 mdecorde
                                                }
824 321 mdecorde
825 321 mdecorde
                                                return 0;
826 321 mdecorde
                                        }
827 321 mdecorde
                                });
828 321 mdecorde
829 321 mdecorde
                this.orderedFiles = files;
830 321 mdecorde
                println("process "+files.size()+" files ")
831 321 mdecorde
                //println("files: $files")
832 321 mdecorde
                //write txmcorpus
833 803 mdecorde
                if (!createOutput(cqpFile)) {
834 803 mdecorde
                        println "Error: could not write cqp file"
835 321 mdecorde
                        return false;
836 321 mdecorde
                } else {
837 321 mdecorde
                        output.write("<txmcorpus lang=\""+lang+"\">\n");
838 321 mdecorde
                        output.close();
839 321 mdecorde
                }
840 321 mdecorde
841 804 mdecorde
                //1- Transform into CQP file
842 321 mdecorde
                for (File f : files) {
843 321 mdecorde
                        counttext++;
844 321 mdecorde
                        if (!f.exists()) {
845 321 mdecorde
                                println("file "+f+ " does not exists")
846 321 mdecorde
                        } else {
847 321 mdecorde
                                print "."
848 321 mdecorde
                                String txtname = f.getName().substring(0, f.getName().length()-4);
849 321 mdecorde
                                def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath);
850 321 mdecorde
                                builder.setLang(lang)
851 804 mdecorde
                                if (!builder.transfomFileCqp(cqpFile)) {
852 321 mdecorde
                                        println "Failed to compile "+f
853 321 mdecorde
                                }
854 321 mdecorde
                                builder.setAnnotationDone(this.annotate_status);
855 321 mdecorde
                        }
856 321 mdecorde
                }
857 321 mdecorde
858 321 mdecorde
                //close txmcorpus
859 803 mdecorde
                if (!createOutput(cqpFile)) {
860 803 mdecorde
                        println "Error: could not write cqp file"
861 321 mdecorde
                        return false;
862 321 mdecorde
                } else {
863 321 mdecorde
                        output.write("</txmcorpus>\n");
864 321 mdecorde
                        output.close();
865 321 mdecorde
                }
866 321 mdecorde
                println ""
867 321 mdecorde
868 321 mdecorde
                //2- Import into CWB
869 321 mdecorde
                def outDir = binDir.getAbsolutePath();;
870 321 mdecorde
                CwbEncode cwbEn = new CwbEncode();
871 321 mdecorde
                cwbEn.setDebug(debug);
872 321 mdecorde
                CwbMakeAll cwbMa = new CwbMakeAll();
873 321 mdecorde
                cwbMa.setDebug(debug);
874 321 mdecorde
875 321 mdecorde
                def pAttrs = ["id","q","sp","pb","lb","orig","sic","abbr","ref","pos","supplied","lang","nametype"];
876 321 mdecorde
                for(String type : anaTypes)
877 321 mdecorde
                        pAttrs.add(type.substring(1)); // remove #
878 321 mdecorde
879 321 mdecorde
                structs = sattrsListener.getStructs();
880 321 mdecorde
                structsProf = sattrsListener.getProfs();
881 321 mdecorde
                if (debug) {
882 321 mdecorde
                        println structs
883 321 mdecorde
                        println structsProf
884 321 mdecorde
                }
885 321 mdecorde
                // add structures+properties found in sources
886 321 mdecorde
                List<String> sargs = new ArrayList<String>();
887 321 mdecorde
                for (String name : structs.keySet()) {
888 321 mdecorde
                        if ( name == "text") continue; // added after
889 321 mdecorde
                        //if ( name == "q") continue; // added after
890 321 mdecorde
                        //if ( name == "foreign") continue; // added after
891 321 mdecorde
                        String concat = name+":"+structsProf.get(name); // append the depth
892 321 mdecorde
                        for (String value : structs.get(name)) // append the attributes
893 321 mdecorde
                                concat += "+"+value;
894 321 mdecorde
                        if ((name == "p" || name == "body" || name == "back" || name == "front") &&
895 321 mdecorde
                        !(concat.endsWith("+n") || concat.contains("+n+")))
896 321 mdecorde
                                concat += "+n"
897 321 mdecorde
                        sargs.add(concat);
898 321 mdecorde
                }
899 321 mdecorde
900 321 mdecorde
                String textSAttributes = "text:0+id+base+project";
901 321 mdecorde
                if (metadataXPath != null) {
902 321 mdecorde
                        for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
903 321 mdecorde
                                textSAttributes+="+"+meta;
904 321 mdecorde
                }
905 321 mdecorde
                if (!metadataXPath.keySet().contains("sigle"))
906 321 mdecorde
                        textSAttributes+="+sigle";
907 321 mdecorde
908 321 mdecorde
                sargs.add(textSAttributes)
909 321 mdecorde
                sargs.add("txmcorpus:0+lang")
910 321 mdecorde
                //sargs.add("q:0+n+lang")
911 321 mdecorde
                for (int c = 0 ; c < sargs.size() ; c++) {
912 321 mdecorde
                        String sarg = sargs.get(c);
913 321 mdecorde
                        if (sarg.startsWith("q:")) {
914 321 mdecorde
                                if (! sarg.contains("+n")) sarg +="+n"
915 321 mdecorde
                                if (! sarg.contains("+lang")) sarg +="+lang"
916 321 mdecorde
917 321 mdecorde
                                sargs.set(c, sarg);
918 321 mdecorde
                        } else if(sarg.startsWith("foreign:")) {
919 321 mdecorde
                                if (! sarg.contains("+n")) sarg +="+n"
920 321 mdecorde
                                if (! sarg.contains("+lang")) sarg +="+lang"
921 321 mdecorde
                                sargs.set(c, sarg);
922 321 mdecorde
                        } else if(sarg.startsWith("ab:") || sarg.startsWith("sp:")) {
923 321 mdecorde
                                if (! sarg.contains("+n")) sarg +="+n"
924 321 mdecorde
                                if (! sarg.contains("+subtype")) sarg +="+subtype"
925 321 mdecorde
                                if (! sarg.contains("+rend")) sarg +="+rend"
926 321 mdecorde
                                sargs.set(c, sarg);
927 321 mdecorde
                        }  else if(sarg.startsWith("div:")) {
928 321 mdecorde
                                if (! sarg.contains("+n")) sarg +="+n"
929 321 mdecorde
                                if (! sarg.contains("+id")) sarg +="+id"
930 321 mdecorde
                                if (! sarg.contains("+type")) sarg +="+type"
931 321 mdecorde
                                if (! sarg.contains("+subtype")) sarg +="+subtype"
932 321 mdecorde
                                sargs.set(c, sarg);
933 321 mdecorde
                        } else if(sarg.startsWith("name:")) {
934 321 mdecorde
                                if (! sarg.contains("+n")) sarg +="+n"
935 321 mdecorde
                                if (! sarg.contains("+type")) sarg +="+type"
936 321 mdecorde
                                sargs.set(c, sarg);
937 321 mdecorde
                        }
938 321 mdecorde
                }
939 321 mdecorde
                sargs.sort();
940 321 mdecorde
941 321 mdecorde
                String[] sAttributes = sargs;
942 321 mdecorde
                String[] pAttributes = pAttrs;
943 321 mdecorde
                println "P-attributes: "+pAttributes
944 321 mdecorde
                println "S-attributes: "+sargs
945 321 mdecorde
946 321 mdecorde
                try {
947 321 mdecorde
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lower case registry files
948 803 mdecorde
                        cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
949 321 mdecorde
                        if (!new File(regPath).exists()) {
950 321 mdecorde
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
951 321 mdecorde
                                return false;
952 321 mdecorde
                        }
953 714 mdecorde
                        cwbMa.run(corpusname, outDir + "/registry");
954 321 mdecorde
                } catch (Exception ex) {System.out.println(ex);return false;}
955 321 mdecorde
956 321 mdecorde
                return true;
957 321 mdecorde
        }
958 321 mdecorde
959 321 mdecorde
        /**
         * Switches this compiler into debug mode.
         *
         * Once set, the flag is handed to the CwbEncode and CwbMakeAll
         * helpers through their own setDebug(...) calls, and the corpus
         * build step also prints the structures and structure depths it
         * collected. This method only turns the flag on; it offers no way
         * to turn it back off.
         */
        public void setDebug() {
                debug = true;
        }
966 321 mdecorde
967 321 mdecorde
        /**
         * Manual test entry point: compiles the BFM sources found in
         * {@code <user.home>/xml/bfm} with debug output enabled, using the
         * CWB binaries expected in {@code <user.home>/TXM/cwb/bin}.
         *
         * @param args the command-line arguments (currently unused)
         */
        public static void main(String[] args)
        {
                // "~" is a shell shortcut that the JVM does not expand: new File("~/...")
                // would point to a literal "~" directory under the working directory.
                // Resolve the user's home directory explicitly instead.
                String home = System.getProperty("user.home");

                File dir = new File(home, "xml/bfm");
                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getPath());
                c.run(dir);
        }
980 321 mdecorde
}