Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / RGAQCJ / compiler.groovy @ 479

History | View | Annotate | Download (14.5 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25 321 mdecorde
// $LastChangedRevision: 3400 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 321 mdecorde
package org.txm.importer.RGAQCJ
29 321 mdecorde
30 321 mdecorde
31 321 mdecorde
32 321 mdecorde
import org.txm.importer.cwb.CwbEncode
33 321 mdecorde
import org.txm.importer.cwb.CwbMakeAll
34 321 mdecorde
import org.txm.importer.*;
35 321 mdecorde
import org.txm.scripts.*;
36 321 mdecorde
import org.txm.scripts.teitxm.BuildTTSrc;
37 321 mdecorde
import org.txm.scripts.teitxm.*;
38 321 mdecorde
import org.txm.utils.treetagger.TreeTagger;
39 321 mdecorde
40 321 mdecorde
import javax.xml.stream.*;
41 321 mdecorde
import java.net.URL;
42 321 mdecorde
import java.io.File;
43 321 mdecorde
import java.util.HashMap;
44 321 mdecorde
import java.util.List;
45 321 mdecorde
46 321 mdecorde
// TODO: Auto-generated Javadoc
47 321 mdecorde
/**
48 321 mdecorde
 * The Class compiler.
49 321 mdecorde
 */
50 321 mdecorde
class compiler
51 321 mdecorde
{
52 321 mdecorde
53 321 mdecorde
        /** Debug flag, initialised to false; it is not read anywhere in the visible part of this file. */
54 321 mdecorde
        private boolean debug= false;
55 321 mdecorde
56 321 mdecorde
        /** Input stream obtained from {@code url.openStream()} in the four-argument constructor. */
57 321 mdecorde
        private def inputData;
58 321 mdecorde
59 321 mdecorde
        /** The {@code XMLInputFactory} instance used by the four-argument constructor to create the parser. */
60 321 mdecorde
        private def factory;
61 321 mdecorde
62 321 mdecorde
        /** StAX reader over the input stream; advanced by GoToText() and transfomFileWtc(). */
63 321 mdecorde
        private XMLStreamReader parser;
64 321 mdecorde
65 321 mdecorde
        /** Not assigned or read in the visible part of this file. */
66 321 mdecorde
        private def dir;
67 321 mdecorde
68 321 mdecorde
        /** Writer created by createOutput(): a UTF-8 {@code OutputStreamWriter} over the target file. */
69 321 mdecorde
        private def output;
70 321 mdecorde
71 321 mdecorde
        /** URL of the XML source, as passed to the four-argument constructor. */
72 321 mdecorde
        private def url;
73 321 mdecorde
74 321 mdecorde
        /** String-to-String map, created empty; it is not used in the visible part of this file. */
75 321 mdecorde
        private HashMap<String,String> anahash =new HashMap<String,String>() ;
76 321 mdecorde
77 321 mdecorde
        /** Value written as the {@code id} attribute of the output {@code <text>} element. */
78 321 mdecorde
        String text="";
79 321 mdecorde
80 321 mdecorde
        /** Value written as the {@code base} attribute of the output {@code <text>} element. */
81 321 mdecorde
        String base="";
82 321 mdecorde
83 321 mdecorde
        /** Value written as the {@code project} attribute of the output {@code <text>} element. */
84 321 mdecorde
        String project="";
85 321 mdecorde
86 321 mdecorde
        /** Path stored by setCwbPath(); it is stored even when the path does not exist on disk. */
87 321 mdecorde
        String cwbLoc;
88 321 mdecorde
89 321 mdecorde
        /** Language code written as the {@code lang} attribute of {@code <txmcorpus>}; defaults to "fr". */
90 321 mdecorde
        private String lang ="fr";
91 321 mdecorde
92 321 mdecorde
        /**
93 321 mdecorde
         * initialize.
94 321 mdecorde
         *
95 321 mdecorde
         */
96 321 mdecorde
        public compiler()
        {
                // Intentionally empty: every field keeps its declared default, so no
                // input stream is opened and no XML parser is created here.
        }
97 321 mdecorde
98 321 mdecorde
        /**
99 321 mdecorde
         * Instantiates a new compiler.
100 321 mdecorde
         *
101 321 mdecorde
         * @param url the url
102 321 mdecorde
         * @param text the text
103 321 mdecorde
         * @param base the base
104 321 mdecorde
         * @param project the project
105 321 mdecorde
         */
106 321 mdecorde
        public compiler(URL url,String text,String base, String project)
        {
                // Stores the source URL and the three identifiers (written later as
                // attributes of the output <text> element), then opens the URL and
                // creates a StAX reader over it. Failures are reported on stdout and
                // leave the parser field unset; nothing is rethrown to the caller.
                this.text = text;
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // The reader could not be created, so nothing will ever consume
                        // the stream opened above: release it instead of leaking it.
                        try {
                                inputData?.close();
                        } catch (IOException ignored) {
                                // Best effort only: the original failure was already reported.
                        }
                } catch (IOException ex) {
                        // Say which source failed and why, instead of a bare message.
                        System.out.println("IOException while parsing "+url+" : "+ex);
                }
        }
123 321 mdecorde
124 321 mdecorde
        /**
125 321 mdecorde
         * set the language of the corpus.
126 321 mdecorde
         *
127 321 mdecorde
         * @param lang the lang
128 321 mdecorde
         * @return the value assigned to the lang field (Groovy returns the method's last expression)
129 321 mdecorde
         */
130 321 mdecorde
        public setLang(String lang)
        {
                // No return type is declared, so Groovy hands back the value of the
                // last expression; the assignment is returned explicitly to show that.
                return (this.lang = lang);
        }
134 321 mdecorde
135 321 mdecorde
        /**
136 321 mdecorde
         * Sets the cwb path.
137 321 mdecorde
         *
138 321 mdecorde
         * @param path the new cwb path
139 321 mdecorde
         */
140 321 mdecorde
        public void setCwbPath(String path)
        {
                // A missing location only produces a warning on stderr;
                // the path is stored in cwbLoc in either case.
                File candidate = new File(path);
                boolean missing = !candidate.exists();
                if (missing) {
                        System.err.println("CWB Path : "+path+" does not exists");
                }
                this.cwbLoc = path;
        }
146 321 mdecorde
147 321 mdecorde
        /**
148 321 mdecorde
         * Creates the output.
149 321 mdecorde
         *
150 321 mdecorde
         * @param dirPathName the dir path name
151 321 mdecorde
         * @param fileName the file name
152 321 mdecorde
         * @return true, if successful
153 321 mdecorde
         */
154 321 mdecorde
        private boolean createOutput(String dirPathName, String fileName){
                // Opens the output writer on dirPathName/fileName as UTF-8 and stores it
                // in the output field. Any exception is reported on stdout and turned
                // into a false result rather than propagated.
                boolean created;
                try {
                        File target = new File(dirPathName, fileName);
                        // An existing file is appended to; otherwise a new one is created.
                        boolean append = target.exists();
                        FileOutputStream rawStream = new FileOutputStream(target, append);
                        output = new OutputStreamWriter(rawStream, "UTF-8");
                        created = true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        created = false;
                }
                return created;
        }
164 321 mdecorde
165 321 mdecorde
        /**
166 321 mdecorde
         * Go to text.
167 321 mdecorde
         */
168 321 mdecorde
        private void GoToText()
        {
                // Consumes parser events up to and including the closing tag of
                // <teiHeader>; if no such tag is found, stops once END_DOCUMENT is read.
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT)
                {
                        boolean closesHeader = (event == XMLStreamConstants.END_ELEMENT) && parser.getLocalName().equals("teiHeader");
                        if (closesHeader) {
                                return;
                        }
                        event = parser.next();
                }
        }
177 321 mdecorde
178 321 mdecorde
        /**
179 321 mdecorde
         * Transfom file wtc.
180 321 mdecorde
         *
181 321 mdecorde
         * @param dirPathName the dir path name
182 321 mdecorde
         * @param fileName the file name
183 321 mdecorde
         * @return true, if successful
184 321 mdecorde
         */
185 321 mdecorde
        public boolean transfomFileWtc(String dirPathName, String fileName)
186 321 mdecorde
        {
187 321 mdecorde
                createOutput(dirPathName, fileName);
188 321 mdecorde
189 321 mdecorde
                String headvalue=""
190 321 mdecorde
                String vAna = "";
191 321 mdecorde
                String vForm = "";
192 321 mdecorde
                String wordid= "";
193 321 mdecorde
                String vHead = "";
194 321 mdecorde
195 321 mdecorde
                int p_id = 0;
196 321 mdecorde
                int q_id = 0;
197 321 mdecorde
                int body_id = 0;
198 321 mdecorde
                int front_id = 0;
199 321 mdecorde
                int back_id = 0;
200 321 mdecorde
                String lb_id = 0;
201 321 mdecorde
                int lb_count = 0;
202 321 mdecorde
                String pb_id = 0;
203 321 mdecorde
                int pb_count = 0;
204 321 mdecorde
                String ab_id = 0;
205 321 mdecorde
206 321 mdecorde
                boolean captureword = false;
207 321 mdecorde
208 321 mdecorde
                String vExpan = "";
209 321 mdecorde
                String vCorr = "";
210 321 mdecorde
                String vReg = "";
211 321 mdecorde
                String vOrig = "";
212 321 mdecorde
                String vSic = "";
213 321 mdecorde
                String vAbbr = "";
214 321 mdecorde
215 321 mdecorde
                boolean flaglg = false;
216 321 mdecorde
                boolean flaghead = false;
217 321 mdecorde
                boolean flagAuthor = false;
218 321 mdecorde
                boolean flagDate = false;
219 321 mdecorde
                boolean flagWord = false;
220 321 mdecorde
                boolean flagForm = false;
221 321 mdecorde
                boolean flagAna = false;
222 321 mdecorde
223 321 mdecorde
                boolean flagchoice = false;
224 321 mdecorde
                boolean flagcorr = false;
225 321 mdecorde
                boolean flagsic = false;
226 321 mdecorde
                boolean flagreg = false;
227 321 mdecorde
                boolean flagexpan = false;
228 321 mdecorde
                boolean flagorig = false;
229 321 mdecorde
                boolean flagabbr = false;
230 321 mdecorde
                boolean flagfw = false;
231 321 mdecorde
232 321 mdecorde
                File xpathfile = new File(url.getFile());
233 321 mdecorde
                String titreId = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='reference']/text()");
234 321 mdecorde
                String auteur = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/text()");
235 321 mdecorde
                String datecompo = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when");
236 321 mdecorde
                String ssiecle = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo_sous_siecle']/@n");
237 321 mdecorde
                String domaine = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/tei:domain/@type");
238 321 mdecorde
                String genre = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/@n");
239 321 mdecorde
                String forme = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textClass/tei:catRef/@target[contains(.,'forme')]").substring(1);
240 321 mdecorde
                String dialecte = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:region[@type='dialecte_auteur']/text()");
241 321 mdecorde
242 321 mdecorde
                this.GoToText()
243 321 mdecorde
                output.write("<txmcorpus lang=\""+lang+"\">\n");
244 321 mdecorde
                try
245 321 mdecorde
                {
246 321 mdecorde
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
247 321 mdecorde
                        {
248 321 mdecorde
                                switch (event)
249 321 mdecorde
                                {
250 321 mdecorde
                                        case XMLStreamConstants.START_ELEMENT:
251 321 mdecorde
                                                switch (parser.getLocalName())
252 321 mdecorde
                                                {
253 321 mdecorde
                                                        case "text":
254 321 mdecorde
                                                        output.write("<text id=\""+text+"\"" +
255 321 mdecorde
                                                        " titre=\""+titreId+"\"" +
256 321 mdecorde
                                                        " auteur=\""+auteur+"\"" +
257 321 mdecorde
                                                        " datecompo=\""+datecompo+"\"" +
258 321 mdecorde
                                                        " ssiecle=\""+ssiecle+"\"" +
259 321 mdecorde
                                                        " domaine=\""+domaine+"\"" +
260 321 mdecorde
                                                        " genre=\""+genre+"\"" +
261 321 mdecorde
                                                        " forme=\""+forme+"\"" +
262 321 mdecorde
                                                        " dialecte=\""+dialecte+"\"" +
263 321 mdecorde
                                                        " base=\""+base+"\"" +
264 321 mdecorde
                                                        " project=\""+project+"\">\n");
265 321 mdecorde
                                                        captureword=true;
266 321 mdecorde
                                                        break;
267 321 mdecorde
268 321 mdecorde
                                                        case "div":
269 321 mdecorde
                                                        //output.write("<div type=\""+parser.getAttributeValue(null,"type")+"\">\n");
270 321 mdecorde
                                                        break;
271 321 mdecorde
272 321 mdecorde
                                                        case "p":
273 321 mdecorde
                                                        output.write("<p n=\""+(p_id++)+"\">\n");
274 321 mdecorde
                                                        break;
275 321 mdecorde
                                                        case "ab":
276 321 mdecorde
                                                        output.write("<ab n=\""+ab_id+++"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
277 321 mdecorde
                                                        break;
278 321 mdecorde
                                                        case "q":
279 321 mdecorde
                                                        output.write("<q n=\""+(q_id++)+"\">\n");
280 321 mdecorde
                                                        break;
281 321 mdecorde
                                                        case "front":
282 321 mdecorde
                                                        output.write("<front n=\""+(front_id++)+"\">\n");
283 321 mdecorde
                                                        break;
284 321 mdecorde
                                                        case "body":
285 321 mdecorde
                                                        output.write("<body n=\""+(body_id++)+"\">\n");
286 321 mdecorde
                                                        break;
287 321 mdecorde
                                                        case "back":
288 321 mdecorde
                                                        output.write("<back n=\""+(back_id++)+"\">\n");
289 321 mdecorde
                                                        break;
290 321 mdecorde
                                                        case "lb":
291 321 mdecorde
                                                        String n = parser.getAttributeValue(null,"n")
292 321 mdecorde
                                                        if(n != null)
293 321 mdecorde
                                                                lb_id = n;
294 321 mdecorde
                                                        else
295 321 mdecorde
                                                                lb_id =""+lb_count++;
296 321 mdecorde
                                                        break;
297 321 mdecorde
                                                        case "pb":
298 321 mdecorde
                                                        String n = parser.getAttributeValue(null,"n");
299 321 mdecorde
                                                        if(n != null)
300 321 mdecorde
                                                                pb_id = n;
301 321 mdecorde
                                                        else
302 321 mdecorde
                                                                pb_id = ""+(pb_count++);
303 321 mdecorde
304 321 mdecorde
                                                        break;
305 321 mdecorde
306 321 mdecorde
                                                        case "choice":
307 321 mdecorde
                                                        flagchoice = true;
308 321 mdecorde
                                                        break;
309 321 mdecorde
                                                        case "corr":
310 321 mdecorde
                                                        flagcorr = true;
311 321 mdecorde
                                                        vCorr= "";
312 321 mdecorde
                                                        break;
313 321 mdecorde
                                                        case "reg":
314 321 mdecorde
                                                        flagreg = true;
315 321 mdecorde
                                                        vReg= "";
316 321 mdecorde
                                                        break;
317 321 mdecorde
                                                        case "expan":
318 321 mdecorde
                                                        flagexpan = true;
319 321 mdecorde
                                                        vExpan= "";
320 321 mdecorde
                                                        break;
321 321 mdecorde
                                                        case "orig":
322 321 mdecorde
                                                        flagreg = true;
323 321 mdecorde
                                                        vOrig= "";
324 321 mdecorde
                                                        break;
325 321 mdecorde
                                                        case "sic":
326 321 mdecorde
                                                        flagsic = true;
327 321 mdecorde
                                                        vSic= "";
328 321 mdecorde
                                                        break;
329 321 mdecorde
                                                        case "abbr":
330 321 mdecorde
                                                        flagreg = true;
331 321 mdecorde
                                                        vAbbr= "";
332 321 mdecorde
                                                        break;
333 321 mdecorde
334 321 mdecorde
                                                        case "w":
335 321 mdecorde
                                                        for(int i = 0 ; i < parser.getAttributeCount(); i++)
336 321 mdecorde
                                                                if(parser.getAttributeLocalName(i).equals("id"))
337 321 mdecorde
                                                                {
338 321 mdecorde
                                                                        wordid = parser.getAttributeValue(i);
339 321 mdecorde
                                                                        break;
340 321 mdecorde
                                                                }
341 321 mdecorde
                                                        vAna = "";
342 321 mdecorde
                                                        vForm = "";
343 321 mdecorde
                                                        flagWord = true;
344 321 mdecorde
                                                        break;
345 321 mdecorde
                                                        case "form":
346 321 mdecorde
                                                        flagForm = true;
347 321 mdecorde
                                                        vForm = "";
348 321 mdecorde
                                                        vAna ="";
349 321 mdecorde
                                                        break;
350 321 mdecorde
351 321 mdecorde
                                                        case "ana":
352 321 mdecorde
                                                        flagAna = true;
353 321 mdecorde
                                                        break;
354 321 mdecorde
                                                }
355 321 mdecorde
                                                break;
356 321 mdecorde
357 321 mdecorde
                                        case XMLStreamConstants.END_ELEMENT:
358 321 mdecorde
                                                switch (parser.getLocalName())
359 321 mdecorde
                                                {
360 321 mdecorde
                                                        case "div":
361 321 mdecorde
                                                        break;
362 321 mdecorde
363 321 mdecorde
                                                        case "text":
364 321 mdecorde
                                                        output.write("</text>\n");
365 321 mdecorde
                                                        captureword=false;
366 321 mdecorde
                                                        break;
367 321 mdecorde
                                                        case "p":
368 321 mdecorde
                                                        output.write("</p>\n");
369 321 mdecorde
                                                        break;
370 321 mdecorde
                                                        case "ab":
371 321 mdecorde
                                                        output.write("</ab>\n");
372 321 mdecorde
                                                        break;
373 321 mdecorde
                                                        case "q":
374 321 mdecorde
                                                        output.write("</q>\n");
375 321 mdecorde
                                                        break;
376 321 mdecorde
                                                        case "front":
377 321 mdecorde
                                                        output.write("</front>\n");
378 321 mdecorde
                                                        break;
379 321 mdecorde
                                                        case "body":
380 321 mdecorde
                                                        output.write("</body>\n");
381 321 mdecorde
                                                        break;
382 321 mdecorde
                                                        case "back":
383 321 mdecorde
                                                        output.write("</back>\n");
384 321 mdecorde
                                                        break;
385 321 mdecorde
386 321 mdecorde
                                                        case "fw":
387 321 mdecorde
                                                        flagfw = false;
388 321 mdecorde
                                                        break;
389 321 mdecorde
                                                        case "choice":
390 321 mdecorde
391 321 mdecorde
                                                                if(vCorr != "")
392 321 mdecorde
                                                                {
393 321 mdecorde
                                                                        //System.out.println(vCorr+" >> write corr "+vForm +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr)
394 321 mdecorde
                                                                        output.write( vCorr +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\n");
395 321 mdecorde
                                                                }
396 321 mdecorde
                                                                else if(vReg != "")
397 321 mdecorde
                                                                {
398 321 mdecorde
                                                                        //System.out.println("write reg "+vForm)
399 321 mdecorde
                                                                        output.write( vReg +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\n");
400 321 mdecorde
                                                                }
401 321 mdecorde
                                                                else if(vExpan != "")
402 321 mdecorde
                                                                {
403 321 mdecorde
                                                                        //System.out.println("write expan "+vForm)
404 321 mdecorde
                                                                        output.write( vExpan +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\n");
405 321 mdecorde
                                                                }
406 321 mdecorde
                                                        flagchoice = false;
407 321 mdecorde
                                                        vCorr= "";
408 321 mdecorde
                                                        vSic= "";
409 321 mdecorde
                                                        break;
410 321 mdecorde
                                                        case "corr":
411 321 mdecorde
                                                        flagcorr = false;
412 321 mdecorde
413 321 mdecorde
                                                        break;
414 321 mdecorde
                                                        case "reg":
415 321 mdecorde
                                                        flagreg = false;
416 321 mdecorde
                                                        vReg = "";
417 321 mdecorde
                                                        break;
418 321 mdecorde
                                                        case "expan":
419 321 mdecorde
                                                        flagexpan = false;
420 321 mdecorde
                                                        vExpan= "";
421 321 mdecorde
                                                        break;
422 321 mdecorde
                                                        case "orig":
423 321 mdecorde
                                                        flagreg = false;
424 321 mdecorde
                                                        vOrig= "";
425 321 mdecorde
                                                        break;
426 321 mdecorde
                                                        case "sic":
427 321 mdecorde
                                                        flagsic = false;
428 321 mdecorde
429 321 mdecorde
                                                        break;
430 321 mdecorde
                                                        case "abbr":
431 321 mdecorde
                                                        flagreg = false;
432 321 mdecorde
                                                        vAbbr= "";
433 321 mdecorde
                                                        break;
434 321 mdecorde
435 321 mdecorde
                                                        case "w":
436 321 mdecorde
                                                        if(vAna != null)
437 321 mdecorde
                                                                if(captureword)
438 321 mdecorde
                                                                {
439 321 mdecorde
                                                                        if(flagchoice)
440 321 mdecorde
                                                                        {
441 321 mdecorde
442 321 mdecorde
                                                                        }
443 321 mdecorde
                                                                        else if(flagfw)
444 321 mdecorde
                                                                        {
445 321 mdecorde
446 321 mdecorde
                                                                        }
447 321 mdecorde
                                                                        else
448 321 mdecorde
                                                                        {
449 321 mdecorde
                                                                                output.write( vForm +"\t"+vAna+wordid+"\t"+pb_id+"\t"+lb_id+"\n");
450 321 mdecorde
                                                                        }
451 321 mdecorde
452 321 mdecorde
                                                                }
453 321 mdecorde
454 321 mdecorde
                                                        flagWord = false;
455 321 mdecorde
                                                        break;
456 321 mdecorde
457 321 mdecorde
                                                        case "form":
458 321 mdecorde
                                                        flagForm = false;
459 321 mdecorde
                                                        break;
460 321 mdecorde
461 321 mdecorde
                                                        case "ana":
462 321 mdecorde
                                                        vAna += "\t";
463 321 mdecorde
                                                        flagAna = false;
464 321 mdecorde
                                                        break;
465 321 mdecorde
                                                }
466 321 mdecorde
                                                break;
467 321 mdecorde
468 321 mdecorde
                                        case XMLStreamConstants.CHARACTERS:
469 321 mdecorde
                                                if(flagAna)
470 321 mdecorde
                                                {
471 321 mdecorde
                                                        vAna += parser.getText().trim();
472 321 mdecorde
                                                }
473 321 mdecorde
474 321 mdecorde
                                                if(flagForm)
475 321 mdecorde
                                                {
476 321 mdecorde
                                                        vForm += parser.getText().trim();
477 321 mdecorde
                                                        if(flagchoice)
478 321 mdecorde
                                                        {
479 321 mdecorde
                                                                if(flagsic)
480 321 mdecorde
                                                                {
481 321 mdecorde
                                                                        vSic += parser.getText().trim();
482 321 mdecorde
                                                                }
483 321 mdecorde
                                                                if(flagorig)
484 321 mdecorde
                                                                {
485 321 mdecorde
                                                                        vOrig += parser.getText().trim();
486 321 mdecorde
                                                                }
487 321 mdecorde
                                                                if(flagabbr)
488 321 mdecorde
                                                                {
489 321 mdecorde
                                                                        vAbbr += parser.getText().trim();
490 321 mdecorde
                                                                }
491 321 mdecorde
                                                                if(flagcorr)
492 321 mdecorde
                                                                {
493 321 mdecorde
                                                                        vCorr += parser.getText().trim();
494 321 mdecorde
                                                                }
495 321 mdecorde
                                                        }
496 321 mdecorde
                                                }
497 321 mdecorde
                                }
498 321 mdecorde
                        }
499 321 mdecorde
                        output.write("</txmcorpus>");
500 321 mdecorde
                        output.close();
501 321 mdecorde
                        parser.close();
502 321 mdecorde
                }
503 321 mdecorde
                catch (XMLStreamException ex) {
504 321 mdecorde
                        System.out.println(ex);
505 321 mdecorde
                }
506 321 mdecorde
                catch (IOException ex) {
507 321 mdecorde
                        System.out.println("IOException while parsing " + inputData);
508 321 mdecorde
                }
509 321 mdecorde
510 321 mdecorde
                return true;
511 321 mdecorde
        }
512 321 mdecorde
513 321 mdecorde
514 321 mdecorde
515 321 mdecorde
        /**
         * Builds the "bfm3tt" CWB corpus from the TXM files of a binary directory.
         *
         * Steps, all relative to {@code rootDirFile}:
         * <ol>
         * <li>resolve the CWB binaries location (read from the Toolbox parameters when
         * {@code cwbLoc} is not set) and check that it and the root directory exist;</li>
         * <li>list the files of the "txm" sub-directory;</li>
         * <li>recreate the "wtc" and "data" directories and create "registry";</li>
         * <li>call {@code transfomFileWtc} on each listed file, targeting "wtc/bfm3tt.wtc";</li>
         * <li>run cwb-encode, then cwb-makeall, on the result.</li>
         * </ol>
         *
         * @param rootDirFile the binary directory; it must contain a "txm" sub-directory
         * @return false when the CWB path, the root directory or the "txm" directory
         *         cannot be used, or when cwb-encode/cwb-makeall throws; true otherwise
         */
        public boolean run(File rootDirFile)
        {
                String rootDir = rootDirFile.getAbsolutePath();

                if (cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if (!new File(cwbLoc).exists())
                {
                        println ("CWB path error: "+cwbLoc)
                        return false;
                }
                if (!new File(rootDir).exists())
                {
                        println ("binary directory does not exists: "+rootDir)
                        return false;
                }

                // listFiles() returns null when the directory is missing or unreadable.
                // Fail here, before the cleanup below wipes the previous "wtc" and "data".
                File txmDir = new File(rootDirFile, "txm");
                File[] files = txmDir.listFiles();
                if (files == null)
                {
                        println ("txm directory error: "+txmDir)
                        return false;
                }
                // listFiles() gives no ordering guarantee; sort so that the order of the
                // texts in the corpus does not depend on the platform.
                Arrays.sort(files);

                // cleaning & preparing
                new File(rootDir+"/wtc/","bfm3tt.wtc").delete();
                new File(rootDir,"/wtc/").deleteDir();
                new File(rootDir,"/wtc/").mkdir();
                new File(rootDir,"/data/").deleteDir();
                new File(rootDir,"/data/").mkdir();
                new File(rootDir,"registry/").mkdir();

                //1- Transform into WTC file
                for (File f : files)
                {
                        if (!f.exists())
                        {
                                println("file "+f+ " does not exists")
                                continue;
                        }

                        println("process file "+f)
                        // NOTE(review): File.toURL() is deprecated and does not escape characters
                        // that are illegal in URLs; f.toURI().toURL() is the recommended form, but
                        // confirm how the compiler constructor consumes the URL before switching.
                        def builder = new compiler(f.toURL(),f.getName(),"bfm3tt","default");
                        builder.setLang(lang);
                        builder.transfomFileWtc(rootDir+"/wtc","bfm3tt.wtc");
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);
                String[] pAttributes = ["pos","id","pb","lb"];
                String[] sAttributes = ["txmcorpus:0+lang","text:0+id+titre+auteur+datecompo+ssiecle+domaine+genre+forme+dialecte+base+project","front:0+n","body:0+n","ab:0+n+rend","div:0+id+type","q:1+n","p:0+n","back:0+n"];
                try
                {
                        cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(), rootDir + "/data", rootDir + "/wtc/"+"bfm3tt.wtc", rootDir + "/registry/"+"bfm3tt",pAttributes, sAttributes);
                        cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), "BFM3TT", rootDir + "/registry");
                }
                catch (Exception ex)
                {
                        // any failure of the CWB tools aborts the import
                        System.out.println(ex);
                        return false;
                }

                System.out.println("Done.")

                return true;
        }
584 321 mdecorde
585 321 mdecorde
        /**
         * Turns debug mode on by raising the {@code debug} flag of this compiler.
         * This method only ever sets the flag to true.
         */
        public void setDebug()
        {
                debug = true
        }
592 321 mdecorde
593 321 mdecorde
        /**
         * Development entry point: runs the compiler in debug mode on the
         * "xml/bfm3tt" directory of the current user's home, using the CWB
         * binaries found in "TXM/cwb/bin" under that same home directory.
         *
         * @param args the command line arguments (not used)
         */
        public static void main(String[] args)
        {
                // The JVM never expands "~" (that is a shell feature), so the home
                // directory is resolved explicitly from the "user.home" property.
                File home = new File(System.getProperty("user.home"));
                File dir = new File(home, "xml/bfm3tt");
                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getAbsolutePath());
                c.run(dir);
        }
606 321 mdecorde
}