Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xml / compiler.groovy @ 1688

History | View | Annotate | Download (20 kB)

1 321 mdecorde
2 321 mdecorde
3 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
4 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
5 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
6 321 mdecorde
// Sophia Antipolis, University of Paris 3.
7 321 mdecorde
//
8 321 mdecorde
// The TXM platform is free software: you can redistribute it
9 321 mdecorde
// and/or modify it under the terms of the GNU General Public
10 321 mdecorde
// License as published by the Free Software Foundation,
11 321 mdecorde
// either version 2 of the License, or (at your option) any
12 321 mdecorde
// later version.
13 321 mdecorde
//
14 321 mdecorde
// The TXM platform is distributed in the hope that it will be
15 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
16 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17 321 mdecorde
// PURPOSE. See the GNU General Public License for more
18 321 mdecorde
// details.
19 321 mdecorde
//
20 321 mdecorde
// You should have received a copy of the GNU General
21 321 mdecorde
// Public License along with the TXM platform. If not, see
22 321 mdecorde
// http://www.gnu.org/licenses.
23 321 mdecorde
//
24 321 mdecorde
//
25 321 mdecorde
//
26 479 mdecorde
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
27 321 mdecorde
// $LastChangedRevision: 3219 $
28 321 mdecorde
// $LastChangedBy: mdecorde $
29 321 mdecorde
//
30 321 mdecorde
31 321 mdecorde
32 986 mdecorde
package org.txm.scripts.importer.xml;
33 321 mdecorde
34 321 mdecorde
import java.util.ArrayList
35 321 mdecorde
import java.util.Collections
36 1000 mdecorde
import org.txm.importer.cwb.BuildCwbEncodeArgs
37 1000 mdecorde
import org.txm.importer.cwb.CwbEncode
38 1000 mdecorde
import org.txm.importer.cwb.CwbMakeAll
39 986 mdecorde
import org.txm.scripts.importer.*
40 321 mdecorde
import org.txm.scripts.*
41 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*
42 321 mdecorde
import org.txm.utils.treetagger.TreeTagger
43 1110 mdecorde
import org.txm.objects.*
44 321 mdecorde
import javax.xml.stream.*
45 321 mdecorde
46 321 mdecorde
import java.net.URL
47 321 mdecorde
import java.io.File
48 321 mdecorde
import java.util.HashMap
49 321 mdecorde
import java.util.List
50 321 mdecorde
import java.util.HashMap
51 321 mdecorde
import java.util.HashSet
52 321 mdecorde
import org.txm.metadatas.*
53 1613 mdecorde
import org.txm.utils.ConsoleProgressBar
54 479 mdecorde
import org.txm.utils.io.FileCopy
55 1110 mdecorde
import org.txm.searchengine.cqp.corpus.*
56 321 mdecorde
57 321 mdecorde
/**
58 321 mdecorde
 * The "compiler" Class of the XML/w import module.
59 321 mdecorde
 */
60 321 mdecorde
class compiler {
61 321 mdecorde
62 321 mdecorde
        /** The debug. */
63 321 mdecorde
        private boolean debug= false;
64 321 mdecorde
65 321 mdecorde
        /** The input data. */
66 321 mdecorde
        private def inputData;
67 321 mdecorde
68 321 mdecorde
        /** The factory. */
69 321 mdecorde
        private def factory;
70 321 mdecorde
71 321 mdecorde
        /** The parser. */
72 321 mdecorde
        private XMLStreamReader parser;
73 321 mdecorde
74 321 mdecorde
        /** The dir. */
75 321 mdecorde
        private def dir;
76 321 mdecorde
77 321 mdecorde
        /** The output. */
78 321 mdecorde
        private def output;
79 321 mdecorde
80 321 mdecorde
        /** The url. */
81 321 mdecorde
        private def url;
82 321 mdecorde
83 321 mdecorde
        /** The anatypes. */
84 321 mdecorde
        private static anatypes = []
85 321 mdecorde
        private static anavalues = [:]
86 321 mdecorde
87 321 mdecorde
        /** The anahash. */
88 321 mdecorde
        private HashMap<String, String> anahash = new HashMap<String, String>() ;
89 321 mdecorde
90 321 mdecorde
        private static SAttributesListener sattrsListener;
91 321 mdecorde
        private static HashMap<String, ArrayList<String>> structs;
92 321 mdecorde
        private static HashMap<String, Integer> structsProf;
93 321 mdecorde
94 321 mdecorde
        /** The text. */
95 321 mdecorde
        String text="";
96 321 mdecorde
97 321 mdecorde
        /** The base. */
98 321 mdecorde
        String base="";
99 321 mdecorde
100 321 mdecorde
        /** The text attributes. */
101 321 mdecorde
        String[] textAttributes = null;
102 321 mdecorde
103 321 mdecorde
        /** The lang. */
104 321 mdecorde
        private String lang ="fr";
105 321 mdecorde
106 321 mdecorde
        public static sortMetadata = null;
107 321 mdecorde
        public static normalizeMetadata = false;
108 321 mdecorde
109 321 mdecorde
        /**
         * Creates a compiler that is not bound to any input document.
         *
         * Nothing is opened here: every field keeps the value given at its
         * declaration and no XML reader is created.
         */
        public compiler() {
        }
114 321 mdecorde
115 321 mdecorde
        /**
         * Sets the two import options.
         *
         * Both options are stored in static fields, so the values set here are
         * seen by every instance of this class, not only by the receiver.
         *
         * @param sortmetadata name of the text metadata used to order the source
         *        files; when null the files are sorted with their natural order
         * @param normalizemetadata when true, attribute values and ana values are
         *        lower-cased before being written
         */
        public void setOptions(String sortmetadata, boolean normalizemetadata) {
                normalizeMetadata = normalizemetadata
                sortMetadata = sortmetadata
        }
120 321 mdecorde
121 321 mdecorde
        /**
         * Instantiates a new compiler reading the XML document found at the given URL.
         *
         * The URL is opened, a StAX reader is created on the stream and that reader
         * is handed to the shared SAttributesListener (created on first use,
         * restarted with the new reader afterwards).
         *
         * Failures are reported on the console and are not rethrown: after a
         * failure the instance exists but its reader may be missing.
         *
         * @param url the location of the XML file to read
         * @param text the text identifier, later written as the "id" attribute of the text structure
         * @param base the value later written as the "base" attribute of the text structure
         * @param projectName not used by this constructor
         */
        public compiler(URL url, String text, String base, String projectName)
        {
                this.text = text
                this.base = base;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        if (sattrsListener == null)
                                sattrsListener = new SAttributesListener(parser);
                        else
                                sattrsListener.start(parser)

                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // the stream was opened before the failure: release it instead of leaking it
                        if (inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                        // nothing more can be done, the original failure was already reported
                                }
                        }
                } catch (IOException ex) {
                        System.err.println("IOException while parsing "+url+": "+ex);
                }
        }
152 321 mdecorde
153 321 mdecorde
        /**
         * Sets the language of the corpus.
         *
         * @param lang the language code to store (the field defaults to "fr")
         * @return the value that was just stored; the method has no declared
         *         return type, so the caller receives the assigned value
         */
        public setLang(String lang) {
                this.lang = lang
                return lang
        }
163 321 mdecorde
164 321 mdecorde
        /** The annotation success. */
165 321 mdecorde
        boolean annotationSuccess = false;
166 321 mdecorde
167 321 mdecorde
        /**
         * Records whether the annotation step succeeded.
         *
         * @param val the new value of the annotationSuccess flag
         */
        public void setAnnotationSuccess(boolean val) {
                annotationSuccess = val
        }
176 321 mdecorde
177 321 mdecorde
        /**
         * Opens the given file for writing in UTF-8 and stores the writer in the
         * output field.
         *
         * When the file already exists it is opened in append mode, otherwise it
         * is created. Any failure is printed on the error stream.
         *
         * @param f the file to write to
         * @return true if the writer was created, false if opening the file failed
         */
        private boolean createOutput(File f) {
                try {
                        // append to an existing file, start a new one otherwise
                        boolean append = f.exists()
                        def fileStream = new FileOutputStream(f, append)
                        def buffered = new BufferedOutputStream(fileStream)
                        output = new OutputStreamWriter(buffered, "UTF-8")
                } catch (Exception e) {
                        System.err.println(e)
                        return false
                }
                return true
        }
193 321 mdecorde
194 321 mdecorde
        /**
         * Advances the reader until the closing tag of the "teiHeader" element
         * has been consumed, or until the end of the document when there is none.
         */
        private void GoToText() {
                int event = parser.next()
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean closesHeader = event == XMLStreamConstants.END_ELEMENT && "teiHeader".equals(parser.getLocalName())
                        if (closesHeader) {
                                return
                        }
                        event = parser.next()
                }
        }
205 321 mdecorde
206 321 mdecorde
        /**
         * Writes the CQP rendering of the document read by this compiler into the
         * given file (appending to it when it already exists).
         *
         * What is written:
         * - the "text" element becomes a text tag carrying the id, base and project
         *   attributes followed by the attributes found in the source (except "id");
         * - each "w" element becomes one line: the form, a tab, the word id, then
         *   one tab-prefixed column per known ana type (empty when the word has no
         *   value for that type);
         * - "form" and "ana" elements only feed the current word;
         * - any other element is copied as an opening and a closing tag with
         *   lower-cased names; an element without attributes receives an "n"
         *   attribute counting the previous occurrences of that tag name;
         * - once a "tei" element has been seen, elements located before the
         *   "text" element are ignored.
         *
         * The enclosing corpus element is not written by this method.
         *
         * On failure the source file is copied into a "compiler-error" directory
         * next to the CQP file and false is returned. The writer, the reader and
         * the input stream are released in every case.
         *
         * @param project the project whose name is written as the "project" attribute
         * @param cqpFile the CQP file to write to
         * @param textmetadata currently not read by this method
         * @return true, if successful
         */
        public boolean transfomFileCqp(Project project, File cqpFile, HashMap<String, String> textmetadata)
        {
                if (!createOutput(cqpFile))
                        return false;

                String vAna = "";
                String vForm = "";
                String wordid= "";

                def ncounts = [:] // contains the n values per tags with no attribute

                boolean flagForm = false; // true while inside a "form" element
                boolean flagAna = false; // true while inside an "ana" element

                String anatype = "";
                String anavalue = "";
                boolean foundtei = false;
                boolean foundtext = false;
                try {
                        String localname;
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName().toLowerCase();
                                                if ("tei".equals(localname)) foundtei = true;
                                                switch (localname) {
                                                        case "text":
                                                                sattrsListener.startElement(localname);
                                                                foundtext = true;
                                                                output.write("<text id=\""+text+"\" base=\""+base+"\"" + " project=\""+project.getName()+"\"");
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        String attrname = parser.getAttributeLocalName(i);
                                                                        String attrvalue = parser.getAttributeValue(i).replaceAll("\"", "&quot;")
                                                                        if (normalizeMetadata)
                                                                                attrvalue = attrvalue.toLowerCase();
                                                                        // Groovy compares Strings by value here; the id is already written above
                                                                        if (attrname != "id")
                                                                                output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+"\"")
                                                                }
                                                                output.write(">\n");
                                                                break;

                                                        case "w":
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                                                        if (parser.getAttributeLocalName(i).equals("id")) {
                                                                                wordid = parser.getAttributeValue(i);
                                                                        }
                                                                }
                                                                anavalues = [:]; // forget the values of the previous word
                                                                break;

                                                        case "form":
                                                                flagForm = true;
                                                                vForm = "";
                                                                vAna ="";
                                                                break;

                                                        case "ana":
                                                                flagAna = true;
                                                                anavalue = "";
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                                                        if ("type".equals(parser.getAttributeLocalName(i))) {
                                                                                anatype = parser.getAttributeValue(i).substring(1);//remove the #
                                                                                break; // leaves the attribute loop only
                                                                        }
                                                                }
                                                                break;

                                                        default:
                                                                // skip what comes before the text element of a TEI document
                                                                if (foundtei && !foundtext) break;

                                                                sattrsListener.startElement(localname);
                                                                output.write("<"+localname);

                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        String attrname = parser.getAttributeLocalName(i);
                                                                        String attrvalue = parser.getAttributeValue(i)
                                                                        if (normalizeMetadata)
                                                                                attrvalue = attrvalue.toLowerCase();
                                                                        output.write(" "+attrname.toLowerCase()+"=\""+attrvalue.replaceAll("\"", "&quot;")+"\"")
                                                                }
                                                                if (parser.getAttributeCount() == 0) { // add the n attribute
                                                                        if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
                                                                        int ncount = ncounts.get(localname);
                                                                        ncounts.put(localname, ncount+1);
                                                                        output.write(" n=\""+ncount+"\"")
                                                                }
                                                                output.write(">\n");
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName().toLowerCase();
                                                switch (localname) {
                                                        case "w":
                                                                // one column per known ana type, left empty when the word has no value for it
                                                                for (String type : anatypes) {
                                                                        def v = anavalues.get(type);
                                                                        if (v != null) vAna +="\t"+v;
                                                                        else vAna +="\t";
                                                                }
                                                                vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
                                                                output.write(vForm+"\t"+wordid+vAna+"\n");
                                                                vAna = "";
                                                                vForm = "";
                                                                break;

                                                        case "tei":
                                                                break;
                                                        case "form":
                                                                flagForm = false;
                                                                break;
                                                        case "ana":
                                                                anavalues.put(anatype, anavalue)
                                                                flagAna = false;
                                                                break;
                                                        default:
                                                                if (foundtei && !foundtext) break;

                                                                sattrsListener.endElement(localname);
                                                                output.write("</"+localname+">\n");
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (flagForm)
                                                        vForm += parser.getText().trim();
                                                if (flagAna) {
                                                        if (normalizeMetadata)
                                                                anavalue += parser.getText().trim().toLowerCase();
                                                        else
                                                                anavalue += parser.getText().trim();
                                                }
                                                break;
                                }
                        }
                        // closed inside the try so that a failing flush is reported as a failure
                        output.close();
                } catch (Exception ex) {
                        System.out.println("Exception while parsing " + inputData+" of Text "+text);
                        System.out.println(ex); // report the cause instead of discarding it
                        File xmlFile = null
                        File errorDir = null
                        try {
                                xmlFile = new File(url.getFile())
                                errorDir = new File(cqpFile.getParentFile(), "compiler-error")
                                // NOTE(review): the message says "Moving" but FileCopy.copy is called - presumably the source file stays in place
                                println "Warning: Moving $xmlFile to $errorDir"
                                errorDir.mkdir();
                                FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
                        } catch(Exception eCopy) {
                                println "Error while moving "+url+" to "+errorDir
                        }
                        return false;
                } finally {
                        // release everything on both the success and the failure path;
                        // a failing close must neither hide the result nor skip the next close
                        try {
                                if (output != null) output.close();
                        } catch (Exception ignored) {
                                // already reported above when it matters
                        }
                        try {
                                if (parser != null) parser.close();
                        } catch (Exception ignored) {
                                // nothing left to read from the parser
                        }
                        try {
                                if (inputData != null) inputData.close();
                        } catch (Exception ignored) {
                                // nothing left to read from the stream
                        }
                }
                return true;
        }
408 321 mdecorde
409 321 mdecorde
        /**
         * Collects the types of the "ana" elements of a file and adds the ones
         * not yet known to the shared anatypes list.
         *
         * The type is the value of the "type" attribute without its first
         * character (the leading #). Note that this method replaces the
         * inputData, factory and parser fields of the instance and leaves the
         * parser and the stream closed when it returns.
         *
         * Parsing and I/O errors are not caught here; the parser and the stream
         * are closed even when such an error occurs.
         *
         * @param xmlFile the XML file to scan
         */
        private void getAnaTypes(File xmlFile) {
                String ana = "ana"
                HashSet<String> types = new HashSet<String>();

                inputData = xmlFile.toURI().toURL().openStream();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        try {
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (event == XMLStreamConstants.START_ELEMENT) { // start elem
                                                if (ana.equals(parser.getLocalName())) { // ana elem
                                                        for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
                                                                if ("type".equals(parser.getAttributeLocalName(i))) { // @type
                                                                        types.add(parser.getAttributeValue(i).substring(1)); //remove the #
                                                                        break;
                                                                }
                                                        }
                                                }
                                        }
                                }
                        } finally {
                                parser.close();
                        }
                } finally {
                        // closing the parser does not have to close the stream it reads from
                        inputData.close();
                }

                for (String type : types)
                        if (!anatypes.contains(type))
                                anatypes << type
        }
435 321 mdecorde
436 321 mdecorde
        /**
437 321 mdecorde
         * Run.
438 321 mdecorde
         *
439 321 mdecorde
         * @param rootDirFile the root dir file
440 321 mdecorde
         * @param basename the basename
441 321 mdecorde
         * @param textAttributes the text attributes
442 321 mdecorde
         * @param srcfiles the srcfiles
443 321 mdecorde
         * @return true, if successful
444 321 mdecorde
         */
445 1110 mdecorde
        public boolean run(Project project, File binDir, File txmDir, String corpusname, String[] textAttributes, def srcfiles, Metadatas metadatas)
446 321 mdecorde
        {
447 321 mdecorde
                sattrsListener = null; // reset SAttribute Listener for each new import
448 321 mdecorde
                String rootDir = binDir.getAbsolutePath();
449 321 mdecorde
                anatypes = [] // reset
450 321 mdecorde
                anavalues = [:] // reset
451 714 mdecorde
452 714 mdecorde
                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
453 1110 mdecorde
                        println ("Error: CWB executables rights are not well setted.")
454 321 mdecorde
                        return false;
455 321 mdecorde
                }
456 1110 mdecorde
457 1110 mdecorde
                CorpusBuild corpus = project.getCorpusBuild(project.getName());
458 1110 mdecorde
                if (corpus != null) {
459 1110 mdecorde
                        //println "CLEAN PREVIOUS CORPUS"
460 1110 mdecorde
                        corpus.delete(); // remove old files
461 321 mdecorde
                }
462 1110 mdecorde
463 1110 mdecorde
                // make new one
464 1110 mdecorde
                corpus = new MainCorpus(project);
465 1110 mdecorde
                corpus.setID(project.getName());
466 1110 mdecorde
                corpus.setName(project.getName());
467 1110 mdecorde
                corpus.setDescription("Built with the XML/w import module");
468 1110 mdecorde
469 1110 mdecorde
                File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
470 1395 mdecorde
cqpFile.delete()
471 1395 mdecorde
472 1110 mdecorde
                new File(binDir,"cqp").mkdirs()
473 1110 mdecorde
                new File(binDir,"data").mkdirs()
474 1110 mdecorde
                new File(binDir,"registry").mkdirs()
475 321 mdecorde
476 321 mdecorde
                String textid = ""
477 321 mdecorde
                int counttext = 0
478 1110 mdecorde
                List<File> files = txmDir.listFiles(new FileFilter() {
479 1110 mdecorde
                        public boolean accept(File f) {
480 1110 mdecorde
                                return !f.isDirectory() && !f.isHidden() && f.getName().endsWith(".xml");
481 1110 mdecorde
                        }
482 1110 mdecorde
                });
483 804 mdecorde
                //1- Transform into CQP file
484 321 mdecorde
                def builder = null
485 321 mdecorde
486 321 mdecorde
                //start corpus
487 803 mdecorde
                if (createOutput(cqpFile)) {
488 321 mdecorde
                        output.write("<txmcorpus lang=\""+lang+"\">\n")
489 321 mdecorde
                        output.close()
490 321 mdecorde
                }
491 321 mdecorde
492 321 mdecorde
                // sort files
493 321 mdecorde
                if (sortMetadata == null) {
494 321 mdecorde
                        Collections.sort(files)
495 321 mdecorde
                } else {
496 321 mdecorde
                        HashMap<File, String> sortmetadatavalues = new HashMap<File, String>()
497 321 mdecorde
                        for (File f : files) {
498 321 mdecorde
                                String value = MetadataGetter.get(f,"text", sortMetadata)
499 321 mdecorde
                                sortmetadatavalues.put(f, value)
500 321 mdecorde
                        }
501 321 mdecorde
                        println "sort properties value: "+sortmetadatavalues
502 321 mdecorde
                        Collections.sort(files, new Comparator<File>() {
503 321 mdecorde
                                /**
504 321 mdecorde
                                 * Compare.
505 321 mdecorde
                                 *
506 321 mdecorde
                                 * @param o1 the o1
507 321 mdecorde
                                 * @param o2 the o2
508 321 mdecorde
                                 * @return the int
509 321 mdecorde
                                 */
510 321 mdecorde
                                                public int compare(Object o1, Object o2) {
511 321 mdecorde
                                                        String v1 = sortmetadatavalues.get((File)o1)
512 321 mdecorde
                                                        String v2 = sortmetadatavalues.get((File)o2)
513 321 mdecorde
                                                        if (v1 == null || v2 == null) return 0;
514 321 mdecorde
                                                        return v1.compareTo(v2)
515 321 mdecorde
                                                }
516 321 mdecorde
                                        });
517 321 mdecorde
                }
518 321 mdecorde
519 321 mdecorde
                // get all anatypes
520 321 mdecorde
                for (File f : files) {
521 321 mdecorde
                        getAnaTypes(f)
522 321 mdecorde
                }
523 321 mdecorde
524 1613 mdecorde
                println("Compiling "+files.size()+" files ")
525 1613 mdecorde
                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
526 321 mdecorde
                for (File f : files) {
527 1613 mdecorde
                        cpb.tick()
528 321 mdecorde
                        HashMap<String, String> textmetadata;
529 321 mdecorde
                        if (metadatas != null)
530 321 mdecorde
                                textmetadata = metadatas.getTextMetadata(f)
531 321 mdecorde
                        else
532 321 mdecorde
                                textmetadata = [:]
533 321 mdecorde
534 321 mdecorde
                        counttext++;
535 321 mdecorde
                        if (!f.exists()) {
536 321 mdecorde
                                println("file "+f+ " does not exists")
537 321 mdecorde
                        } else {
538 321 mdecorde
                                String txtname = f.getName().substring(0,f.getName().length()-4)
539 321 mdecorde
                                builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default")
540 321 mdecorde
                                builder.setLang(lang);
541 1110 mdecorde
                                if (!builder.transfomFileCqp(project, cqpFile, textmetadata)) {
542 321 mdecorde
                                        println("Failed to compile "+f)
543 321 mdecorde
                                }
544 321 mdecorde
                        }
545 321 mdecorde
                }
546 1613 mdecorde
                cpb.done()
547 1613 mdecorde
548 321 mdecorde
                //end corpus
549 803 mdecorde
                if (createOutput(cqpFile)) {
550 321 mdecorde
                        output.write("</txmcorpus>\n")
551 321 mdecorde
                        output.close()
552 321 mdecorde
                }
553 321 mdecorde
                println ""
554 321 mdecorde
                //2- Import into CWB
555 321 mdecorde
                def outDir = rootDir
556 321 mdecorde
557 321 mdecorde
                CwbEncode cwbEn = new CwbEncode()
558 321 mdecorde
                cwbEn.setDebug(debug)
559 321 mdecorde
                CwbMakeAll cwbMa = new CwbMakeAll()
560 321 mdecorde
                cwbMa.setDebug(debug)
561 321 mdecorde
562 321 mdecorde
                List<String> pargs = []
563 321 mdecorde
                pargs.add("id")
564 321 mdecorde
                for (String ana : anatypes)
565 321 mdecorde
                        pargs.add(ana)
566 321 mdecorde
567 321 mdecorde
                String[] pAttrs = pargs
568 321 mdecorde
569 321 mdecorde
                structs = sattrsListener.getStructs()
570 321 mdecorde
                structsProf = sattrsListener.getProfs()
571 321 mdecorde
572 321 mdecorde
                if (debug) {
573 321 mdecorde
                        println structs
574 321 mdecorde
                        println structsProf
575 321 mdecorde
                }
576 321 mdecorde
577 321 mdecorde
                List<String> sargs = new ArrayList<String>()
578 321 mdecorde
                def tmpTextAttrs = []
579 321 mdecorde
                for (String name : structs.keySet()) {
580 321 mdecorde
                        if (name == "text") {
581 321 mdecorde
                                for (String value : structs.get(name)) // append the attributes
582 321 mdecorde
                                        tmpTextAttrs << value // added after
583 321 mdecorde
                                continue;
584 321 mdecorde
                        }
585 321 mdecorde
                        //if ( name == "q") continue; // added after
586 321 mdecorde
                        //if ( name == "foreign") continue; // added after
587 321 mdecorde
                        String concat = name+":"+structsProf.get(name); // append the depth
588 321 mdecorde
                        for (String attributeName : structs.get(name)) // append the attributes
589 321 mdecorde
                                concat += "+"+attributeName.toLowerCase();
590 321 mdecorde
591 321 mdecorde
                        if (structs.get(name).size() == 0) {
592 321 mdecorde
                                concat += "+n";
593 321 mdecorde
                        } else {
594 321 mdecorde
                                if (!structs.get(name).contains("n"))
595 321 mdecorde
                                        concat += "+n"
596 321 mdecorde
                        }
597 321 mdecorde
598 321 mdecorde
                        if ((name == "p" || name == "body" || name == "back" || name == "front")
599 321 mdecorde
                                 && !concat.contains("+n+") && !concat.endsWith("+n"))
600 321 mdecorde
                                concat += "+n"
601 321 mdecorde
602 321 mdecorde
                        sargs.add(concat)
603 321 mdecorde
                }
604 321 mdecorde
605 321 mdecorde
                String textSAttributes = "text:0+id+base+project";
606 321 mdecorde
                for (String name : tmpTextAttrs) {
607 321 mdecorde
                        if (!("id".equals(name) || "base".equals(name) || "project".equals(name)))
608 321 mdecorde
                                textSAttributes += "+"+name.toLowerCase()
609 321 mdecorde
                }
610 321 mdecorde
                //                if (metadataXPath != null) {
611 321 mdecorde
                //                        for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
612 321 mdecorde
                //                                textSAttributes+="+"+meta;
613 321 mdecorde
                //                }
614 321 mdecorde
                sargs.add(textSAttributes)
615 321 mdecorde
                sargs.add("txmcorpus:0+lang")
616 321 mdecorde
617 321 mdecorde
                sargs.sort()
618 321 mdecorde
619 321 mdecorde
                String[] sAttributes = sargs
620 321 mdecorde
                String[] pAttributes = pAttrs
621 321 mdecorde
                println "P-attributes: "+pAttributes
622 321 mdecorde
                println "S-attributes: "+sargs
623 321 mdecorde
624 321 mdecorde
                //if(!annotationSuccess)
625 321 mdecorde
                //pAttributes = ["id"];
626 321 mdecorde
627 321 mdecorde
                //println "PATTRIBUTES : "+pargs;
628 321 mdecorde
                /*
629 321 mdecorde
                 ArrayList<String> wordstag = ["w"];
630 321 mdecorde
                 println "Getting structural attributes..."
631 321 mdecorde
                 BuildCwbEncodeArgs argsgetter = new BuildCwbEncodeArgs();
632 321 mdecorde
                 HashMap<String, HashSet<String>> allStructures = new HashMap<String, HashSet<String>>();
633 321 mdecorde
                 HashMap<String, Integer> allStructuresInclusion = new HashMap<String, Integer>();
634 321 mdecorde
                 for (File srcfile: txmDir.listFiles()) {
635 321 mdecorde
                 if (!(!srcfile.getName().endsWith(".csv") && srcfile.canRead() && !srcfile.isHidden() && !srcfile.isDirectory() && ValidateXml.test(srcfile)))
636 321 mdecorde
                 continue;
637 321 mdecorde
                 print "."
638 321 mdecorde
                 argsgetter.process(srcfile, wordstag);
639 321 mdecorde
                 for (String sattr : argsgetter.getSAttributes()) {
640 321 mdecorde
                 int idx = sattr.indexOf(":");
641 321 mdecorde
                 if(idx < 0 )
642 321 mdecorde
                 continue;
643 321 mdecorde
                 String name = sattr.substring(0, idx);
644 321 mdecorde
                 if (!allStructures.containsKey(name)) {
645 321 mdecorde
                 allStructures.put(name, new HashSet<String>());
646 321 mdecorde
                 allStructuresInclusion.put(name, 0);
647 321 mdecorde
                 }
648 321 mdecorde
                 //println "sattr: "+name
649 321 mdecorde
                 String attrs = sattr.substring(idx+1);
650 321 mdecorde
                 String[] split = attrs.split("\\+");
651 321 mdecorde
                 if (split.length > 0) {
652 321 mdecorde
                 int start = 1;
653 321 mdecorde
                 try {// test if first attr is a number
654 321 mdecorde
                 int n = Integer.parseInt(split[0]);
655 321 mdecorde
                 if (n > allStructuresInclusion.get(name))
656 321 mdecorde
                 allStructuresInclusion.put(name, n);
657 321 mdecorde
                 } catch(Exception e) {start = 0;}
658 321 mdecorde
                 for (int i = start ; i < split.length ; i++)
659 321 mdecorde
                 allStructures.get(name).add(split[i]);
660 321 mdecorde
                 }
661 321 mdecorde
                 }
662 321 mdecorde
                 }
663 321 mdecorde
                 // add structures+properties found in sources
664 321 mdecorde
                 List<String> sargs = new ArrayList<String>();
665 321 mdecorde
                 for (String name : allStructuresInclusion.keySet()) {
666 321 mdecorde
                 String concat = name+":"+allStructuresInclusion.get(name);
667 321 mdecorde
                 for (String value : allStructures.get(name))
668 321 mdecorde
                 concat += "+"+value;
669 321 mdecorde
                 if (name.equals("text")) {
670 321 mdecorde
                 concat += "+base+project"
671 321 mdecorde
                 if (!concat.contains("id"))
672 321 mdecorde
                 concat += "+id";
673 321 mdecorde
                 }
674 321 mdecorde
                 sargs.add(concat);
675 321 mdecorde
                 }*/
676 321 mdecorde
677 321 mdecorde
                //                for (int i = 0 ; i < sargs.size() ; i++) {
678 321 mdecorde
                //                        if (sargs.get(i).startsWith("text:")) {
679 321 mdecorde
                //                                String str = sargs.get(i);
680 321 mdecorde
                //                                sargs.set(i, "text:"+str.substring(6));
681 321 mdecorde
                //                        }
682 321 mdecorde
                //                }
683 321 mdecorde
684 321 mdecorde
                //                String textSAttributes = "text:0+id+base+project";
685 321 mdecorde
                //                if (metadatas != null) {
686 321 mdecorde
                //                        for (String meta : metadatas.getHeadersList()) // text property declarations from metadata.csv
687 321 mdecorde
                //                                textSAttributes+="+"+meta;
688 321 mdecorde
                //                }
689 321 mdecorde
                //sargs.add(textSAttributes)
690 321 mdecorde
                //sargs.add("txmcorpus:0+lang")
691 321 mdecorde
692 321 mdecorde
                //                String[] sAttributes = sargs;
693 321 mdecorde
                //                System.out.println("\nCorpus structures: "+sAttributes);
694 321 mdecorde
                //                System.out.println("corpus word properties: "+pAttributes);
695 321 mdecorde
696 321 mdecorde
                try {
697 321 mdecorde
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase();
698 714 mdecorde
                        cwbEn.run(
699 321 mdecorde
                                outDir + "/data/$corpusname",
700 803 mdecorde
                                outDir + "/cqp/"+corpusname+".cqp",
701 321 mdecorde
                                regPath, pAttributes, sAttributes);
702 321 mdecorde
                        if (!new File(regPath).exists()) {
703 321 mdecorde
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
704 321 mdecorde
                                return false;
705 321 mdecorde
                        }
706 714 mdecorde
                        cwbMa.run(corpusname, outDir + "/registry");
707 321 mdecorde
                } catch (Exception ex) {System.out.println(ex); return false;}
708 321 mdecorde
709 321 mdecorde
                return true;
710 321 mdecorde
        }
711 321 mdecorde
712 321 mdecorde
        /**
         * Turns debug mode on for this compiler instance.
         *
         * Only sets the {@code debug} flag to {@code true}; this method offers
         * no way to switch the flag back off.
         */
        public void setDebug() {
                this.debug = true;
        }
719 321 mdecorde
720 321 mdecorde
        /**
         * Manual test entry point: runs the compiler, with debug enabled, on the
         * "xml/geo" directory located under the current user's home directory,
         * using "geo" as the corpus name.
         *
         * Paths are built from the "user.home" system property because
         * java.io.File does not expand a leading "~" (that is a shell feature);
         * a literal "~/..." path would be resolved relative to the working directory.
         *
         * @param args the command line arguments (not used)
         */
        public static void main(String[] args)
        {
                String userHome = System.getProperty("user.home");
                File dir = new File(userHome, "xml/geo");
                def c = new compiler();
                c.setDebug();
                // NOTE(review): setCwbPath(String) and run(File, String) are not visible
                // in this part of the file - confirm they still exist with these signatures.
                c.setCwbPath(userHome + "/TXM/cwb/bin");
                c.run(dir,"geo");
        }
733 321 mdecorde
}