Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xml / compiler.groovy @ 966

History | View | Annotate | Download (19.2 kB)

1 321 mdecorde
2 321 mdecorde
3 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
4 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
5 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
6 321 mdecorde
// Sophia Antipolis, University of Paris 3.
7 321 mdecorde
//
8 321 mdecorde
// The TXM platform is free software: you can redistribute it
9 321 mdecorde
// and/or modify it under the terms of the GNU General Public
10 321 mdecorde
// License as published by the Free Software Foundation,
11 321 mdecorde
// either version 2 of the License, or (at your option) any
12 321 mdecorde
// later version.
13 321 mdecorde
//
14 321 mdecorde
// The TXM platform is distributed in the hope that it will be
15 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
16 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17 321 mdecorde
// PURPOSE. See the GNU General Public License for more
18 321 mdecorde
// details.
19 321 mdecorde
//
20 321 mdecorde
// You should have received a copy of the GNU General
21 321 mdecorde
// Public License along with the TXM platform. If not, see
22 321 mdecorde
// http://www.gnu.org/licenses.
23 321 mdecorde
//
24 321 mdecorde
//
25 321 mdecorde
//
26 479 mdecorde
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
27 321 mdecorde
// $LastChangedRevision: 3219 $
28 321 mdecorde
// $LastChangedBy: mdecorde $
29 321 mdecorde
//
30 321 mdecorde
31 321 mdecorde
32 321 mdecorde
package org.txm.importer.xml;
33 321 mdecorde
34 321 mdecorde
import java.util.ArrayList
35 321 mdecorde
import java.util.Collections
36 321 mdecorde
import org.txm.importer.cwb.BuildCwbEncodeArgs
37 321 mdecorde
import org.txm.importer.cwb.CwbEncode
38 321 mdecorde
import org.txm.importer.cwb.CwbMakeAll
39 321 mdecorde
import org.txm.importer.*
40 321 mdecorde
import org.txm.scripts.*
41 927 mdecorde
import org.txm.importer.xmltxm.*
42 321 mdecorde
import org.txm.utils.treetagger.TreeTagger
43 321 mdecorde
44 321 mdecorde
import javax.xml.stream.*
45 321 mdecorde
46 321 mdecorde
import java.net.URL
47 321 mdecorde
import java.io.File
48 321 mdecorde
import java.util.HashMap
49 321 mdecorde
import java.util.List
50 321 mdecorde
import java.util.HashMap
51 321 mdecorde
import java.util.HashSet
52 321 mdecorde
import org.txm.metadatas.*
53 479 mdecorde
import org.txm.utils.io.FileCopy
54 321 mdecorde
55 321 mdecorde
/**
56 321 mdecorde
 * The "compiler" Class of the XML/w import module.
57 321 mdecorde
 */
58 321 mdecorde
class compiler {
59 321 mdecorde
60 321 mdecorde
        /** Debug flag, off by default. Not read in the visible part of this class. */
61 321 mdecorde
        private boolean debug= false;
62 321 mdecorde
63 321 mdecorde
        /** Input stream on the XML source; opened by the URL constructor and by getAnaTypes. */
64 321 mdecorde
        private def inputData;
65 321 mdecorde
66 321 mdecorde
        /** StAX input factory used to create the parser. */
67 321 mdecorde
        private def factory;
68 321 mdecorde
69 321 mdecorde
        /** StAX reader created on top of inputData. */
70 321 mdecorde
        private XMLStreamReader parser;
71 321 mdecorde
72 321 mdecorde
        /** NOTE(review): not referenced in the visible part of this class; confirm it is still needed. */
73 321 mdecorde
        private def dir;
74 321 mdecorde
75 321 mdecorde
        /** Writer on the CQP file; (re)opened by createOutput. */
76 321 mdecorde
        private def output;
77 321 mdecorde
78 321 mdecorde
        /** URL of the XML source given to the constructor. */
79 321 mdecorde
        private def url;
80 321 mdecorde
81 321 mdecorde
        /**
         * Ana types (the type attribute value without its first character) gathered by getAnaTypes.
         * One annotation column is written per entry, in list order, for every word.
         */
82 321 mdecorde
        private static anatypes = []
83 321 mdecorde
        /** Values of the ana elements of the word being read, keyed by ana type; reset at each w element. */
        private static anavalues = [:]
84 321 mdecorde
85 321 mdecorde
        /** NOTE(review): not referenced in the visible part of this class; confirm it is still needed. */
86 321 mdecorde
        private HashMap<String, String> anahash = new HashMap<String, String>() ;
87 321 mdecorde
88 321 mdecorde
        /** Listener shared by all instances; told about each structure element opened and closed while compiling. */
        private static SAttributesListener sattrsListener;
89 321 mdecorde
        /** NOTE(review): structs and structsProf are not referenced in the visible part of this class. */
        private static HashMap<String, ArrayList<String>> structs;
90 321 mdecorde
        private static HashMap<String, Integer> structsProf;
91 321 mdecorde
92 321 mdecorde
        /** Identifier of the text, written in the "id" attribute of the text structure. */
93 321 mdecorde
        String text="";
94 321 mdecorde
95 321 mdecorde
        /** Value written in the "base" attribute of the text structure. */
96 321 mdecorde
        String base="";
97 321 mdecorde
98 321 mdecorde
        /** Value written in the "project" attribute of the text structure. */
99 321 mdecorde
        String project="";
100 321 mdecorde
101 321 mdecorde
        /** The text attributes. NOTE(review): never given a non-null value in the visible part of this class. */
102 321 mdecorde
        String[] textAttributes = null;
103 321 mdecorde
104 321 mdecorde
        /** Language of the corpus, written in the "lang" attribute of the txmcorpus element; "fr" by default. */
105 321 mdecorde
        private String lang ="fr";
106 321 mdecorde
107 321 mdecorde
        /** Name of the text metadata used by run to sort the files; when null the files are sorted in their natural order. */
        public static sortMetadata = null;
108 321 mdecorde
        /** When true, attribute values and ana values are lowercased before being written. */
        public static normalizeMetadata = false;
109 321 mdecorde
110 321 mdecorde
        /**
         * Creates a compiler that is not bound to any source file: no stream is
         * opened and no parser is created.
         */
        public compiler() {
        }
115 321 mdecorde
116 321 mdecorde
        /**
         * Sets the two import options.
         *
         * Both values are stored in static fields, so they are shared by every
         * instance of this class.
         *
         * @param sortmetadata the value stored in sortMetadata
         * @param normalizemetadata the value stored in normalizeMetadata
         */
        public void setOptions(String sortmetadata, boolean normalizemetadata) {
                sortMetadata = sortmetadata
                normalizeMetadata = normalizemetadata
        }
121 321 mdecorde
122 321 mdecorde
        /**
         * Instantiates a new compiler reading one XML source.
         *
         * Opens the stream behind {@code url}, creates a StAX reader on it and
         * attaches the shared SAttributesListener to that reader (the listener is
         * created on first use, restarted otherwise). Stream and parsing errors are
         * reported on the console and not rethrown.
         *
         * @param url the location of the XML file to read
         * @param text the identifier written in the "id" attribute of the text structure
         * @param base the value written in the "base" attribute of the text structure
         * @param project the value written in the "project" attribute of the text structure
         */
        public compiler(URL url, String text, String base, String project)
        {
                this.text = text
                this.base = base
                this.project = project
                this.url = url
                try {
                        inputData = url.openStream()

                        factory = XMLInputFactory.newInstance()
                        parser = factory.createXMLStreamReader(inputData)

                        if (sattrsListener == null) {
                                sattrsListener = new SAttributesListener(parser)
                        } else {
                                sattrsListener.start(parser)
                        }
                } catch (XMLStreamException ex) {
                        System.out.println(ex)
                        // No reader was created: nothing else will ever close the stream.
                        if (parser == null && inputData != null) {
                                try {
                                        inputData.close()
                                } catch (IOException ignored) {
                                        // best effort: the failure was already reported above
                                }
                        }
                } catch (IOException ex) {
                        System.err.println("IOException while parsing " + url + ": " + ex)
                }
        }
154 321 mdecorde
155 321 mdecorde
        /**
         * Sets the language of the corpus.
         *
         * @param lang the language value to store
         * @return the method declares no return type, so Groovy returns the value
         *         of its last expression (the assignment)
         */
        public setLang(String lang) {
                this.lang = lang
        }
165 321 mdecorde
166 321 mdecorde
        /** Annotation success flag; false until setAnnotationSuccess stores another value. */
        boolean annotationSuccess = false

        /**
         * Stores the annotation success flag.
         *
         * @param val the new value of the flag
         */
        public void setAnnotationSuccess(boolean val) {
                annotationSuccess = val
        }
178 321 mdecorde
179 321 mdecorde
        /**
         * Opens the UTF-8 writer stored in the output field.
         *
         * The file is opened in append mode when it already exists, so each call
         * adds to what earlier calls wrote instead of truncating the file.
         *
         * @param f the file to write to
         * @return true if the writer was opened; false if an exception occurred
         *         (it is printed on the error stream)
         */
        private boolean createOutput(File f) {
                try {
                        boolean append = f.exists()
                        def fileStream = new FileOutputStream(f, append)
                        output = new OutputStreamWriter(new BufferedOutputStream(fileStream), "UTF-8")
                } catch (Exception e) {
                        System.err.println(e)
                        return false
                }
                return true
        }
195 321 mdecorde
196 321 mdecorde
        /**
         * Advances the parser until the end tag of the "teiHeader" element has been
         * read, or until the end of the document when there is no such element.
         */
        private void GoToText() {
                int event = parser.next()
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean closesHeader = event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")
                        if (closesHeader) {
                                return
                        }
                        event = parser.next()
                }
        }
207 321 mdecorde
208 321 mdecorde
        /**
         * Transforms the XML source read by this compiler and appends the result to
         * a CQP file.
         *
         * What is written:
         * - the "text" element becomes a text tag carrying the id, base and project
         *   of this compiler followed by the attributes of the element, "id" excepted;
         * - each "w" element becomes one line: the form, the word id, then one
         *   tab-separated value per entry of anatypes (empty when the word has no
         *   ana of that type);
         * - "form" and "ana" elements only feed the current word;
         * - any other element is written as a tag with a lowercase name and its
         *   attributes, or with a generated "n" counter when it has no attribute.
         *   Elements found after the "tei" start tag and before the "text" start
         *   tag are skipped.
         *
         * When reading fails, the source file is copied in a "compiler-error"
         * directory next to the CQP file. The writer, the reader and the input
         * stream are closed in every case.
         *
         * @param cqpFile the CQP file to append to
         * @param textmetadata the metadata of the text; not used by this method
         * @return true if the whole source was transformed, false if the output
         *         could not be opened or if reading failed
         */
        public boolean transfomFileCqp(File cqpFile, HashMap<String, String> textmetadata)
        {
                if (!createOutput(cqpFile)) {
                        releaseResources()
                        return false;
                }

                String vAna = "";
                String vForm = "";
                String wordid = "";

                def ncounts = [:] // next "n" value per element name, for elements written without attribute

                boolean flagForm = false;
                boolean flagAna = false;

                String anatype = "";
                String anavalue = "";
                boolean foundtei = false;
                boolean foundtext = false;
                try {
                        String localname;
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName().toLowerCase();
                                                if ("tei".equals(localname)) foundtei = true;
                                                switch (localname) {
                                                        case "text":
                                                                sattrsListener.startElement(localname);
                                                                foundtext = true;
                                                                output.write("<text id=\""+text+"\" base=\""+base+"\"" + " project=\""+project+"\"");
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        String attrname = parser.getAttributeLocalName(i);
                                                                        String attrvalue = parser.getAttributeValue(i)
                                                                        if (normalizeMetadata)
                                                                                attrvalue = attrvalue.toLowerCase();
                                                                        // the id attribute was already written from the text field
                                                                        if (attrname != "id")
                                                                                output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+"\"")
                                                                }
                                                                output.write(">\n");
                                                                break;

                                                        case "w":
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                                                        if (parser.getAttributeLocalName(i).equals("id")) {
                                                                                wordid = parser.getAttributeValue(i);
                                                                        }
                                                                }
                                                                anavalues = [:];
                                                                break;

                                                        case "form":
                                                                flagForm = true;
                                                                vForm = "";
                                                                vAna = "";
                                                                break;

                                                        case "ana":
                                                                flagAna = true;
                                                                anavalue = "";
                                                                for (int i = 0 ; i < parser.getAttributeCount(); i++)
                                                                        if ("type".equals(parser.getAttributeLocalName(i))) {
                                                                                anatype = parser.getAttributeValue(i).substring(1);//remove the #
                                                                                break;
                                                                        }
                                                                break;

                                                        default:
                                                                // skip what lies between the tei start tag and the text start tag
                                                                if (foundtei && !foundtext) break;

                                                                sattrsListener.startElement(localname);
                                                                output.write("<"+localname);

                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        String attrname = parser.getAttributeLocalName(i);
                                                                        String attrvalue = parser.getAttributeValue(i)
                                                                        if (normalizeMetadata)
                                                                                attrvalue = attrvalue.toLowerCase();
                                                                        output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+"\"")
                                                                }
                                                                if (parser.getAttributeCount() == 0) { // add the n attribute
                                                                        if (!ncounts.containsKey(localname)) ncounts.put(localname, 0);
                                                                        int ncount = ncounts.get(localname);
                                                                        ncounts.put(localname, ncount+1);
                                                                        output.write(" n=\""+ncount+"\"")
                                                                }
                                                                output.write(">\n");
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName().toLowerCase();
                                                switch (localname) {
                                                        case "w":
                                                                // one column per known ana type, empty when the word has none
                                                                for (String type : anatypes) {
                                                                        def v = anavalues.get(type);
                                                                        if (v != null) vAna +="\t"+v;
                                                                        else vAna +="\t";
                                                                }
                                                                vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
                                                                output.write(vForm+"\t"+wordid+vAna+"\n");
                                                                vAna = "";
                                                                vForm = "";
                                                                break;

                                                        case "tei":
                                                                break;
                                                        case "form":
                                                                flagForm = false;
                                                                break;
                                                        case "ana":
                                                                anavalues.put(anatype, anavalue)
                                                                flagAna = false;
                                                                break;
                                                        default:
                                                                if (foundtei && !foundtext) break;

                                                                sattrsListener.endElement(localname);
                                                                output.write("</"+localname+">\n");
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (flagForm)
                                                        vForm += parser.getText().trim();
                                                if (flagAna) {
                                                        if (normalizeMetadata)
                                                                anavalue += parser.getText().trim().toLowerCase();
                                                        else
                                                                anavalue += parser.getText().trim();
                                                }
                                                break;
                                }
                        }
                        // closed here as well so that a failure to flush is reported as an error
                        output.close();
                        parser.close();
                } catch (Exception ex) {
                        System.out.println("Exception while parsing " + inputData+" of Text "+text+": "+ex);
                        File xmlFile = null
                        File errorDir = null
                        try {
                                xmlFile = new File(url.getFile())
                                errorDir = new File(cqpFile.getParentFile(), "compiler-error")
                                println "Warning: Moving $xmlFile to $errorDir"
                                errorDir.mkdir();
                                FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
                        } catch(Exception eCopy) {
                                println "Error while moving "+url+" to "+errorDir
                        }
                        return false;
                } finally {
                        releaseResources()
                }
                return true;
        }

        /**
         * Closes the writer, the StAX reader and the input stream of this compiler.
         *
         * Resources that were never opened are skipped and close failures are
         * ignored. The input stream is closed explicitly because closing a StAX
         * reader does not close the source it reads from.
         */
        private void releaseResources()
        {
                for (def resource : [output, parser, inputData]) {
                        try {
                                if (resource != null) resource.close()
                        } catch (Exception ignored) {
                                // best effort: the outcome was already decided by the caller
                        }
                }
        }
407 321 mdecorde
408 321 mdecorde
        /**
         * Registers the ana types used in one XML file.
         *
         * Reads the whole file and, for every "ana" element, takes the value of its
         * "type" attribute without its first character (the "#"). Types not yet
         * present in anatypes are appended to it; nothing is appended when reading
         * fails, and the exception is propagated to the caller.
         *
         * The inputData, factory and parser fields are overwritten by this method.
         * The reader and the stream are closed before returning, whether reading
         * succeeded or not.
         *
         * @param xmlFile the XML file to scan
         */
        private void getAnaTypes(File xmlFile) {
                inputData = xmlFile.toURI().toURL().openStream();
                HashSet<String> types = new HashSet<String>();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue; // only start tags carry @type
                                if (!"ana".equals(parser.getLocalName())) continue;
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
                                        if ("type".equals(parser.getAttributeLocalName(i))) {
                                                types.add(parser.getAttributeValue(i).substring(1)); //remove the #
                                                break;
                                        }
                                }
                        }
                        parser.close()
                } finally {
                        // Closing the reader does not close its source, so the stream is closed here.
                        try {
                                if (parser != null) parser.close()
                        } catch (Exception ignored) {
                                // best effort
                        }
                        try {
                                inputData.close()
                        } catch (Exception ignored) {
                                // best effort
                        }
                }

                for (String type : types)
                        if (!anatypes.contains(type))
                                anatypes << type
        }
432 321 mdecorde
433 321 mdecorde
        /**
434 321 mdecorde
         * Run.
435 321 mdecorde
         *
436 321 mdecorde
         * @param rootDirFile the root dir file
437 321 mdecorde
         * @param basename the basename
438 321 mdecorde
         * @param textAttributes the text attributes
439 321 mdecorde
         * @param srcfiles the srcfiles
440 321 mdecorde
         * @return true, if successful
441 321 mdecorde
         */
442 321 mdecorde
        public boolean run(File binDir, File txmDir, String corpusname, String[] textAttributes, def srcfiles, Metadatas metadatas)
443 321 mdecorde
        {
444 321 mdecorde
                sattrsListener = null; // reset SAttribute Listener for each new import
445 321 mdecorde
                String rootDir = binDir.getAbsolutePath();
446 321 mdecorde
                anatypes = [] // reset
447 321 mdecorde
                anavalues = [:] // reset
448 714 mdecorde
449 714 mdecorde
                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
450 714 mdecorde
                        println ("Error: CWB executables not well set.")
451 321 mdecorde
                        return false;
452 321 mdecorde
                }
453 321 mdecorde
                if (!binDir.exists()) {
454 321 mdecorde
                        println ("binary directory does not exists: "+binDir)
455 321 mdecorde
                        return false;
456 321 mdecorde
                }
457 321 mdecorde
458 803 mdecorde
                File cqpFile = new File(binDir, "cqp/"+corpusname+".cqp")
459 803 mdecorde
                new File(binDir, "cqp").deleteDir()
460 803 mdecorde
                new File(binDir, "cqp").mkdir()
461 321 mdecorde
                new File(binDir, "data").deleteDir()
462 321 mdecorde
                new File(binDir, "data").mkdir()
463 321 mdecorde
                new File(binDir, "registry").mkdir()
464 321 mdecorde
465 321 mdecorde
                String textid = ""
466 321 mdecorde
                int counttext = 0
467 321 mdecorde
                List<File> files = txmDir.listFiles()
468 804 mdecorde
                //1- Transform into CQP file
469 321 mdecorde
                def builder = null
470 321 mdecorde
471 321 mdecorde
                //start corpus
472 803 mdecorde
                if (createOutput(cqpFile)) {
473 321 mdecorde
                        output.write("<txmcorpus lang=\""+lang+"\">\n")
474 321 mdecorde
                        output.close()
475 321 mdecorde
                }
476 321 mdecorde
477 321 mdecorde
                // sort files
478 321 mdecorde
                if (sortMetadata == null) {
479 321 mdecorde
                        Collections.sort(files)
480 321 mdecorde
                } else {
481 321 mdecorde
                        HashMap<File, String> sortmetadatavalues = new HashMap<File, String>()
482 321 mdecorde
                        for (File f : files) {
483 321 mdecorde
                                String value = MetadataGetter.get(f,"text", sortMetadata)
484 321 mdecorde
                                sortmetadatavalues.put(f, value)
485 321 mdecorde
                        }
486 321 mdecorde
                        println "sort properties value: "+sortmetadatavalues
487 321 mdecorde
                        Collections.sort(files, new Comparator<File>() {
488 321 mdecorde
                                /**
489 321 mdecorde
                                 * Orders two files by the sort metadata value recorded for each of them.
490 321 mdecorde
                                 *
491 321 mdecorde
                                 * @param o1 the first file to compare
492 321 mdecorde
                                 * @param o2 the second file to compare
493 321 mdecorde
                                 * @return 0 if either file has no sort value, otherwise the result of comparing the two values as strings
494 321 mdecorde
                                 */
495 321 mdecorde
                                                public int compare(Object o1, Object o2) {
496 321 mdecorde
                                                        String v1 = sortmetadatavalues.get((File)o1)
497 321 mdecorde
                                                        String v2 = sortmetadatavalues.get((File)o2)
498 321 mdecorde
                                                        if (v1 == null || v2 == null) return 0;
499 321 mdecorde
                                                        return v1.compareTo(v2)
500 321 mdecorde
                                                }
501 321 mdecorde
                                        });
502 321 mdecorde
                }
503 321 mdecorde
504 321 mdecorde
                // get all anatypes
505 321 mdecorde
                for (File f : files) {
506 321 mdecorde
                        getAnaTypes(f)
507 321 mdecorde
                }
508 321 mdecorde
509 321 mdecorde
                println("Compiling "+files.size()+" $files ")
510 321 mdecorde
                for (File f : files) {
511 321 mdecorde
                        print "."
512 321 mdecorde
                        HashMap<String, String> textmetadata;
513 321 mdecorde
                        if (metadatas != null)
514 321 mdecorde
                                textmetadata = metadatas.getTextMetadata(f)
515 321 mdecorde
                        else
516 321 mdecorde
                                textmetadata = [:]
517 321 mdecorde
518 321 mdecorde
                        counttext++;
519 321 mdecorde
                        if (!f.exists()) {
520 321 mdecorde
                                println("file "+f+ " does not exists")
521 321 mdecorde
                        } else {
522 321 mdecorde
                                String txtname = f.getName().substring(0,f.getName().length()-4)
523 321 mdecorde
                                builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default")
524 321 mdecorde
                                builder.setLang(lang);
525 804 mdecorde
                                if (!builder.transfomFileCqp(cqpFile, textmetadata)) {
526 321 mdecorde
                                        println("Failed to compile "+f)
527 321 mdecorde
                                }
528 321 mdecorde
                        }
529 321 mdecorde
                }
530 321 mdecorde
531 321 mdecorde
                //end corpus
532 803 mdecorde
                if (createOutput(cqpFile)) {
533 321 mdecorde
                        output.write("</txmcorpus>\n")
534 321 mdecorde
                        output.close()
535 321 mdecorde
                }
536 321 mdecorde
                println ""
537 321 mdecorde
                //2- Import into CWB
538 321 mdecorde
                def outDir = rootDir
539 321 mdecorde
540 321 mdecorde
                CwbEncode cwbEn = new CwbEncode()
541 321 mdecorde
                cwbEn.setDebug(debug)
542 321 mdecorde
                CwbMakeAll cwbMa = new CwbMakeAll()
543 321 mdecorde
                cwbMa.setDebug(debug)
544 321 mdecorde
545 321 mdecorde
                List<String> pargs = []
546 321 mdecorde
                pargs.add("id")
547 321 mdecorde
                for (String ana : anatypes)
548 321 mdecorde
                        pargs.add(ana)
549 321 mdecorde
550 321 mdecorde
                String[] pAttrs = pargs
551 321 mdecorde
552 321 mdecorde
                structs = sattrsListener.getStructs()
553 321 mdecorde
                structsProf = sattrsListener.getProfs()
554 321 mdecorde
555 321 mdecorde
                if (debug) {
556 321 mdecorde
                        println structs
557 321 mdecorde
                        println structsProf
558 321 mdecorde
                }
559 321 mdecorde
560 321 mdecorde
                List<String> sargs = new ArrayList<String>()
561 321 mdecorde
                def tmpTextAttrs = []
562 321 mdecorde
                for (String name : structs.keySet()) {
563 321 mdecorde
                        if (name == "text") {
564 321 mdecorde
                                for (String value : structs.get(name)) // append the attributes
565 321 mdecorde
                                        tmpTextAttrs << value // added after
566 321 mdecorde
                                continue;
567 321 mdecorde
                        }
568 321 mdecorde
                        //if ( name == "q") continue; // added after
569 321 mdecorde
                        //if ( name == "foreign") continue; // added after
570 321 mdecorde
                        String concat = name+":"+structsProf.get(name); // append the depth
571 321 mdecorde
                        for (String attributeName : structs.get(name)) // append the attributes
572 321 mdecorde
                                concat += "+"+attributeName.toLowerCase();
573 321 mdecorde
574 321 mdecorde
                        if (structs.get(name).size() == 0) {
575 321 mdecorde
                                concat += "+n";
576 321 mdecorde
                        } else {
577 321 mdecorde
                                if (!structs.get(name).contains("n"))
578 321 mdecorde
                                        concat += "+n"
579 321 mdecorde
                        }
580 321 mdecorde
581 321 mdecorde
                        if ((name == "p" || name == "body" || name == "back" || name == "front")
582 321 mdecorde
                                 && !concat.contains("+n+") && !concat.endsWith("+n"))
583 321 mdecorde
                                concat += "+n"
584 321 mdecorde
585 321 mdecorde
                        sargs.add(concat)
586 321 mdecorde
                }
587 321 mdecorde
588 321 mdecorde
                String textSAttributes = "text:0+id+base+project";
589 321 mdecorde
                for (String name : tmpTextAttrs) {
590 321 mdecorde
                        if (!("id".equals(name) || "base".equals(name) || "project".equals(name)))
591 321 mdecorde
                                textSAttributes += "+"+name.toLowerCase()
592 321 mdecorde
                }
593 321 mdecorde
                //                if (metadataXPath != null) {
594 321 mdecorde
                //                        for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
595 321 mdecorde
                //                                textSAttributes+="+"+meta;
596 321 mdecorde
                //                }
597 321 mdecorde
                sargs.add(textSAttributes)
598 321 mdecorde
                sargs.add("txmcorpus:0+lang")
599 321 mdecorde
600 321 mdecorde
                sargs.sort()
601 321 mdecorde
602 321 mdecorde
                String[] sAttributes = sargs
603 321 mdecorde
                String[] pAttributes = pAttrs
604 321 mdecorde
                println "P-attributes: "+pAttributes
605 321 mdecorde
                println "S-attributes: "+sargs
606 321 mdecorde
607 321 mdecorde
                //if(!annotationSuccess)
608 321 mdecorde
                //pAttributes = ["id"];
609 321 mdecorde
610 321 mdecorde
                //println "PATTRIBUTES : "+pargs;
611 321 mdecorde
                /*
612 321 mdecorde
                 ArrayList<String> wordstag = ["w"];
613 321 mdecorde
                 println "Getting structural attributes..."
614 321 mdecorde
                 BuildCwbEncodeArgs argsgetter = new BuildCwbEncodeArgs();
615 321 mdecorde
                 HashMap<String, HashSet<String>> allStructures = new HashMap<String, HashSet<String>>();
616 321 mdecorde
                 HashMap<String, Integer> allStructuresInclusion = new HashMap<String, Integer>();
617 321 mdecorde
                 for (File srcfile: txmDir.listFiles()) {
618 321 mdecorde
                 if (!(!srcfile.getName().endsWith(".csv") && srcfile.canRead() && !srcfile.isHidden() && !srcfile.isDirectory() && ValidateXml.test(srcfile)))
619 321 mdecorde
                 continue;
620 321 mdecorde
                 print "."
621 321 mdecorde
                 argsgetter.process(srcfile, wordstag);
622 321 mdecorde
                 for (String sattr : argsgetter.getSAttributes()) {
623 321 mdecorde
                 int idx = sattr.indexOf(":");
624 321 mdecorde
                 if(idx < 0 )
625 321 mdecorde
                 continue;
626 321 mdecorde
                 String name = sattr.substring(0, idx);
627 321 mdecorde
                 if (!allStructures.containsKey(name)) {
628 321 mdecorde
                 allStructures.put(name, new HashSet<String>());
629 321 mdecorde
                 allStructuresInclusion.put(name, 0);
630 321 mdecorde
                 }
631 321 mdecorde
                 //println "sattr: "+name
632 321 mdecorde
                 String attrs = sattr.substring(idx+1);
633 321 mdecorde
                 String[] split = attrs.split("\\+");
634 321 mdecorde
                 if (split.length > 0) {
635 321 mdecorde
                 int start = 1;
636 321 mdecorde
                 try {// test if first attr is a number
637 321 mdecorde
                 int n = Integer.parseInt(split[0]);
638 321 mdecorde
                 if (n > allStructuresInclusion.get(name))
639 321 mdecorde
                 allStructuresInclusion.put(name, n);
640 321 mdecorde
                 } catch(Exception e) {start = 0;}
641 321 mdecorde
                 for (int i = start ; i < split.length ; i++)
642 321 mdecorde
                 allStructures.get(name).add(split[i]);
643 321 mdecorde
                 }
644 321 mdecorde
                 }
645 321 mdecorde
                 }
646 321 mdecorde
                 // add structures+properties found in sources
647 321 mdecorde
                 List<String> sargs = new ArrayList<String>();
648 321 mdecorde
                 for (String name : allStructuresInclusion.keySet()) {
649 321 mdecorde
                 String concat = name+":"+allStructuresInclusion.get(name);
650 321 mdecorde
                 for (String value : allStructures.get(name))
651 321 mdecorde
                 concat += "+"+value;
652 321 mdecorde
                 if (name.equals("text")) {
653 321 mdecorde
                 concat += "+base+project"
654 321 mdecorde
                 if (!concat.contains("id"))
655 321 mdecorde
                 concat += "+id";
656 321 mdecorde
                 }
657 321 mdecorde
                 sargs.add(concat);
658 321 mdecorde
                 }*/
659 321 mdecorde
660 321 mdecorde
                //                for (int i = 0 ; i < sargs.size() ; i++) {
661 321 mdecorde
                //                        if (sargs.get(i).startsWith("text:")) {
662 321 mdecorde
                //                                String str = sargs.get(i);
663 321 mdecorde
                //                                sargs.set(i, "text:"+str.substring(6));
664 321 mdecorde
                //                        }
665 321 mdecorde
                //                }
666 321 mdecorde
667 321 mdecorde
                //                String textSAttributes = "text:0+id+base+project";
668 321 mdecorde
                //                if (metadatas != null) {
669 321 mdecorde
                //                        for (String meta : metadatas.getHeadersList()) // text property declarations from metadata.csv
670 321 mdecorde
                //                                textSAttributes+="+"+meta;
671 321 mdecorde
                //                }
672 321 mdecorde
                //sargs.add(textSAttributes)
673 321 mdecorde
                //sargs.add("txmcorpus:0+lang")
674 321 mdecorde
675 321 mdecorde
                //                String[] sAttributes = sargs;
676 321 mdecorde
                //                System.out.println("\nCorpus structures: "+sAttributes);
677 321 mdecorde
                //                System.out.println("corpus word properties: "+pAttributes);
678 321 mdecorde
679 321 mdecorde
                try {
680 321 mdecorde
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase();
681 714 mdecorde
                        cwbEn.run(
682 321 mdecorde
                                outDir + "/data/$corpusname",
683 803 mdecorde
                                outDir + "/cqp/"+corpusname+".cqp",
684 321 mdecorde
                                regPath, pAttributes, sAttributes);
685 321 mdecorde
                        if (!new File(regPath).exists()) {
686 321 mdecorde
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
687 321 mdecorde
                                return false;
688 321 mdecorde
                        }
689 714 mdecorde
                        cwbMa.run(corpusname, outDir + "/registry");
690 321 mdecorde
                } catch (Exception ex) {System.out.println(ex); return false;}
691 321 mdecorde
692 321 mdecorde
                return true;
693 321 mdecorde
        }
694 321 mdecorde
695 321 mdecorde
        /**
         * Switches the debug flag of this compiler on.
         *
         * This method only ever sets the flag to true; it takes no argument
         * and cannot be used to switch the flag back off.
         */
        public void setDebug() {
                debug = true
        }
702 321 mdecorde
703 321 mdecorde
        /**
         * Manual entry point: creates a compiler with debug enabled, points it
         * at the CWB binaries and runs it on a hard-coded source directory.
         *
         * Both locations live under the current user's home directory:
         * the sources in "xml/geo" and the CWB binaries in "TXM/cwb/bin".
         *
         * @param args the command line arguments (not used)
         */
        public static void main(String[] args)
        {
                // java.io.File does not perform shell-style "~" expansion: a path such as
                // "~/xml/geo" designates a directory literally named "~" relative to the
                // working directory. Resolve against the user.home system property instead.
                String home = System.getProperty("user.home");
                File dir = new File(home, "xml/geo");
                File cwbBinDir = new File(home, "TXM/cwb/bin");

                def c = new compiler();
                c.setDebug();
                c.setCwbPath(cwbBinDir.getAbsolutePath());
                // NOTE(review): the second argument is presumably the corpus name - confirm against run()
                c.run(dir,"geo");
        }
716 321 mdecorde
}