Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / alceste / compiler.groovy @ 1688

History | View | Annotate | Download (6.6 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
25 321 mdecorde
// $LastChangedRevision: 3219 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 986 mdecorde
package org.txm.scripts.importer.alceste;
29 321 mdecorde
30 1094 mdecorde
import org.txm.objects.*
31 1000 mdecorde
import org.txm.importer.cwb.CwbAlign;
32 1000 mdecorde
import org.txm.importer.cwb.CwbEncode
33 1000 mdecorde
import org.txm.importer.cwb.CwbMakeAll
34 986 mdecorde
import org.txm.scripts.importer.*;
35 321 mdecorde
import org.txm.scripts.*;
36 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*;
37 321 mdecorde
import org.txm.utils.treetagger.TreeTagger;
38 714 mdecorde
import org.txm.core.preferences.TXMPreferences
39 714 mdecorde
import org.txm.libs.cqp.CQPLibPreferences
40 1094 mdecorde
import org.txm.searchengine.cqp.corpus.*
41 321 mdecorde
import javax.xml.stream.*;
42 714 mdecorde
43 321 mdecorde
import java.net.URL;
44 321 mdecorde
import java.io.File;
45 321 mdecorde
import java.util.Collection;
46 321 mdecorde
import java.util.HashMap;
47 321 mdecorde
import java.util.List;
48 321 mdecorde
49 321 mdecorde
// TODO: Auto-generated Javadoc
50 321 mdecorde
/**
 * Compiles the XML-TXM files of an Alceste import into a CWB corpus:
 * <ol>
 * <li>builds the CQP source file from the files of the project's "txm/&lt;corpus name&gt;" directory,</li>
 * <li>builds the CWB indexes by running CwbEncode then CwbMakeAll,</li>
 * <li>checks that the registry file was created.</li>
 * </ol>
 *
 * @author mdecorde
 */
class compiler {
        /** Debug flag, forwarded to the CwbEncode and CwbMakeAll wrappers by {@link #run(Project)}. */
        boolean debug = false;

        /** The input data (not referenced by the methods of this class). */
        private def inputData;

        /** The factory (not referenced by the methods of this class). */
        private def factory;

        /** The parser (not referenced by the methods of this class). */
        private XMLStreamReader parser;

        /** The dir (not referenced by the methods of this class). */
        private def dir;

        /** The writer currently opened on the CQP file; set by {@link #createOutput(File)}. */
        private def output;

        /** The url (not referenced by the methods of this class). */
        private def url;

        /** The anahash (not referenced by the methods of this class). */
        private HashMap<String, String> anahash = new HashMap<String, String>() ;

        /** The text (not referenced by the methods of this class). */
        String text = "";

        /** The base (not referenced by the methods of this class). */
        String base = "";

        /** The language code written in the "lang" attribute of the corpus root and given to the CQP builder. */
        String lang = "fr";

        /**
         * Creates a compiler with the default settings (debug off, lang "fr").
         */
        public compiler(){}

        /** The annotation success flag; only stored by this class. */
        boolean annotationSuccess = false;

        /**
         * Sets the annotation success.
         *
         * @param val the new annotation success
         */
        public void setAnnotationSuccess(boolean val)
        {
                this.annotationSuccess = val;
        }

        /**
         * Opens {@link #output} as an UTF-8 writer on the given file.
         * The file is opened in append mode when it already exists, so that
         * successive calls add content at the end of the same file.
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if an exception occurred
         *         (the exception is printed on the standard error stream)
         */
        private boolean createOutput(File f){
                try {
                        output = new OutputStreamWriter(new FileOutputStream(f,f.exists()) , "UTF-8");
                        return true;
                } catch (Exception e) {
                        System.err.println(e);
                        return false;
                }
        }

        /**
         * Appends some content to the CQP file and closes the writer, even when
         * the write fails.
         *
         * @param cqpFile the CQP file to append to
         * @param content the text to append
         * @return true if the content was written and flushed, false otherwise
         */
        private boolean appendToCqpFile(File cqpFile, String content) {
                if (!createOutput(cqpFile)) {
                        return false;
                }

                boolean written = false;
                try {
                        output.write(content);
                        output.flush(); // make sure the content is on disk before reporting success
                        written = true;
                } catch (IOException e) {
                        System.err.println(e);
                } finally {
                        try {
                                output.close();
                        } catch (IOException e) {
                                System.err.println(e);
                        }
                }
                return written;
        }

        /**
         * Removes the last extension of a file name ("name.xml" gives "name").
         * A name without extension, or starting with its only dot, is returned unchanged.
         *
         * @param filename the file name, without directory
         * @return the file name without its last extension
         */
        private static String stripExtension(String filename) {
                int dot = filename.lastIndexOf(".");
                return dot > 0 ? filename.substring(0, dot) : filename;
        }

        /**
         * Sets the lang.
         *
         * @param lang the lang
         * @return no return type is declared: Groovy returns the value of the
         *         last expression, i.e. the assigned lang
         */
        public setLang(String lang)
        {
                this.lang = lang;
        }

        /**
         * Builds the CWB corpus of a project.
         *
         * The previous corpus build of the project, if any, is deleted and a new
         * MainCorpus is created. Then the files of "txm/&lt;project name&gt;" are
         * transformed, in sorted order, into "cqp/&lt;project name&gt;.cqp", and the
         * CQP file is given to CwbEncode and CwbMakeAll.
         *
         * @param project the project to compile; its directory and name are used
         *        to locate the "txm", "cqp", "data" and "registry" directories
         * @return true if successful; false if the CWB executables are not
         *         available, if there is no file to compile, if the CQP file
         *         could not be written, if the registry file was not created or
         *         if the CWB step raised an exception
         */
        public boolean run(Project project)
        {
                File binDir = project.getProjectDirectory()
                String corpusname = project.getName();
                File txmDir = new File(binDir, "txm/"+corpusname)
                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }

                CorpusBuild corpus = project.getCorpusBuild(project.getName());
                if (corpus != null) {
                        println "CLEAN PREVIOUS CORPUS"
                        corpus.delete(); // remove old files
                }

                // make new one
                corpus = new MainCorpus(project);
                corpus.setID(project.getName());
                corpus.setName(project.getName());
                corpus.setDescription("Build the Alceste import module");

                File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
                // the CQP file is written in append mode: start from a fresh file
                cqpFile.delete()
                new File(binDir,"cqp").mkdirs()
                new File(binDir,"data").mkdirs()
                new File(binDir,"registry").mkdirs()

                List<File> files = txmDir.listFiles();
                if (files == null || files.size() == 0) {
                        println "no file to compile"
                        return false;
                }

                //start corpus
                if (!appendToCqpFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
                        println "Error: could not write the corpus header in "+cqpFile
                        return false;
                }

                Collections.sort(files);
                //1- Transform into CQP file
                XMLTXM2CQP cqpbuilder = null;
                println("process "+files.size()+" files")
                for (File f : files) {
                        print "."
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                        } else {
                                cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
                                // the text name is the file name without its extension
                                String txtname = stripExtension(f.getName());
                                cqpbuilder.setTextInfo(txtname, corpusname, "default");

                                cqpbuilder.setBalisesToKeep(["text", "s", "p", "lb"]);
                                cqpbuilder.setSendToPAttributes(["lb":["n"], "p":["n"], "s":["n"]])
                                cqpbuilder.setLang(lang);
                                if (!cqpbuilder.transformFile(cqpFile)) {
                                        println("Failed to compile "+f)
                                }
                        }
                }
                println ""

                //end corpus
                if (!appendToCqpFile(cqpFile, "</txmcorpus>\n")) {
                        println "Error: could not write the corpus footer in "+cqpFile
                        return false;
                }

                if (cqpbuilder == null) {
                        println "there was no files in bin dir : "+txmDir
                        return false;
                }

                //2- Import into CWB
                def outDir = binDir.getAbsolutePath();
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                // the attributes declared to CWB are those of the last builder used
                List<String> pAttributesList = cqpbuilder.getpAttributs();
                List<String> sAttributesList = cqpbuilder.getsAttributs();
                println "pAttrs : "+pAttributesList
                println "sAttrs : "+sAttributesList
                String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
                String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

                try {
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lowercase registry name
                        cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname.toUpperCase(), outDir + "/registry");

                } catch (Exception ex) {
                        System.out.println(ex);
                        return false;
                }

                return true;
        }

        /**
         * Turns the debug flag on.
         */
        public void setDebug()
        {
                this.debug = true;
        }

        /**
         * The main method: manual test entry point.
         *
         * NOTE(review): this class defines neither a setCwbPath method nor a
         * run(File, String) overload (only run(Project)), so the calls below
         * look stale and would fail at runtime; update them once the way to
         * obtain a Project outside of TXM is confirmed.
         * NOTE(review): java.io.File does not expand "~"; presumably these
         * paths were placeholders - confirm.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                File dir = new File("~/xml/geo");
                def c = new compiler();
                c.setDebug();
                c.setCwbPath("~/TXM/cwb/bin");
                c.run(dir,"geo");
        }
}