Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / discours / compiler.groovy @ 1000

History | View | Annotate | Download (6.8 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
// $LastChangedRevision: 3219 $
// $LastChangedBy: mdecorde $
//
28 986 mdecorde
package org.txm.scripts.importer.discours
29 321 mdecorde
30 1000 mdecorde
import org.txm.importer.cwb.BuildCwbEncodeArgsFromTEITXM;
31 1000 mdecorde
import org.txm.importer.cwb.CwbEncode
32 1000 mdecorde
import org.txm.importer.cwb.CwbMakeAll
33 986 mdecorde
import org.txm.scripts.importer.*;
34 321 mdecorde
import org.txm.scripts.*;
35 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*;
36 321 mdecorde
import org.txm.utils.treetagger.TreeTagger;
37 321 mdecorde
38 321 mdecorde
import javax.xml.stream.*;
39 321 mdecorde
import java.net.URL;
40 321 mdecorde
import java.io.File;
41 321 mdecorde
import java.util.HashMap;
42 321 mdecorde
import java.util.List;
43 321 mdecorde
44 321 mdecorde
/**
 * Compiles the XML-TXM files of a "discours" corpus into CWB indexes.
 *
 * {@link #run} builds a single {@code cqp/<corpusname>.cqp} file: it writes an
 * opening {@code txmcorpus} tag, hands the file to
 * {@code XMLTXM2CQP.transformFile} once per input file, writes the closing
 * tag, and then calls {@code CwbEncode} and {@code CwbMakeAll} on the result.
 */
class compiler {

    /** The stream opened on {@link #url} by the URL constructor. */
    private def inputData;

    /** The StAX factory used to create {@link #parser}. */
    private def factory;

    /** The StAX reader created over {@link #inputData}; null when the no-arg constructor is used. */
    private XMLStreamReader parser;

    /** The dir (not assigned anywhere in this class). */
    private def dir;

    /** The writer opened by {@link #createOutput(File)}. */
    private def output;

    /** The URL given to the constructor. */
    private def url;

    /** The anahash (not used by the code of this class). */
    private HashMap<String,String> anahash = new HashMap<String,String>() ;

    /** The text. */
    String text="";

    /** The base. */
    String base="";

    /** The project. */
    String project="";

    /** The language written in the {@code lang} attribute of the {@code txmcorpus} element. */
    private String lang ="fr";

    /** The text attributes. */
    static String textAttributes = "";

    /** The debug flag forwarded to CwbEncode and CwbMakeAll by {@link #run}. */
    boolean debug = false;

    /**
     * Creates a compiler without any input stream or parser.
     */
    public compiler(){
    }

    /**
     * Instantiates a new compiler and opens a StAX reader on the given URL.
     *
     * I/O and XML errors are reported on the error stream and leave
     * the parser unset instead of being thrown to the caller.
     *
     * @param url the url of the XML document to read
     * @param text the text
     * @param base the base
     * @param project the project
     */
    public compiler(URL url,String text,String base, String project) {
        this.text = text
        this.base = base;
        this.project = project;
        this.url = url;
        try {
            inputData = url.openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.err.println("XMLStreamException while opening "+url+": "+ex);
            // the stream was opened but no reader owns it: release it
            if (inputData != null) {
                try {
                    inputData.close();
                } catch (IOException ignored) {
                    // nothing more can be done with a stream that fails to close
                }
                inputData = null;
            }
        } catch (IOException ex) {
            System.err.println("IOException while opening "+url+": "+ex);
        }
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the value written in the lang attribute of the txmcorpus element
     * @return the assigned value (no return type is declared, so the result is
     *         the value of the last expression)
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Opens {@link #output} as a UTF-8 writer on the given file.
     * The file is opened in append mode when it already exists.
     *
     * @param f the file to write to
     * @return true if the writer was opened, false if an exception occurred
     */
    private boolean createOutput(File f){
        try {
            output = new OutputStreamWriter(new FileOutputStream(f,f.exists()) , "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Advances the parser until just after the closing tag of the
     * teiHeader element, or to the end of the document if there is none.
     */
    private void GoToText() {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
                return;
            }
        }
    }

    /**
     * Turns the debug flag on.
     */
    public void setDebug()
    {
        debug = true;
    }

    /**
     * Appends a string to the CQP file and always closes the writer.
     *
     * @param cqpFile the file to append to
     * @param content the text to write
     * @return false if the file could not be opened
     */
    private boolean appendToCqpFile(File cqpFile, String content) {
        if (!createOutput(cqpFile)) {
            return false;
        }
        try {
            output.write(content);
        } finally {
            output.close();
        }
        return true;
    }

    /**
     * Builds the CQP file from the given files and indexes it with CWB.
     *
     * The directories cqp, data/&lt;corpusname&gt; and registry of binDir are
     * deleted and recreated first. The given list is sorted in place.
     *
     * @param files the XML-TXM files to compile
     * @param binDir the directory receiving the cqp, data and registry directories
     * @param txmDir not used by this method
     * @param corpusname the name of the corpus
     * @return true, if successful
     */
    public boolean run(List<File> files, File binDir, File txmDir, String corpusname) {
        if (files == null) {
            // fail before any directory is deleted
            println "there was no files to process: "+files
            return false;
        }
        String binDirPath = binDir.getAbsolutePath()+"/";

        if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
            println ("Error: CWB executables not well set.")
            return false;
        }

        File cqpDir = new File(binDir, "cqp");
        File cqpFile = new File(cqpDir, corpusname+".cqp");
        File dataDir = new File(binDir, "data/"+corpusname);
        File registryDir = new File(binDir, "registry");
        // start from empty directories: createOutput appends to an existing
        // file, so a stale .cqp file would end up holding two corpora
        for (File d : [cqpDir, dataDir, registryDir]) {
            d.deleteDir();
            d.mkdirs(); // mkdirs: the parent "data" directory may not exist yet
        }

        //start corpus
        if (!appendToCqpFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
            println "Error: could not write the corpus header in "+cqpFile
            return false;
        }

        //1- Transform into CQP file
        Collections.sort(files);
        XMLTXM2CQP cqpbuilder = null;
        int count = 0;
        for (File f : files) {
            // progress output: comma separated names, five per line
            if (count > 0) {
                print(", ")
                if ((count % 5) == 0) println ""
            }
            print(f.getName().replaceFirst("\\.xml", ""));
            count++

            cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
            // drops the last 4 characters: assumes a 4-character extension such as ".xml"
            String txtname = f.getName().substring(0,f.getName().length()-4);
            cqpbuilder.setTextInfo(txtname, corpusname, "project");

            cqpbuilder.setBalisesToKeep(["text","p","s"]);
            cqpbuilder.setSendToPAttributes(["s":["id"], "p":["id"]]);
            cqpbuilder.setLang(lang);
            if (!cqpbuilder.transformFile(cqpFile)) {
                // a failing file is reported but does not stop the compilation
                println("Failed to compile "+f)
            }
        }

        //end corpus
        if (!appendToCqpFile(cqpFile, "</txmcorpus>\n")) {
            println "Error: could not write the corpus footer in "+cqpFile
            return false;
        }

        if (cqpbuilder == null) {
            println "there was no files to process: "+files
            return false;
        }

        //2- Import into CWB
        def outDir = binDirPath;

        CwbEncode cwbEn = new CwbEncode();
        CwbMakeAll cwbMa = new CwbMakeAll();

        // the attribute lists are read from the builder of the last file
        List<String> pAttributesList = cqpbuilder.getpAttributs();
        List<String> sAttributesList = cqpbuilder.getsAttributs();
        println "pAttrs : "+pAttributesList
        println "sAttrs : "+sAttributesList
        String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
        String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

        try {
            cwbEn.setDebug(debug);
            String regPath = outDir + "registry/"+corpusname.toLowerCase()
            cwbEn.run(
                    outDir + "data/$corpusname",
                    outDir + "cqp/"+corpusname+".cqp", // outDir already ends with "/"
                    regPath,
                    pAttributes,
                    sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.setDebug(debug);
            cwbMa.run(corpusname, outDir + "registry");

        } catch (Exception ex) {
            System.err.println(ex);
            return false;
        }

        return true;
    }

    /**
     * Compiles the files of the xml/discours/txm directory of the user home
     * into a corpus named "discours".
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args) {
        File txmDir = new File(System.getProperty("user.home"),"xml/discours/txm/");
        File[] listing = txmDir.listFiles();
        if (listing == null) {
            // listFiles() returns null when the directory is missing or unreadable
            println "Compiler failed: cannot list "+txmDir
            return;
        }
        List<File> files = new ArrayList<File>(Arrays.asList(listing));
        // NOTE(review): the previous code called run(files, "discours") and
        // setCwbLoc(...), neither of which exists in this class. The parent of
        // the txm directory is assumed to be the binary directory - confirm.
        File binDir = txmDir.getParentFile();
        def c = new compiler()
        if (!c.run(files, binDir, txmDir, "discours")) {
            println "Compiler failed"
            return;
        }
    }
}