Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / fleurs / compiler.groovy @ 1000

History | View | Annotate | Download (7.3 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25 321 mdecorde
// $LastChangedRevision: 3400 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 986 mdecorde
package org.txm.scripts.importer.fleurs;
29 321 mdecorde
30 1000 mdecorde
import org.txm.importer.cwb.CwbEncode
31 1000 mdecorde
import org.txm.importer.cwb.CwbMakeAll
32 986 mdecorde
import org.txm.scripts.importer.*;
33 321 mdecorde
import org.txm.scripts.*;
34 1000 mdecorde
import org.txm.importer.scripts.xmltxm.BuildTTSrc;
35 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*;
36 321 mdecorde
import org.txm.utils.treetagger.TreeTagger;
37 321 mdecorde
38 321 mdecorde
import javax.xml.stream.*;
39 321 mdecorde
import java.net.URL;
40 321 mdecorde
import java.io.File;
41 321 mdecorde
import java.net.URL;
42 321 mdecorde
import java.util.HashMap;
43 321 mdecorde
import java.util.List;
44 321 mdecorde
45 321 mdecorde
// TODO: Auto-generated Javadoc
46 321 mdecorde
/**
 * Compiler of the "fleurs" import module.
 *
 * An instance built with the 4-argument constructor reads one XML file with a
 * StAX parser and appends its CQP representation (one word per line,
 * structures as tags) to a CQP source file, see
 * {@link #transfomFileCqp(String, String)}.
 *
 * {@link #run(String)} drives the whole step: it converts every file of the
 * "txm" sub-directory into "cqp/fleurs.cqp" and then calls CwbEncode and
 * CwbMakeAll on the result.
 */
class compiler{

	/** The input stream opened on the source URL; null when not opened or already closed. */
	private def inputData;

	/** The StAX factory used to create the parser. */
	private def factory;

	/** The StAX reader over the input stream; null when it could not be created or is closed. */
	private def parser;

	/** The dir. Not referenced anywhere in this class. */
	private def dir;

	/** The writer on the CQP file, set by createOutput. */
	private def output;

	/** The URL of the XML file to convert. */
	private def url;

	/** The text identifier, written as the "id" attribute of the text structure. */
	String text="";

	/** The base name, written as the "base" attribute of the text structure. */
	String base="";

	/** The project name, written as the "project" attribute of the text structure. */
	String project="";

	/** The language code, written as the "lang" attribute of the txmcorpus structure. */
	private String lang ="fr";

	/**
	 * Instantiates a new compiler with no source file.
	 * Such an instance can call {@link #run(String)} but cannot convert a
	 * file itself since it has no parser.
	 */
	public compiler(){}

	/**
	 * Instantiates a compiler for one XML source file and opens a StAX parser
	 * on it.
	 *
	 * Opening errors are reported on the standard output and leave the
	 * instance without parser; {@link #transfomFileCqp(String, String)} then
	 * returns false.
	 *
	 * @param url the URL of the XML file to read
	 * @param text the identifier of the text
	 * @param base the base name
	 * @param project the project name
	 */
	public compiler(URL url,String text,String base, String project)
	{
		this.text = text
		this.base = base;
		this.project = project;
		this.url = url;

		try {
			inputData = url.openStream();

			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println("Error while creating the XML parser of " + url + ": " + ex);
			// the stream may already be opened: do not leak it
			closeResources();
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + url + ": " + ex);
			closeResources();
		}
	}

	/**
	 * Sets the language of the corpus.
	 *
	 * @param lang the language code
	 * @return the value assigned (Groovy implicit return; the method declares no return type)
	 */
	public setLang(String lang)
	{
		this.lang = lang;
	}

	/**
	 * Opens the UTF-8 writer on the output file. The file is opened in append
	 * mode when it already exists, so that several source files can be
	 * converted into the same CQP file.
	 *
	 * @param dirPathName the path of the parent directory
	 * @param fileName the file name, relative to dirPathName
	 * @return true, if the writer was opened
	 */
	private boolean createOutput(String dirPathName, String fileName){
		try {
			File f = new File(dirPathName, fileName)
			// buffered: the conversion does many small writes
			output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f, f.exists()), "UTF-8"))
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Closes the output writer, the XML parser and the input stream if they
	 * are set, then forgets them. Closing errors are printed.
	 *
	 * @return false if closing (and thus flushing) the output writer failed, true otherwise
	 */
	private boolean closeResources() {
		boolean outputOk = true;
		if (output != null) {
			try {
				output.close();
			} catch (IOException ex) {
				System.out.println("IOException while closing the output: " + ex);
				outputOk = false;
			}
			output = null;
		}
		if (parser != null) {
			try {
				parser.close();
			} catch (XMLStreamException ex) {
				System.out.println(ex);
			}
			parser = null;
		}
		if (inputData != null) {
			try {
				inputData.close();
			} catch (IOException ex) {
				System.out.println("IOException while closing " + url + ": " + ex);
			}
			inputData = null;
		}
		return outputOk;
	}

	/**
	 * Converts the source file into CQP format and appends the result to the
	 * output file.
	 *
	 * The "recueil" element is written as a "text" structure carrying the
	 * text, base and project of this instance. Each "w" element is written as
	 * one tab-separated line: form, id, l, lemme, pos, x, z. Any other element
	 * is copied as a start/end tag with its attributes. The whole content is
	 * wrapped in a "txmcorpus" structure carrying the language.
	 *
	 * The parser, the input stream and the writer are closed when the method
	 * returns, so an instance converts its file only once.
	 *
	 * @param dirPathName the path of the parent directory of the output file
	 * @param fileName the output file name, relative to dirPathName
	 * @return true, if successful; false if the parser or the output could
	 *         not be opened, or if a read/write error occurred
	 */
	public boolean transfomFileCqp(String dirPathName, String fileName)
	{
		if (parser == null) {
			System.out.println("Error: no XML parser available for " + url);
			closeResources();
			return false;
		}
		if (!createOutput(dirPathName, fileName)) {
			closeResources();
			return false;
		}

		String vForm = "";
		String wordid= "";
		String w_l= "";
		String w_lemme= "";
		String w_pos= "";
		String w_x= "";
		String w_z= "";

		// true while the parser is inside a "w" element
		boolean flagForm = false;
		boolean success = false;
		try
		{
			output.write("<txmcorpus lang=\""+lang+"\">\n");
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				switch (event)
				{
					case XMLStreamConstants.START_ELEMENT:
						switch (parser.getLocalName())
						{
							case "recueil":
								// use the values given to the constructor instead of hard-coded ones
								output.write("<text id=\""+text+"\" base=\""+base+"\" project=\""+project+"\">\n");
								break;
							case "w":
								flagForm= true;
								w_l = parser.getAttributeValue(null,"l");
								w_lemme = parser.getAttributeValue(null,"lemme");
								w_pos = parser.getAttributeValue(null,"pos");
								w_x= parser.getAttributeValue(null,"x");
								w_z= parser.getAttributeValue(null,"z");
								wordid= parser.getAttributeValue(null,"id");
								vForm ="";
								break;
							default:
								// NOTE(review): attribute values are written as-is, without XML escaping
								output.write("<"+parser.getLocalName())
								for (int i = 0; i < parser.getAttributeCount(); i++) {
									output.write(" "+parser.getAttributeLocalName(i)+"=\""+parser.getAttributeValue(i)+"\"")
								}
								output.write(">\n");
								break;
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						switch (parser.getLocalName())
						{
							case "recueil":
								output.write("</text>\n");
								break;
							case "w":
								flagForm= false;
								output.write(vForm+"\t"+wordid+"\t"+w_l+"\t"+w_lemme+"\t"+w_pos+"\t"+w_x+"\t"+w_z+"\n")
								break;
							default:
								output.write("</"+parser.getLocalName()+">\n");
								break;
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						// the form may come in several CHARACTERS events
						if (flagForm) {
							vForm += parser.getText().trim();
						}
						break;
				}
			}
			output.write("</txmcorpus>");
			success = true;
		}
		catch (XMLStreamException ex) {
			System.out.println(ex);
		}
		catch (IOException ex) {
			System.out.println("IOException while parsing " + url + ": " + ex);
		}
		finally {
			// always release the files; a failed flush of the output is a failure
			boolean closed = closeResources();
			success = success && closed;
		}

		return success;
	}

	/**
	 * Runs the compilation step: converts the files of the "txm"
	 * sub-directory into "cqp/fleurs.cqp", then encodes and indexes the
	 * corpus with CwbEncode and CwbMakeAll.
	 *
	 * The "cqp" sub-directory is deleted and re-created, the "registry"
	 * sub-directory is created if needed.
	 *
	 * @param rootDir the path of the directory containing the "txm" sub-directory
	 * @return true, if successful; false if the CWB executables are not
	 *         available, if the source directory cannot be listed, if the
	 *         registry file was not created or if the CWB calls threw an exception
	 */
	public boolean run(String rootDir)
	{
		if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
			println ("Error: CWB executables not well set.")
			return false;
		}
		//cleaning&preparing: deleteDir also removes a previous cqp/fleurs.cqp
		File cqpDir = new File(rootDir,"cqp/");
		cqpDir.deleteDir();
		cqpDir.mkdir();
		new File(rootDir,"registry/").mkdir();

		//1- Transform into CQP file
		File txmDir = new File(rootDir,"txm");
		File[] files = txmDir.listFiles();
		if (files == null) {
			println("Error: cannot list the files of "+txmDir)
			return false;
		}
		// listFiles() gives no ordering guarantee: sort to get a reproducible corpus
		Arrays.sort(files);
		for (File f : files) {
			if (!f.isFile()) {
				println("skipping "+f+ ": not a regular file")
				continue;
			}

			println("process file "+f)
			// the text id is the file name without its extension
			String name = f.getName();
			int dot = name.lastIndexOf(".");
			String txtname = (dot > 0) ? name.substring(0, dot) : name;
			// toURI().toURL() escapes the characters File.toURL() leaves as-is
			def builder = new compiler(f.toURI().toURL(), txtname, "fleurs", "default");
			builder.setLang lang
			if (!builder.transfomFileCqp(rootDir,"cqp/fleurs.cqp")) {
				println("Error: failed to convert "+f)
			}
		}

		//2- Import into CWB
		CwbEncode cwbEn = new CwbEncode();
		cwbEn.setDebug true;
		CwbMakeAll cwbMa = new CwbMakeAll();
		cwbMa.setDebug true;

		// word properties, in the column order written after the form by transfomFileCqp
		String[] pAttributes = ["id","l","lemme","pos","x","z"];
		String[] sAttributes = ["txmcorpus:0+lang", "text:0+id+base+project","poeme:0+titre+genre+type+section+tranche_chrono+date+annee+titre_abr+preorig+schema1+schema2+n","strophe:0+ordre+nb_vers+structure","div:0+type+structure","vers:0+numvers+tphon+stru+n_syl+n_pied+cesure+rime"];

		try {
			String regPath = rootDir + "/registry/"+"fleurs"
			// resolved with File so that it works whether or not rootDir ends with a separator
			String dataPath = new File(rootDir, "data").getPath()
			cwbEn.run(dataPath, rootDir + "/cqp/"+"fleurs.cqp", regPath,pAttributes, sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run("FLEURS", rootDir + "/registry");

		} catch (Exception ex) {System.out.println(ex); return false;}

		System.out.println("Done.")
		return true;
	}
}