Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / perrault / compiler.groovy @ 966

History | View | Annotate | Download (9 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25 321 mdecorde
// $LastChangedRevision: 3400 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 321 mdecorde
package org.txm.importer.perrault
29 321 mdecorde
30 321 mdecorde
import org.txm.importer.cwb.CwbEncode
31 321 mdecorde
import org.txm.importer.cwb.CwbMakeAll
32 321 mdecorde
import org.txm.importer.*;
33 321 mdecorde
import org.txm.scripts.*;
34 927 mdecorde
import org.txm.importer.xmltxm.BuildTTSrc;
35 927 mdecorde
import org.txm.importer.xmltxm.*;
36 321 mdecorde
import org.txm.utils.treetagger.TreeTagger;
37 321 mdecorde
38 321 mdecorde
import javax.xml.stream.*;
39 321 mdecorde
import java.net.URL;
40 321 mdecorde
import java.io.File;
41 321 mdecorde
import java.util.HashMap;
42 321 mdecorde
import java.util.List;
43 321 mdecorde
44 321 mdecorde
// TODO: Auto-generated Javadoc
45 321 mdecorde
/**
 * Importer step of the "perrault" corpus.
 *
 * One instance reads one XML file with a StAX reader and appends its content,
 * converted to the CQP input format (one token per line, structures as tags),
 * to a CQP file ({@link #transfomFileCqp(String, String)}).
 * {@link #run(List)} converts a list of files and then calls
 * {@code CwbEncode} and {@code CwbMakeAll} on the result.
 */
class compiler
{
	/** Name of the CQP file that receives the content of all the texts. */
	private static final String CQP_FILE_NAME = "perrault.cqp";

	/** The stream opened on {@link #url}. */
	private def inputData;

	/** The StAX factory used to create {@link #parser}. */
	private def factory;

	/** The StAX reader of the input file; stays null if the constructor failed. */
	private XMLStreamReader parser;

	/** The dir (not used in this class). */
	private def dir;

	/** The writer of the CQP file, set by {@link #createOutput(String, String)}. */
	private def output;

	/** The url of the input file. */
	private def url;

	/** The anahash (not used in this class). */
	private HashMap<String,String> anahash =new HashMap<String,String>() ;

	/** The text identifier, written in the "id" attribute of the text structure. */
	String text="";

	/** The base name, written in the "base" attribute of the text structure. */
	String base="";

	/** The project name, written in the "project" attribute of the text structure. */
	String project="";

	/** The language written in the "lang" attribute of the txmcorpus structure. */
	private String lang ="fr";

	/**
	 * Creates a compiler without input; only useful to call {@link #run(List)}.
	 */
	public compiler(){}

	/**
	 * Instantiates a new compiler reading the given XML file.
	 *
	 * Failures are reported on the standard output and leave the parser
	 * unset, in which case {@link #transfomFileCqp(String, String)} returns false.
	 *
	 * @param url the url of the XML file to read
	 * @param text the text identifier
	 * @param base the base name
	 * @param project the project name
	 */
	public compiler(URL url,String text,String base, String project)
	{
		this.text = text
		this.base = base;
		this.project = project;
		this.url = url;
		try {
			inputData = url.openStream();

			factory = XMLInputFactory.newInstance();
			// without coalescing the reader may deliver the text of one element in
			// several CHARACTERS events, which would add extra tabs to the "ana" values
			factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
			parser = factory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			// the stream was opened but cannot be parsed: do not leak it
			closeQuietly(inputData);
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + url + ": " + ex);
		}
	}

	/**
	 * Closes a resource, ignoring null values and close failures.
	 *
	 * @param resource any object with a close() method, or null
	 */
	private static void closeQuietly(def resource)
	{
		try {
			resource?.close();
		} catch (Exception ignored) {
			// nothing useful can be done when a cleanup close fails
		}
	}

	/**
	 * set the language of the corpus.
	 *
	 * @param lang the lang
	 * @return the assigned language
	 */
	public setLang(String lang)
	{
		this.lang = lang;
	}

	/**
	 * Opens the UTF-8 writer of the CQP file.
	 *
	 * The file is opened in append mode when it already exists, so that
	 * successive texts are accumulated in the same file.
	 *
	 * @param dirPathName the dir path name
	 * @param fileName the file name
	 * @return true, if successful
	 */
	private boolean createOutput(String dirPathName, String fileName){
		try {
			File target = new File(dirPathName, fileName)
			boolean append = target.exists();
			output = new OutputStreamWriter(new FileOutputStream(target, append) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Moves the parser just after the end tag of the "teiHeader" element,
	 * or to the end of the document if there is no such element.
	 */
	private void GoToText()
	{
		while (parser.hasNext())
		{
			int event = parser.next();
			if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader"))
				return;
		}
	}

	/**
	 * Converts the XML file of this compiler and writes the result in the CQP file.
	 *
	 * The output is wrapped in a "txmcorpus" structure. "text", "p", "q", "l" and "s"
	 * elements are written as structures, "lg" elements as "moral" structures whose
	 * "head" attribute is the text of their "head" child, and each "w" element as
	 * one line: form, "ana" values and word id separated by tabulations.
	 * The reader, the writer and the input stream are always closed on return.
	 *
	 * @param dirPathName the dir path name
	 * @param fileName the file name
	 * @return true, if successful; false if the input could not be opened or read,
	 * or if the output could not be opened or written
	 */
	public boolean transfomFileCqp(String dirPathName, String fileName)
	{
		if (parser == null) { // the constructor failed or the no-arg constructor was used
			System.out.println("Error: no XML parser available for " + url);
			closeQuietly(inputData);
			return false;
		}
		if (!createOutput(dirPathName, fileName)) {
			closeQuietly(parser);
			closeQuietly(inputData);
			return false;
		}

		String vAna = "";
		String vForm = "";
		String wordid= "";
		String vHead = "";

		int p_id = 0;
		int q_id = 0;
		int lg_id = 0;
		int l_id = 0;

		boolean flaglg = false;
		boolean flaghead = false;
		boolean flagForm = false;
		boolean flagAna = false;

		try
		{
			this.GoToText()
			output.write("<txmcorpus lang=\""+lang+"\">\n");

			// hasNext() guards against calling next() after END_DOCUMENT,
			// which happens when the document has no "teiHeader"
			while (parser.hasNext())
			{
				int event = parser.next();
				switch (event)
				{
					case XMLStreamConstants.START_ELEMENT:
						String startName = parser.getLocalName();
						switch (startName)
						{
							case "head":
								flaghead = true;
								vHead = "";
								break;

							case "text":
								output.write("<text id=\""+text+"\" base=\""+base+"\" project=\""+project+"\">\n");
								break;

							case "p":
								output.write("<"+startName+" id=\""+text+"_p_"+(p_id++)+"\">\n");
								break;

							case "q":
								output.write("<"+startName+" id=\""+text+"_q_"+(q_id++)+"\">\n");
								break;

							case "l":
								output.write("<"+startName+" id=\""+text+"_l_"+(l_id++)+"\">\n");
								break;

							case "lg":
								// the "moral" start tag is written at the end of the "head" child
								flaglg = true;
								break;

							case "s":
								output.write( "<s>\n");
								break;

							case "w":
								// do not reuse the id of the previous word when this one has none
								wordid = "";
								for (int i = 0 ; i < parser.getAttributeCount(); i++)
									if (parser.getAttributeLocalName(i).equals("id"))
										wordid = parser.getAttributeValue(i);
								break;

							case "form":
								flagForm = true;
								vForm = "";
								vAna = "";
								break;

							case "ana":
								flagAna = true;
								break;
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						String endName = parser.getLocalName();
						switch (endName)
						{
							case "head":
								flaghead = false;
								// NOTE(review): vHead is written without escaping, and an "lg"
								// without "head" gets a "</moral>" with no start tag - confirm
								// that the source files never hit these cases
								if (flaglg)
									output.write("<moral id=\""+text+"_moral_"+(lg_id++)+"\" head=\""+vHead+"\">\n");
								break;

							case "text":
								output.write("</text>\n");
								break;

							case "p":
							case "q":
							case "l":
								output.write("</"+endName+">\n");
								break;

							case "lg":
								output.write("</moral>\n");
								flaglg = false;
								break;

							case "s":
								output.write( "</s>\n");
								break;

							case "w":
								// the words of the head of a "lg" only feed the "head" attribute
								if (!(flaghead && flaglg))
									output.write( vForm +vAna+"\t"+wordid+"\n");
								vAna = "";
								vForm = "";
								break;

							case "form":
								flagForm = false;
								break;

							case "ana":
								flagAna = false;
								break;
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						String chars = parser.getText().trim();
						if (flagForm)
							vForm += chars;
						if (flagAna)
							vAna += "\t" + chars; // each "ana" value is one more column
						if (flaghead && flaglg)
							vHead += chars;
						break;
				}
			}
			output.write("</txmcorpus>");
			// closed here so that a failure of the final flush is reported as an error
			output.close();
			return true;
		}
		catch (XMLStreamException ex) {
			System.out.println(ex);
			return false;
		}
		catch (IOException ex) {
			System.out.println("IOException while parsing " + url + ": " + ex);
			return false;
		}
		finally {
			closeQuietly(output);
			closeQuietly(parser);
			// closing the reader does not close the stream it reads
			closeQuietly(inputData);
		}
	}

	/**
	 * Converts the given XML files into one CQP file, then calls
	 * {@code CwbEncode} and {@code CwbMakeAll} on it.
	 *
	 * The working directory is the parent of the directory of the first file;
	 * its "cqp" sub-directory is deleted and recreated, and its "registry"
	 * sub-directory is created. Files that do not exist or cannot be converted
	 * are reported on the standard output and skipped.
	 *
	 * @param files the XML files to process
	 * @return true, if successful; false if there is no file, if the CWB
	 * executables or the working directory are missing, or if the CWB calls
	 * throw an exception
	 */
	public boolean run(List<File> files)
	{
		if (files == null || files.isEmpty()) {
			println ("Error: no input files to process.")
			return false;
		}

		// getAbsoluteFile() makes the parent lookups work with relative paths too
		File rootFile = files.get(0).getAbsoluteFile().getParentFile()?.getParentFile();
		if (rootFile == null) {
			println ("Error: cannot find the parent of the directory of "+files.get(0))
			return false;
		}
		String rootDir = rootFile.getAbsolutePath()+"/";

		if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
			println ("Error: CWB executables not well set.")
			return false;
		}
		if (!rootFile.exists())
		{
			println ("binary directory does not exists: "+rootDir)
			return false;
		}

		//cleaning&preparing
		File cqpDir = new File(rootDir+"cqp/");
		cqpDir.deleteDir();
		cqpDir.mkdir();
		new File(rootDir+"registry/").mkdir();

		//1- Transform into CQP file
		for (File f : files)
		{
			if (!f.exists())
			{
				println("file "+f+ " does not exists")
				continue;
			}

			println("process file "+f)
			// the text id is the file name without its extension, whatever its length
			String fileName = f.getName();
			int dot = fileName.lastIndexOf(".");
			String txtname = (dot > 0) ? fileName.substring(0, dot) : fileName;

			// File.toURL() is deprecated: it does not escape special characters
			def builder = new compiler(f.toURI().toURL(), txtname, "perrault", "default");
			builder.setLang(lang);
			if (!builder.transfomFileCqp(rootDir+"cqp", CQP_FILE_NAME))
				println("Error: failed to process file "+f)
		}

		//2- Import into CWB
		CwbEncode cwbEn = new CwbEncode();
		CwbMakeAll cwbMa = new CwbMakeAll();

		String[] pAttributes = ["pos","lemme","id"];
		String[] sAttributes = ["txmcorpus:0+lang","text:0+id+base+project","p:0+id","q:0+id","moral:0+head+id","l:0+id"];

		try
		{
			// NOTE(review): arguments are presumably the data directory, the CQP file
			// and the registry file - confirm against CwbEncode.run
			cwbEn.run(rootDir + "data", rootDir + "cqp/" + CQP_FILE_NAME, rootDir + "registry/"+"perrault",pAttributes, sAttributes);
			cwbMa.run("PERRAULT", rootDir + "registry");

		} catch (Exception ex) {System.out.println(ex); return false;}

		System.out.println("Done.")

		return true;
	}

	/**
	 * The main method: processes all the files of a directory.
	 *
	 * @param args the arguments; the optional first one is the directory to
	 * process, the default is "xml/perrault/txm" in the user home directory
	 */
	public static void main(String[] args)
	{
		// java.io.File does not expand "~": the home directory is resolved explicitly
		File dir = (args != null && args.length > 0) ? new File(args[0]) : new File(System.getProperty("user.home"), "xml/perrault/txm/");

		File[] found = dir.listFiles(); // null when dir is not a readable directory
		if (found == null) {
			System.out.println("Error: cannot list the files of "+dir);
			return;
		}
		// listFiles() gives no ordering guarantee: sort to get a reproducible corpus
		Arrays.sort(found);

		new compiler().run(Arrays.asList(found));
	}
}