Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / perrault / compiler.groovy @ 1000

History | View | Annotate | Download (9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.perrault
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32
import org.txm.scripts.importer.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.scripts.xmltxm.BuildTTSrc;
35
import org.txm.importer.scripts.xmltxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37

    
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.util.HashMap;
42
import java.util.List;
43

    
44
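/**
 * Importer for the Perrault corpus: converts TEI source files into a single
 * CQP source file ("perrault.cqp") and indexes it with the CWB tools
 * (see the run method).
 */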
48
class compiler 
49
{
50
        
51
        /** Stream opened on {@link #url} by the four-argument constructor. */
        private def inputData;

        /** StAX factory used to create {@link #parser}. */
        private def factory;

        /** StAX reader over {@link #inputData}. */
        private XMLStreamReader parser;

        /** Not referenced by the code of this class. */
        private def dir;

        /** UTF-8 writer on the CQP file, set by createOutput. */
        private def output;

        /** Location of the XML document to convert. */
        private def url;

        /** Not referenced by the code of this class. */
        private HashMap<String,String> anahash = new HashMap<String,String>();

        /** Text identifier: value of text@id and prefix of the generated element ids. */
        String text = "";

        /** Value written in the "base" attribute of the text element. */
        String base = "";

        /** Value written in the "project" attribute of the text element. */
        String project = "";

        /** Value written in the "lang" attribute of the txmcorpus element. */
        private String lang = "fr";
83
        
84
        /**
         * Creates a compiler that is not bound to any document; every field
         * keeps its default value.
         */
        public compiler()
        {
        }
89
        
90
        /**
         * Instantiates a new compiler bound to one XML document.
         *
         * The document is opened immediately and a StAX reader is created on it.
         * Failures are reported on standard output and leave the parser unset;
         * no exception is propagated to the caller.
         *
         * @param url the location of the XML document to read
         * @param text the text identifier
         * @param base the value of the "base" attribute of the text element
         * @param project the value of the "project" attribute of the text element
         */
        public compiler(URL url,String text,String base, String project)
        {
                this.text = text
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while opening " + url + ": " + ex);
                        // the stream was opened but cannot be parsed: do not leak it
                        try {
                                inputData.close();
                        } catch (IOException ignored) {
                                // nothing more can be done for a stream that is being discarded
                        }
                        inputData = null;
                } catch (IOException ex) {
                        // report which document failed and why
                        System.out.println("IOException while opening " + url + ": " + ex);
                }
        }
115
        
116
        /**
         * Sets the language code written in the "lang" attribute of the
         * txmcorpus element.
         *
         * @param lang the language code
         * @return the value that was assigned
         */
        public setLang(String lang)
        {
                return (this.lang = lang);
        }
126
        
127
        /**
         * Opens the UTF-8 writer used to emit the CQP source. When the target
         * file already exists the writer appends to it, otherwise the file is
         * created.
         *
         * @param dirPathName the directory of the target file
         * @param fileName the name of the target file
         * @return true if the writer was opened, false if any exception occurred
         */
        private boolean createOutput(String dirPathName, String fileName)
        {
                boolean created;
                try
                {
                        File target = new File(dirPathName, fileName);
                        boolean append = target.exists();
                        FileOutputStream stream = new FileOutputStream(target, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                        created = true;
                }
                catch (Exception e)
                {
                        System.out.println(e.getLocalizedMessage());
                        created = false;
                }
                return created;
        }
144
        
145
        /**
         * Advances the parser until the end tag of the "teiHeader" element has
         * been read, or until the end of the document if there is none.
         */
        private void GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT)
                {
                        boolean closing = (event == XMLStreamConstants.END_ELEMENT);
                        if (closing && parser.getLocalName().equals("teiHeader"))
                        {
                                return;
                        }
                        event = parser.next();
                }
        }
157
        
158
        /**
         * Converts the document read by this compiler into CQP source lines and
         * appends them to the given file.
         *
         * Everything up to the end of the "teiHeader" element is skipped. The
         * remaining content is written inside a txmcorpus element:
         * "text", "p", "q", "l" and "s" elements are copied as structure tags
         * (with generated ids for p, q and l); inside an "lg" element the content
         * of a "head" is not written as words but becomes the "head" attribute of
         * a "moral" tag; each "w" element produces one line made of the "form"
         * text, the text of every "ana" element and the value of the "id"
         * attribute of the word, separated by tabulations.
         *
         * The writer, the parser and the input stream are always closed when the
         * method returns.
         *
         * @param dirPathName the directory of the CQP file
         * @param fileName the name of the CQP file
         * @return true if the whole document was written, false if the parser or
         *         the output could not be opened or if an exception occurred
         */
        public boolean transfomFileCqp(String dirPathName, String fileName)
        {
                String vAna = "";
                String vForm = "";
                String wordid = "";
                String vHead = "";

                int p_id = 0;
                int q_id = 0;
                int lg_id = 0;
                int l_id = 0;

                boolean flaglg = false;
                boolean flaghead = false;
                boolean flagForm = false;
                boolean flagAna = false;
                // true while a <moral> tag is open, so that </moral> is only written when needed
                boolean moralOpen = false;

                try
                {
                        if (parser == null)
                        {
                                // the constructor failed to open or parse the document
                                System.out.println("Cannot transform " + url + ": the XML parser was not created");
                                return false;
                        }
                        if (!createOutput(dirPathName, fileName))
                        {
                                return false;
                        }

                        this.GoToText();
                        output.write("<txmcorpus lang=\""+lang+"\">\n");

                        // hasNext() protects against reading past END_DOCUMENT when
                        // the document has no teiHeader and GoToText consumed it all
                        while (parser.hasNext())
                        {
                                int event = parser.next();
                                switch (event)
                                {
                                        case XMLStreamConstants.START_ELEMENT:
                                        switch (parser.getLocalName())
                                        {
                                                case "head":
                                                flaghead = true;
                                                vHead = "";
                                                break;

                                                case "text":
                                                output.write("<text id=\""+text+"\" base=\""+base+"\" project=\""+project+"\">\n");
                                                break;

                                                case "p":
                                                output.write("<"+parser.getLocalName()+" id=\""+text+"_p_"+(p_id++)+"\">\n");
                                                break;

                                                case "q":
                                                output.write("<"+parser.getLocalName()+" id=\""+text+"_q_"+(q_id++)+"\">\n");
                                                break;

                                                case "l":
                                                output.write("<"+parser.getLocalName()+" id=\""+text+"_l_"+(l_id++)+"\">\n");
                                                break;

                                                case "lg":
                                                flaglg = true;
                                                break;

                                                case "s":
                                                output.write("<s>\n");
                                                break;

                                                case "w":
                                                for (int i = 0; i < parser.getAttributeCount(); i++)
                                                {
                                                        if (parser.getAttributeLocalName(i).equals("id"))
                                                        {
                                                                wordid = parser.getAttributeValue(i);
                                                        }
                                                }
                                                break;

                                                case "form":
                                                flagForm = true;
                                                vForm = "";
                                                vAna = "";
                                                break;

                                                case "ana":
                                                flagAna = true;
                                                break;
                                        }
                                        break;

                                        case XMLStreamConstants.END_ELEMENT:
                                        switch (parser.getLocalName())
                                        {
                                                case "head":
                                                flaghead = false;
                                                if (flaglg)
                                                {
                                                        // the head collected inside an lg becomes an attribute of the moral tag
                                                        output.write("<moral id=\""+text+"_moral_"+(lg_id++)+"\" head=\""+vHead+"\">\n");
                                                        moralOpen = true;
                                                }
                                                break;

                                                case "text":
                                                output.write("</text>\n");
                                                break;

                                                case "p":
                                                case "q":
                                                case "l":
                                                output.write("</"+parser.getLocalName()+">\n");
                                                break;

                                                case "lg":
                                                // an lg without head never opened a moral tag
                                                if (moralOpen)
                                                {
                                                        output.write("</moral>\n");
                                                        moralOpen = false;
                                                }
                                                flaglg = false;
                                                break;

                                                case "s":
                                                output.write("</s>\n");
                                                break;

                                                case "w":
                                                // words of a head inside an lg are kept out of the token stream
                                                if (!(flaghead && flaglg))
                                                {
                                                        output.write(vForm+vAna+"\t"+wordid+"\n");
                                                }
                                                vAna = "";
                                                vForm = "";
                                                break;

                                                case "form":
                                                flagForm = false;
                                                break;

                                                case "ana":
                                                flagAna = false;
                                                break;
                                        }
                                        break;

                                        case XMLStreamConstants.CHARACTERS:
                                        if (flagForm)
                                        {
                                                vForm += parser.getText().trim();
                                        }
                                        if (flagAna)
                                        {
                                                vAna += "\t"+parser.getText().trim();
                                        }
                                        if (flaghead && flaglg)
                                        {
                                                vHead += parser.getText().trim();
                                        }
                                        break;
                                }
                        }
                        // the newline keeps the next text appended to the same file on its own line
                        output.write("</txmcorpus>\n");
                        // flush here so that a write failure is reported as a failure of the conversion
                        output.flush();
                        return true;
                }
                catch (XMLStreamException ex) {
                        System.out.println(ex);
                        return false;
                }
                catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                        return false;
                }
                finally {
                        // release every resource, whatever happened above
                        try {
                                if (output != null) output.close();
                        } catch (IOException ex) {
                                System.out.println(ex);
                        }
                        try {
                                if (parser != null) parser.close();
                        } catch (XMLStreamException ex) {
                                System.out.println(ex);
                        }
                        try {
                                // XMLStreamReader.close() does not close the underlying stream
                                if (inputData != null) inputData.close();
                        } catch (IOException ex) {
                                System.out.println(ex);
                        }
                }
        }
314
        
315
        /**
         * Builds the PERRAULT CWB corpus from a list of XML files.
         *
         * The corpus directory is the grand-parent directory of the first file.
         * Its "cqp" directory is emptied, then each existing file is converted
         * and appended to "cqp/perrault.cqp" (the text identifier is the file
         * name without its extension), and finally the CQP file is encoded and
         * indexed with CwbEncode and CwbMakeAll using the "data" and "registry"
         * directories of the corpus directory.
         *
         * @param files the XML files to import
         * @return true if the CWB encoding and indexing ran without exception;
         *         false if there is no file, if the corpus directory cannot be
         *         resolved or does not exist, if the CWB executables are not
         *         available or if a CWB step threw an exception
         */
        public boolean run(List<File> files)
        {
                if (files == null || files.isEmpty())
                {
                        println ("Error: no source file to process.")
                        return false;
                }

                // getAbsoluteFile() makes relative paths resolvable up to the grand-parent
                File parentDir = files.get(0).getAbsoluteFile().getParentFile();
                File rootFile = (parentDir == null) ? null : parentDir.getParentFile();
                if (rootFile == null)
                {
                        println ("Error: cannot resolve the corpus directory of "+files.get(0))
                        return false;
                }
                String rootDir = rootFile.getAbsolutePath()+"/";

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }
                if (!new File(rootDir).exists())
                {
                        println ("binary directory does not exists: "+rootDir)
                        return false;
                }
                new File(rootDir+"cqp/","perrault.cqp").delete();//cleaning&preparing
                new File(rootDir+"cqp/").deleteDir();
                new File(rootDir+"cqp/").mkdir();
                new File(rootDir+"registry/").mkdir();

                //1- Transform into CQP file
                for (File f : files)
                {
                        if (!f.exists())
                        {
                                println("file "+f+ " does not exists")
                                continue;
                        }

                        println("process file "+f)
                        // text id = file name without its extension, whatever the extension length
                        String fileName = f.getName();
                        int dot = fileName.lastIndexOf(".");
                        String txtname = (dot > 0) ? fileName.substring(0, dot) : fileName;

                        // toURI().toURL() escapes characters that File.toURL() leaves as is
                        def builder = new compiler(f.toURI().toURL(), txtname, "perrault", "default");
                        builder.setLang(lang);
                        if (!builder.transfomFileCqp(rootDir+"cqp","perrault.cqp"))
                        {
                                println("Warning: file "+f+" could not be converted")
                        }
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                CwbMakeAll cwbMa = new CwbMakeAll();

                String[] pAttributes = ["pos","lemme","id"];
                String[] sAttributes = ["txmcorpus:0+lang","text:0+id+base+project","p:0+id","q:0+id","moral:0+head+id","l:0+id"];

                try
                {
                        cwbEn.run(rootDir + "data", rootDir + "/cqp/"+"perrault.cqp", rootDir + "registry/"+"perrault",pAttributes, sAttributes);
                        cwbMa.run("PERRAULT", rootDir + "registry");
                } catch (Exception ex) {
                        System.out.println(ex);
                        return false;
                }

                System.out.println("Done.")

                return true;
        }
383
        
384
        /**
         * Runs the import on every file of a directory.
         *
         * @param args optional: args[0] is the directory containing the XML
         *        files; when absent, "xml/perrault/txm/" in the user home
         *        directory is used
         */
        public static void main(String[] args)
        {
                // "~" is not expanded by java.io.File, so the home directory is resolved explicitly
                String path = (args != null && args.length > 0) ? args[0] : System.getProperty("user.home") + "/xml/perrault/txm/";
                File dir = new File(path);

                // listFiles() returns null when the path is not a readable directory
                File[] found = dir.listFiles();
                if (found == null)
                {
                        println("Error: cannot list the files of "+dir)
                        return;
                }

                List<File> files = Arrays.asList(found);
                new compiler().run(files);
        }
395
}