Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / perrault / compiler.groovy @ 479

History | View | Annotate | Download (9.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.perrault
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32
import org.txm.importer.*;
33
import org.txm.scripts.*;
34
import org.txm.scripts.teitxm.BuildTTSrc;
35
import org.txm.scripts.teitxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37

    
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.util.HashMap;
42
import java.util.List;
43

    
44
/**
 * Builds the "perrault" CWB corpus from XML source files.
 *
 * One instance is bound to one source document (see the four-argument
 * constructor) and converts it, with {@link #transfomFileWtc}, into the
 * line-oriented input expected by cwb-encode: structural tags
 * (txmcorpus, text, p, q, l, s, moral) on their own lines and one
 * tab-separated line per {@code <w>} element. {@link #run} drives the
 * conversion of a list of files and then calls cwb-encode and cwb-makeall.
 */
class compiler
{
    /** Stream opened on {@link #url}; released by {@link #closeResources}. */
    private def inputData;

    /** StAX factory used to create {@link #parser}. */
    private def factory;

    /** Pull parser over the source document; stays null when the document could not be opened. */
    private XMLStreamReader parser;

    /** Not used by this class. */
    private def dir;

    /** Writer on the .wtc file, set by {@link #createOutput}. */
    private def output;

    /** Location of the source document. */
    private def url;

    /** Not used by this class. */
    private HashMap<String,String> anahash = new HashMap<String,String>();

    /** Text identifier: written as the {@code id} of the text tag and used as prefix of generated ids. */
    String text = "";

    /** Written as the {@code base} attribute of the text tag. */
    String base = "";

    /** Written as the {@code project} attribute of the text tag. */
    String project = "";

    /** Directory containing the cwb-encode and cwb-makeall executables. */
    String cwbLoc;

    /** Written as the {@code lang} attribute of the txmcorpus tag. */
    private String lang = "fr";

    /**
     * Creates an instance that is not bound to any document; only
     * {@link #run} is usable on it.
     */
    public compiler(){}

    /**
     * Creates an instance bound to one source document and opens a StAX
     * parser on it. Failures are reported on standard output and leave the
     * parser null, in which case {@link #transfomFileWtc} returns false.
     *
     * @param url the location of the XML document to convert
     * @param text the text identifier
     * @param base the value of the base attribute of the text tag
     * @param project the value of the project attribute of the text tag
     */
    public compiler(URL url, String text, String base, String project)
    {
        this.text = text;
        this.base = base;
        this.project = project;
        this.url = url;
        try {
            inputData = url.openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // the stream was opened but no parser owns it: release it now
            closeResources();
        } catch (IOException ex) {
            System.out.println("IOException while parsing " + url + ": " + ex);
        }
    }

    /**
     * Sets the language written in the txmcorpus tag.
     *
     * @param lang the language code
     * @return the assigned value (Groovy implicit return; the method declares no return type)
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Sets the directory of the CWB executables. A warning is printed when
     * the path does not exist, but the value is stored anyway.
     *
     * @param path the new cwb path; null resets it so that {@link #run} falls back to the Toolbox setting
     */
    public void setCwbPath(String path)
    {
        if (path != null && !new File(path).exists())
            System.err.println("CWB Path : "+path+" does not exists")
        cwbLoc = path;
    }

    /**
     * Opens the UTF-8 writer on the result file.
     *
     * @param dirPathName the directory of the result file
     * @param fileName the name of the result file
     * @return true, if successful
     */
    private boolean createOutput(String dirPathName, String fileName)
    {
        try {
            File target = new File(dirPathName, fileName);
            // append mode: run() sends every source file to the same .wtc file
            // (append on a missing file simply creates it)
            output = new OutputStreamWriter(new FileOutputStream(target, true), "UTF-8");
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            output = null;
            return false;
        }
    }

    /**
     * Closes the writer, the parser and the input stream, ignoring the ones
     * that were never opened. XMLStreamReader.close() does not close the
     * underlying stream, hence the explicit close of inputData.
     */
    private void closeResources()
    {
        try {
            if (output != null) output.close();
        } catch (IOException ex) {
            System.out.println(ex);
        }
        try {
            if (parser != null) parser.close();
        } catch (XMLStreamException ex) {
            System.out.println(ex);
        }
        try {
            if (inputData != null) inputData.close();
        } catch (IOException ex) {
            System.out.println(ex);
        }
        output = null;
        parser = null;
        inputData = null;
    }

    /**
     * Advances the parser just after the end tag of teiHeader, or to the
     * end of the document when there is no such element.
     */
    private void GoToText()
    {
        while (parser.hasNext()) {
            if (parser.next() == XMLStreamConstants.END_ELEMENT
                    && parser.getLocalName().equals("teiHeader"))
                return;
        }
    }

    /**
     * Converts the document bound to this instance and appends the result
     * to the given file. All resources are released before returning, so
     * the method can be called only once per instance.
     *
     * @param dirPathName the directory of the result file
     * @param fileName the name of the result file
     * @return true, if successful; false when the source or the result file
     *         could not be opened or when reading/writing failed
     */
    public boolean transfomFileWtc(String dirPathName, String fileName)
    {
        if (parser == null) {
            System.out.println("No XML parser available for " + url);
            closeResources();
            return false;
        }
        if (!createOutput(dirPathName, fileName)) {
            closeResources();
            return false;
        }

        String vAna = "";   // annotation values of the current word, each prefixed by a tab
        String vForm = "";  // form of the current word
        String wordid = ""; // id attribute of the last <w> that had one
        String vHead = "";  // character data collected inside a <head> located in a <lg>

        int p_id = 0;
        int q_id = 0;
        int lg_id = 0;
        int l_id = 0;

        boolean flaglg = false;    // inside <lg>
        boolean flaghead = false;  // inside <head>
        boolean flagForm = false;  // inside <form>
        boolean flagAna = false;   // inside <ana>
        boolean moralOpen = false; // a <moral> tag was written and not closed yet

        try {
            this.GoToText();
            output.write("<txmcorpus lang=\""+lang+"\">\n");

            while (parser.hasNext()) {
                int event = parser.next();
                switch (event)
                {
                    case XMLStreamConstants.START_ELEMENT:
                        switch (parser.getLocalName())
                        {
                            case "head":
                                flaghead = true;
                                vHead = "";
                                break;

                            case "text":
                                output.write("<text id=\""+text+"\" base=\""+base+"\" project=\""+project+"\">\n");
                                break;

                            case "p":
                                output.write("<p id=\""+text+"_p_"+(p_id++)+"\">\n");
                                break;

                            case "q":
                                output.write("<q id=\""+text+"_q_"+(q_id++)+"\">\n");
                                break;

                            case "l":
                                output.write("<l id=\""+text+"_l_"+(l_id++)+"\">\n");
                                break;

                            case "lg":
                                // the matching <moral> tag is written when its <head> ends
                                flaglg = true;
                                break;

                            case "s":
                                output.write("<s>\n");
                                break;

                            case "w":
                                for (int i = 0; i < parser.getAttributeCount(); i++) {
                                    if (parser.getAttributeLocalName(i).equals("id"))
                                        wordid = parser.getAttributeValue(i);
                                }
                                break;

                            case "form":
                                flagForm = true;
                                vForm = "";
                                vAna = "";
                                break;

                            case "ana":
                                flagAna = true;
                                break;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        switch (parser.getLocalName())
                        {
                            case "head":
                                flaghead = false;
                                if (flaglg) {
                                    output.write("<moral id=\""+text+"_moral_"+(lg_id++)+"\" head=\""+vHead+"\">\n");
                                    moralOpen = true;
                                }
                                break;

                            case "text":
                                output.write("</text>\n");
                                break;

                            case "p":
                            case "q":
                            case "l":
                                output.write("</"+parser.getLocalName()+">\n");
                                break;

                            case "lg":
                                // only close what was opened: a <lg> without <head> has no <moral>
                                if (moralOpen)
                                    output.write("</moral>\n");
                                moralOpen = false;
                                flaglg = false;
                                break;

                            case "s":
                                output.write("</s>\n");
                                break;

                            case "w":
                                // words of a <head> inside <lg> only feed the head attribute
                                if (!(flaghead && flaglg))
                                    output.write(vForm + vAna + "\t" + wordid + "\n");
                                vAna = "";
                                vForm = "";
                                break;

                            case "form":
                                flagForm = false;
                                break;

                            case "ana":
                                flagAna = false;
                                break;
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        String chars = parser.getText().trim();
                        if (flagForm)
                            vForm += chars;
                        if (flagAna)
                            vAna += "\t" + chars;
                        // NOTE(review): every text node under the head is appended without
                        // separator, annotation values included - confirm this is intended
                        if (flaghead && flaglg)
                            vHead += chars;
                        break;
                }
            }
            output.write("</txmcorpus>");
            return true;
        }
        catch (XMLStreamException ex) {
            System.out.println(ex);
            return false;
        }
        catch (IOException ex) {
            System.out.println("IOException while parsing " + inputData);
            return false;
        }
        finally {
            closeResources();
        }
    }

    /**
     * Returns the file name without its last extension; names without
     * extension are returned unchanged.
     */
    private static String stripExtension(String fileName)
    {
        int dot = fileName.lastIndexOf(".");
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /**
     * Converts the given files into &lt;root&gt;/wtc/perrault.wtc, where
     * &lt;root&gt; is the grand-parent directory of the first file, then
     * runs cwb-encode and cwb-makeall to build the PERRAULT corpus in
     * &lt;root&gt;/data and &lt;root&gt;/registry.
     *
     * @param files the source XML files
     * @return true, if successful
     */
    public boolean run(List<File> files)
    {
        if (files == null || files.isEmpty()) {
            println ("no source file to process")
            return false;
        }

        // absolute form first, so that relative files still have a grand-parent
        File rootFile = files.get(0).getAbsoluteFile().getParentFile()?.getParentFile();
        if (rootFile == null) {
            println ("binary directory does not exists: ")
            return false;
        }
        String rootDir = rootFile.getAbsolutePath()+"/";

        if (cwbLoc == null)
            cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

        if (!new File(cwbLoc).exists()) {
            println ("CWB path error: "+cwbLoc)
            return false;
        }
        if (!new File(rootDir).exists()) {
            println ("binary directory does not exists: "+rootDir)
            return false;
        }

        // cleaning & preparing
        new File(rootDir+"wtc/", "perrault.wtc").delete();
        new File(rootDir+"wtc/").deleteDir();
        new File(rootDir+"wtc/").mkdir();
        new File(rootDir+"registry/").mkdir();

        // 1- Transform into WTC file (every source is appended to the same file)
        for (File f : files) {
            if (!f.exists()) {
                println("file "+f+ " does not exists")
                continue;
            }
            println("process file "+f)
            String txtname = stripExtension(f.getName());
            def builder = new compiler(f.toURI().toURL(), txtname, "perrault", "default");
            builder.setLang(lang);
            if (!builder.transfomFileWtc(rootDir+"wtc", "perrault.wtc"))
                println("conversion failed for file "+f)
        }

        // 2- Import into CWB
        CwbEncode cwbEn = new CwbEncode();
        CwbMakeAll cwbMa = new CwbMakeAll();

        String[] pAttributes = ["pos","lemme","id"];
        String[] sAttributes = ["txmcorpus:0+lang","text:0+id+base+project","p:0+id","q:0+id","moral:0+head+id","l:0+id"];

        // setCwbPath() does not require a trailing separator: add it when missing
        String cwbDir = cwbLoc;
        if (!cwbDir.endsWith("/") && !cwbDir.endsWith(File.separator))
            cwbDir += File.separator;

        try {
            cwbEn.run(cwbDir + "cwb-encode", rootDir + "data", rootDir + "wtc/perrault.wtc", rootDir + "registry/perrault", pAttributes, sAttributes);
            cwbMa.run(cwbDir + "cwb-makeall", "PERRAULT", rootDir + "registry");
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        }

        System.out.println("Done.")

        return true;
    }

    /**
     * Builds the corpus from the files found in ~/xml/perrault/txm/.
     *
     * @param args the arguments (unused)
     */
    public static void main(String[] args)
    {
        // the JVM does not expand "~": resolve the home directory explicitly
        File dir = new File(System.getProperty("user.home"), "xml/perrault/txm/");
        File[] found = dir.listFiles();
        if (found == null) {
            println("cannot list source directory: "+dir)
            return;
        }
        // listFiles() gives no ordering guarantee: sort for a reproducible corpus
        Arrays.sort(found);
        new compiler().run(Arrays.asList(found));
    }
}