Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxmpara / compiler.groovy @ 1000

History | View | Annotate | Download (11.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2011-06-03 12:37:57 +0200 (Fri, 03 Jun 2011) $
25
// $LastChangedRevision: 1867 $
26
// $LastChangedBy: mdecorde $ 
27
//
28

    
29

    
30
package org.txm.scripts.importer.xmltxmpara;
31

    
32
import java.util.ArrayList;
33

    
34
import org.txm.importer.cwb.BuildCwbEncodeArgs;
35
import org.txm.importer.cwb.CwbEncode
36
import org.txm.importer.cwb.CwbMakeAll
37
import org.txm.scripts.importer.*;
38
import org.txm.scripts.*;
39
import org.txm.importer.scripts.xmltxm.BuildTTSrc;
40
import org.txm.importer.scripts.xmltxm.*;
41
import org.txm.utils.treetagger.TreeTagger;
42

    
43
import javax.xml.stream.*;
44
import java.net.URL;
45
import java.io.File;
46
import java.util.HashMap;
47
import java.util.List;
48

    
49
// TODO: Auto-generated Javadoc
50
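/**
 * Builds a CWB corpus from XML-TXM files: each file is appended to a single
 * CQP source file, which is then indexed through the CwbEncode and
 * CwbMakeAll wrappers (see run()).
 */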
53
class compiler 
54
{
55
        
56
        /** Debug flag forwarded to the CwbEncode and CwbMakeAll wrappers by run(). */
        private boolean debug = false

        /** Stream opened on the source URL by the URL-based constructor. */
        private def inputData

        /** StAX input factory that created the parser. */
        private def factory

        /** StAX reader positioned on the source document. */
        private XMLStreamReader parser

        /** Not read or written by any method visible in this class. */
        private def dir

        /** Writer on the CQP file; (re)opened by createOutput(). */
        private Writer output

        /** URL of the source document, as given to the constructor. */
        private def url

        /** Text identifier: written as the id of the text structure and used to build generated word ids. */
        String text = ""

        /** Value written as the base attribute of the text structure. */
        String base = ""

        /** Value written as the project attribute of the text structure. */
        String project = ""

        /** Never assigned a non-null value by the code visible in this class. */
        String[] textAttributes = null

        /** Corpus language, written as the lang attribute of the txmcorpus element. */
        private String lang = "fr"

        /** Word annotation types met so far; static, (re)initialised by run() and filled by transfomFileCqp(). */
        private static ArrayList<String> anatypes

        /** Structure name -> attribute names met so far; static, (re)initialised by run(). */
        private static HashMap<String, List<String>> sAttribs
97
        
98
        /**
         * Creates a compiler that is not bound to any input document.
         * No field is initialised beyond its declared default.
         */
        public compiler()
        {
        }
103
        
104
        /**
         * Creates a compiler bound to one source document and opens a StAX
         * reader on it.
         *
         * Failures to open the stream or to create the reader are reported on
         * the console and not rethrown; in that case the parser field stays null.
         *
         * @param url the location of the XML document to read
         * @param text the text identifier
         * @param base the value of the base attribute written on the text structure
         * @param project the value of the project attribute written on the text structure
         */
        public compiler(URL url, String text, String base, String project)
        {
                this.text = text
                this.base = base
                this.project = project
                this.url = url
                // The former "this.textAttributes = textAttributes" was a self-assignment
                // (there is no such parameter) and has been dropped.
                try {
                        inputData = url.openStream()
                        factory = XMLInputFactory.newInstance()
                        parser = factory.createXMLStreamReader(inputData)
                } catch (XMLStreamException ex) {
                        System.out.println(ex)
                        // The reader could not be created: do not leave the stream open.
                        try {
                                if (inputData != null)
                                        inputData.close()
                        } catch (IOException ignored) {
                        }
                } catch (IOException ex) {
                        // Say which document failed and why.
                        System.err.println("IOException while parsing " + url + ": " + ex)
                }
        }
130
        
131
        /**
         * Sets the language of the corpus.
         * run() writes this value as the lang attribute of the txmcorpus element.
         *
         * @param lang the language value to store
         * @return untyped: the method declares no return type
         */
        public setLang(String lang) {
                this.lang = lang
        }
141
        
142
        /**
         * Opens the output writer on a file, encoded in UTF-8.
         * When the file already exists it is opened in append mode, otherwise
         * it is created. Any exception is printed on the error stream.
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if an exception occurred
         */
        private boolean createOutput(File f)
        {
                boolean opened = false
                try {
                        // Append only when there is already something to append to.
                        boolean append = f.exists()
                        def stream = new FileOutputStream(f, append)
                        output = new OutputStreamWriter(stream, "UTF-8")
                        opened = true
                } catch (Exception e) {
                        System.err.println(e)
                }
                return opened
        }
159
        
160
        /**
         * Advances the parser until the end tag of the teiHeader element has
         * been read, or until the end of the document if there is none.
         */
        private void GoToText()
        {
                int event = parser.next()
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean closingTag = (event == XMLStreamConstants.END_ELEMENT)
                        if (closingTag && parser.getLocalName().equals("teiHeader")) {
                                return
                        }
                        event = parser.next()
                }
        }
172
        
173
        /**
         * Appends the content of the document bound to this compiler to a CQP
         * source file.
         *
         * Everything up to the end of the teiHeader is skipped. After that:
         * <ul>
         * <li>each w element produces one line: the escaped form, the word id
         * and one tab-separated value per form/ana annotation;</li>
         * <li>any other element met outside a w is written as a lowercased
         * start/end tag with its attributes; the text element gets the id,
         * base and project values of this compiler instead of its own id;</li>
         * <li>processing stops after the end tag of the text element.</li>
         * </ul>
         * The annotation types and structure attributes met are recorded in the
         * static anatypes and sAttribs collections, which must have been
         * initialised beforehand (run() does it).
         *
         * @param cqpfile the CQP file to append to
         * @return true if the document was written, false if the input or the
         *         output could not be opened or an XML/IO error occurred
         */
        public boolean transfomFileCqp(File cqpfile)
        {
                // The constructor leaves parser null when the input could not be opened.
                if (parser == null) {
                        System.out.println("Error: no XML parser available for " + url)
                        closeInput()
                        return false
                }
                // Without an output writer every write below would throw a NPE.
                if (!createOutput(cqpfile)) {
                        closeInput()
                        return false
                }

                String vAna = ""
                String vForm = ""
                String wordid = ""
                boolean flagForm = false
                boolean flagAna = false
                boolean inW = false
                int wcounter = 1

                try {
                        GoToText()
                        // GoToText() stops at the end of the document when there is no
                        // teiHeader; calling next() again would then throw.
                        if (!parser.hasNext()) {
                                System.out.println("Error: no teiHeader end tag found in " + url)
                                return false
                        }

                        boolean stop = false
                        while (!stop && parser.hasNext()) {
                                int event = parser.next()
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                String opened = parser.getLocalName()
                                                switch (opened) {
                                                        case "w":
                                                                inW = true
                                                                wordid = parser.getAttributeValue(null, "id")
                                                                if (wordid == null)
                                                                        wordid = "w_" + text + "_" + (wcounter++)
                                                                vAna = ""
                                                                break

                                                        case "form":
                                                                String formType = parser.getAttributeValue(null, "type")
                                                                if (formType == null || formType.equals("default")) {
                                                                        flagForm = true
                                                                } else {
                                                                        // a typed form is stored as an annotation column
                                                                        flagAna = true
                                                                        vAna += "\t"
                                                                        if (!anatypes.contains(formType))
                                                                                anatypes << formType
                                                                }
                                                                // NOTE(review): the form is reset for typed forms too, as before
                                                                vForm = ""
                                                                break

                                                        case "ana":
                                                                flagAna = true
                                                                vAna += "\t"
                                                                String anaType = parser.getAttributeValue(null, "type")
                                                                if (anaType != null) {
                                                                        if (anaType.startsWith("#"))
                                                                                anaType = anaType.substring(1)
                                                                        if (!anatypes.contains(anaType))
                                                                                anatypes << anaType
                                                                }
                                                                break

                                                        default:
                                                                if (!inW)
                                                                        writeStructureStart(opened)
                                                }
                                                break

                                        case XMLStreamConstants.END_ELEMENT:
                                                String closed = parser.getLocalName()
                                                switch (closed) {
                                                        case "w":
                                                                output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") + "\t" + wordid + vAna + "\n")
                                                                vAna = ""
                                                                vForm = ""
                                                                inW = false
                                                                break

                                                        case "form":
                                                                flagForm = false
                                                                flagAna = false
                                                                break

                                                        case "ana":
                                                                flagAna = false
                                                                break

                                                        default:
                                                                if (!inW)
                                                                        output.write("</" + closed.toLowerCase() + ">\n")
                                                                if (closed == "text")
                                                                        stop = true
                                                }
                                                break

                                        case XMLStreamConstants.CHARACTERS:
                                                if (inW) {
                                                        String chars = parser.getText().trim()
                                                        // Annotation text wins over form text; the text is
                                                        // appended once (it used to be appended twice to
                                                        // vAna when both flags were set).
                                                        if (flagAna)
                                                                vAna += chars
                                                        else if (flagForm)
                                                                vForm += chars
                                                }
                                                break
                                }
                        }

                        // Closed here so that a failing flush is reported as an error.
                        output.close()
                } catch (XMLStreamException ex) {
                        System.out.println(ex)
                        return false
                } catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex)
                        return false
                } finally {
                        // Best-effort cleanup on every path; closing an already closed
                        // writer has no effect.
                        try {
                                output.close()
                        } catch (IOException ignored) {
                        }
                        closeInput()
                }
                return true
        }

        /**
         * Writes the start tag of a structure (any element other than w, form
         * and ana) and records its attribute names in sAttribs.
         *
         * @param localname the local name of the element the parser is on
         */
        private void writeStructureStart(String localname)
        {
                String tag = localname.toLowerCase()
                boolean isText = (localname == "text")

                output.write("<" + tag)
                // The key is the lowercased name: testing the raw name used to reset
                // the attribute list of mixed-case elements at each occurrence.
                if (!sAttribs.containsKey(tag))
                        sAttribs.put(tag, [])
                List<String> known = sAttribs.get(tag)

                for (int i = 0; i < parser.getAttributeCount(); i++) {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase()
                        String attrvalue = parser.getAttributeValue(i)
                        // the id of the text element is replaced below
                        if (!(isText && attrname == "id"))
                                output.write(" " + attrname + "=\"" + attrvalue.replace("\"", "'") + "\"")

                        if (!known.contains(attrname))
                                known.add(attrname)
                }

                if (isText) {
                        output.write(" id=\"" + text + "\" base=\"" + base + "\" project=\"" + project + "\"")
                }
                output.write(">\n")
        }

        /** Closes the parser and the underlying input stream, ignoring errors. */
        private void closeInput()
        {
                try {
                        if (parser != null)
                                parser.close()
                } catch (XMLStreamException ignored) {
                }
                try {
                        // XMLStreamReader.close() does not close the underlying stream.
                        if (inputData != null)
                                inputData.close()
                } catch (IOException ignored) {
                }
        }
338
        
339
        
340
        
341
        /**
         * Builds the CWB corpus: converts the given files into one CQP source
         * file, then runs CwbEncode and CwbMakeAll on it.
         *
         * The CQP file is binDir/cqp/&lt;corpusname lowercased&gt;.cqp; the data and
         * registry paths given to the CWB wrappers are binDir/data/&lt;corpusname&gt;/
         * and binDir/registry/&lt;corpusname lowercased&gt;. Files that do not exist
         * are reported and skipped.
         *
         * @param files the XML-TXM files to process
         * @param binDir the existing directory under which cqp, data and registry are located
         * @param corpusname the name of the corpus
         * @param basename the value written as the base attribute of each text
         * @param textAttributes not used by this method
         * @return true if the corpus was built, false otherwise
         */
        public boolean run(ArrayList<File> files, File binDir, String corpusname, String basename, String[] textAttributes)
        {
                // Static: shared with the per-file compilers created below.
                anatypes = new ArrayList<String>()
                sAttribs = new HashMap<String, List<String>>()
                String rootDir = binDir.getAbsolutePath()

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false
                }
                if (!binDir.exists()) {
                        println ("binary directory does not exists: "+binDir)
                        return false
                }
                if (files == null || files.size() == 0) {
                        println "Error: no file to process"
                        return false
                }

                File cqpdir = new File(binDir, "cqp")
                File cqpfile = new File(cqpdir, corpusname.toLowerCase() + ".cqp")
                cqpdir.mkdirs()
                // createOutput() appends to an existing file: a CQP file left by a
                // previous import must be removed or the new corpus is appended to it.
                if (cqpfile.exists() && !cqpfile.delete()) {
                        println "Error: could not delete the previous CQP file: " + cqpfile
                        return false
                }

                //0 set Lang
                if (!appendToCqpFile(cqpfile, "<txmcorpus lang=\"" + lang + "\">\n"))
                        return false

                //1- Transform into CQP file
                for (File f : files) {
                        if (!f.exists()) {
                                println("The file "+f+ " does not exists")
                                continue
                        }
                        // Strip the 4-character extension; shorter names are kept whole
                        // instead of throwing.
                        String filename = f.getName()
                        String txtname = (filename.length() >= 4) ? filename.substring(0, filename.length() - 4) : filename
                        // File.toURL() is deprecated and does not escape special characters.
                        def builder = new compiler(f.toURI().toURL(), txtname, basename, "default")
                        builder.setLang(lang)
                        if (!builder.transfomFileCqp(cqpfile))
                                return false
                }

                //end corpus
                if (!appendToCqpFile(cqpfile, "</txmcorpus>\n"))
                        return false

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode()
                cwbEn.setDebug(debug)
                CwbMakeAll cwbMa = new CwbMakeAll()
                cwbMa.setDebug(debug)

                // Word properties: the id first, then every annotation type met.
                List<String> pargs = ["id"]
                pargs.addAll(anatypes)

                // Structures: make sure text and txmcorpus carry the attributes
                // written by this compiler even if no source file declared them.
                List<String> sargs = []
                if (sAttribs.containsKey("text")) {
                        for (String required : ["id", "base", "project"]) {
                                if (!sAttribs.get("text").contains(required))
                                        sAttribs.get("text").add(required)
                        }
                } else {
                        sargs.add("text:0+id+base+project")
                }

                if (sAttribs.containsKey("txmcorpus")) {
                        if (!sAttribs.get("txmcorpus").contains("lang"))
                                sAttribs.get("txmcorpus").add("lang")
                } else {
                        sargs.add("txmcorpus:0+lang")
                }

                for (String tag : sAttribs.keySet()) {
                        String sAttr = tag
                        if (sAttribs.get(tag).size() > 0)
                                sAttr += ":"
                        for (String attr : sAttribs.get(tag))
                                sAttr += "+" + attr
                        sargs.add(sAttr)
                }

                String[] sAttributes = sargs
                String[] pAttributes = pargs
                println "Corpus structural attributes: "+sAttributes
                println "Corpus lexical attributes: "+pAttributes
                try {
                        String regPath = rootDir + "/registry/" + corpusname.toLowerCase()
                        cwbEn.run(
                                rootDir + "/data/" + corpusname + "/",
                                cqpfile.getAbsolutePath(),
                                regPath, pAttributes, sAttributes)
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false
                        }
                        cwbMa.run(corpusname.toUpperCase(), rootDir + "/registry")
                } catch (Exception ex) {
                        System.out.println(ex)
                        return false
                }

                System.out.println("Done.")

                return true
        }

        /**
         * Appends a string to the CQP file and closes the writer.
         *
         * @param cqpfile the CQP file
         * @param content the text to append
         * @return false if the file could not be opened
         */
        private boolean appendToCqpFile(File cqpfile, String content)
        {
                if (!createOutput(cqpfile)) {
                        println "Error: could not write in the CQP file: " + cqpfile
                        return false
                }
                try {
                        output.write(content)
                } finally {
                        output.close()
                }
                return true
        }
470
        
471
        /**
         * Turns the debug flag on. run() forwards the flag to the CwbEncode
         * and CwbMakeAll wrappers.
         */
        public void setDebug() {
                debug = true
        }
478
        
479
        /**
         * Manual test entry point: builds the "geo" corpus from the XML files
         * found in the xml/geo directory of the user's home.
         *
         * @param args not used
         */
        public static void main(String[] args)
        {
                // java.io.File does not expand "~": resolve against user.home.
                File home = new File(System.getProperty("user.home"))
                File dir = new File(home, "xml/geo")

                // NOTE(review): the former call run(dir, "geo") matches no existing
                // signature; it is assumed here that the source files sit directly in
                // dir and that dir is also the binary directory - confirm.
                ArrayList<File> files = new ArrayList<File>()
                File[] candidates = dir.listFiles()
                if (candidates != null) {
                        for (File f : candidates) {
                                if (f.isFile() && f.getName().endsWith(".xml"))
                                        files.add(f)
                        }
                }
                Collections.sort(files)

                def c = new compiler()
                c.setDebug()
                // The former c.setCwbPath(...) call is gone: this class defines no such
                // method, and run() only checks CwbEncode/CwbMakeAll.isExecutableAvailable().
                if (!c.run(files, dir, "geo", "geo", null))
                        System.err.println("Import failed for " + dir)
        }
492
}