Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxmpara / compiler.groovy @ 1688

History | View | Annotate | Download (11.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2011-06-03 12:37:57 +0200 (Fri, 03 Jun 2011) $
25
// $LastChangedRevision: 1867 $
26
// $LastChangedBy: mdecorde $ 
27
//
28

    
29

    
30
package org.txm.scripts.importer.xmltxmpara;
31

    
32
import java.util.ArrayList;
33

    
34
import org.txm.importer.cwb.BuildCwbEncodeArgs;
35
import org.txm.importer.cwb.CwbEncode
36
import org.txm.importer.cwb.CwbMakeAll
37
import org.txm.scripts.importer.*;
38
import org.txm.scripts.*;
39
import org.txm.importer.scripts.xmltxm.BuildTTSrc;
40
import org.txm.importer.scripts.xmltxm.*;
41
import org.txm.utils.treetagger.TreeTagger;
42

    
43
import javax.xml.stream.*;
44
import java.net.URL;
45
import java.io.File;
46
import java.util.HashMap;
47
import java.util.List;
48

    
49
// TODO: Auto-generated Javadoc
50
/**
51
 * Compiles XML-TXM files into one CQP source file and indexes it with the CWB tools (cwb-encode, cwb-makeall).
52
 */
53
class compiler 
54
{
55
        
56
        /** The debug. */
57
        private boolean debug= false;
58
        
59
        /** The input data. */
60
        private def inputData;
61
        
62
        /** The factory. */
63
        private def factory;
64
        
65
        /** The parser. */
66
        private XMLStreamReader parser;
67
        
68
        /** The dir. */
69
        private def dir;
70
        
71
        /** The output. */
72
        private Writer output;
73
        
74
        /** The url. */
75
        private def url;
76
        
77
        /** The text. */
78
        String text="";
79
        
80
        /** The base. */
81
        String base="";
82
        
83
        /** The project. */
84
        String project="";
85
        
86
        /** The text attributes. */
87
        String[] textAttributes = null;
88
        
89
        /** The lang. */
90
        private String lang ="fr";
91
        
92
        /** The anatypes. */
93
        private static ArrayList<String> anatypes;
94
        
95
        /** The s attribs. */
96
        private static HashMap<String, List<String>> sAttribs;
97
        
98
        /**
         * Creates a compiler that is not bound to any input document.
         *
         * Such an instance is the one {@code run} is called on; {@code run}
         * builds a separate compiler for each file it processes.
         */
        public compiler()
        {
        }
103
        
104
        /**
         * Instantiates a new compiler reading one XML document.
         *
         * Opens {@code url} and creates the StAX parser on it. Failures are
         * only reported on the console: the constructor does not throw them,
         * and {@code parser} is then left unset.
         *
         * @param url the location of the XML document to read
         * @param text the text identifier (written as the "id" of the text structure)
         * @param base the base name (written as the "base" of the text structure)
         * @param project the project name (written as the "project" of the text structure)
         */
        public compiler(URL url,String text,String base, String project)
        {
                this.text = text;
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while opening "+url+": "+ex);
                        // the stream was opened but cannot be parsed: do not leak it
                        if (inputData != null) {
                                try {
                                        inputData.close();
                                } catch (IOException ignored) {
                                        // the parsing failure has already been reported
                                }
                                inputData = null;
                        }
                } catch (IOException ex) {
                        System.err.println("IOException while opening "+url+": "+ex);
                }
        }
130
        
131
        /**
         * Sets the language of the corpus; {@code run} writes it as the
         * "lang" attribute of the txmcorpus structure.
         *
         * @param lang the language code
         * @return the value that was assigned
         */
        public setLang(String lang)
        {
                return this.lang = lang;
        }
141
        
142
        /**
         * Opens {@code f} for writing in UTF-8 and stores the writer in {@code output}.
         *
         * When the file already exists the writer appends to it, otherwise the
         * file is created (together with its missing parent directories).
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if it could not be (the
         * error is printed on the standard error stream)
         */
        private boolean createOutput(File f){
                try {
                        File parent = f.getParentFile();
                        if (parent != null && !parent.exists())
                                parent.mkdirs(); // a failure here surfaces when the stream is opened below
                        output = new OutputStreamWriter(new FileOutputStream(f,f.exists()) , "UTF-8");
                        return true;
                } catch (Exception e) {
                        System.err.println("Could not open "+f+" for writing: "+e);

                        return false;
                }
        }
159
        
160
        /**
         * Skips the document header.
         *
         * Consumes parser events up to and including the closing "teiHeader"
         * tag, or up to the end of the document when no such tag is met.
         */
        private void GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT)
                {
                        boolean headerClosed = (event == XMLStreamConstants.END_ELEMENT) && parser.getLocalName().equals("teiHeader");
                        if (headerClosed)
                                return;
                        event = parser.next();
                }
        }
172
        
173
        /**
         * Transforms the document of this compiler into CQP lines and appends
         * them to {@code cqpfile}.
         *
         * The header is skipped (see {@code GoToText}). Then:
         * - each "w" element produces one line: form, TAB, word id, then one
         *   TAB-prefixed value per "ana" element or typed "form" element;
         * - every other element met outside a word is written as a lower-cased
         *   tag, and its attribute names are recorded in {@code sAttribs};
         * - annotation types are recorded in {@code anatypes};
         * - reading stops after the closing "text" tag.
         *
         * The output writer, the parser and the input stream are always
         * released before returning.
         *
         * @param cqpfile the CQP file to append to
         * @return true, if successful
         */
        public boolean transfomFileCqp(File cqpfile)
        {
                String vAna = "";
                String vForm = "";
                String wordid= "";

                boolean flagForm = false;
                boolean flagAna = false;
                boolean inW = false;
                int wcounter = 1;

                try
                {
                        if (parser == null) {
                                // the constructor could not open or parse the document
                                System.out.println("Error: no XML parser available for "+url);
                                return false;
                        }
                        if (!createOutput(cqpfile))
                                return false;

                        GoToText();

                        boolean stop = false;
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT && !stop; event = parser.next())
                        {
                                if (event == XMLStreamConstants.START_ELEMENT)
                                {
                                        String localname = parser.getLocalName();
                                        switch (localname)
                                        {
                                                case "w":
                                                        inW = true;
                                                        wordid = parser.getAttributeValue(null, "id");
                                                        if (wordid == null) // no id in the source: build one
                                                                wordid = "w_"+text+"_"+(wcounter++);
                                                        vAna = "";
                                                        break;

                                                case "form":
                                                        String formtype = parser.getAttributeValue(null, "type");
                                                        if (formtype == null || formtype.equals("default"))
                                                        {
                                                                flagForm = true;
                                                                vForm = "";
                                                        }
                                                        else
                                                        {
                                                                // a typed form is stored as an annotation; it must
                                                                // not erase the default form already captured
                                                                flagAna = true;
                                                                vAna += "\t";
                                                                if (!anatypes.contains(formtype))
                                                                        anatypes << formtype;
                                                        }
                                                        break;

                                                case "ana":
                                                        flagAna = true;
                                                        vAna += "\t";
                                                        String anatype = parser.getAttributeValue(null, "type");
                                                        if (anatype != null)
                                                        {
                                                                if (anatype.startsWith("#"))
                                                                        anatype = anatype.substring(1);
                                                                if (!anatypes.contains(anatype))
                                                                        anatypes << anatype;
                                                        }
                                                        break;

                                                default:
                                                        if (!inW)
                                                                writeStructureStart(localname);
                                        }
                                }
                                else if (event == XMLStreamConstants.END_ELEMENT)
                                {
                                        String localname = parser.getLocalName();
                                        switch (localname)
                                        {
                                                case "w":
                                                        output.write( vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+vAna+"\n");
                                                        vAna = "";
                                                        vForm = "";
                                                        inW = false;
                                                        break;

                                                case "form":
                                                        flagForm = false;
                                                        flagAna = false;
                                                        break;

                                                case "ana":
                                                        flagAna = false;
                                                        break;

                                                default:
                                                        if (!inW)
                                                                output.write("</"+localname.toLowerCase()+">\n");
                                                        if (localname == "text")
                                                                stop = true;
                                        }
                                }
                                else if (event == XMLStreamConstants.CHARACTERS)
                                {
                                        if (inW)
                                        {
                                                String chars = parser.getText().trim();
                                                // the annotation flag wins, and the text is appended only once
                                                if (flagAna)
                                                        vAna += chars;
                                                else if (flagForm)
                                                        vForm += chars;
                                        }
                                }
                        }

                        // closed here as well as in finally, so that a failed flush is reported
                        output.close();
                        return true;
                }
                catch (Exception ex) {
                        System.out.println(ex);
                        return false;
                }
                finally {
                        closeSilently(output);
                        closeSilently(parser);
                        closeSilently(inputData);
                }
        }

        /**
         * Writes the opening tag of a structure and records its attribute
         * names in {@code sAttribs} under the lower-cased element name.
         *
         * @param localname the local name of the element the parser is on
         */
        private void writeStructureStart(String localname)
        {
                String tag = localname.toLowerCase();
                output.write("<"+tag);

                // look up and store under the same lower-cased key, so that the
                // attributes collected so far are kept for mixed-case elements
                List<String> known = sAttribs.get(tag);
                if (known == null) {
                        known = [];
                        sAttribs.put(tag, known);
                }

                boolean isText = (localname == "text");
                for (int i = 0 ; i < parser.getAttributeCount() ; i++)
                {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        String attrvalue = parser.getAttributeValue(i);
                        // the "id" of a text is replaced by the one written below
                        if (!(isText && attrname == "id"))
                                output.write(" "+attrname+"=\""+attrvalue.replace("\"", "'")+"\"");

                        if (!known.contains(attrname))
                                known.add(attrname);
                }

                if (isText)
                { // add some infos
                        output.write(" id=\""+text+"\" base=\""+base+"\" project=\""+project+"\"");
                }
                output.write(">\n");
        }

        /**
         * Closes a resource, ignoring a null resource and any failure.
         *
         * @param resource an object with a close() method, or null
         */
        private static void closeSilently(def resource)
        {
                try {
                        if (resource != null)
                                resource.close();
                } catch (Exception ignored) {
                        // the outcome of the transformation has already been decided
                }
        }
337
        
338
        
339
        
340
        /**
         * Builds the CQP file of a corpus and indexes it with CWB.
         *
         * Steps:
         * - writes the opening txmcorpus tag in binDir/cqp/corpusname.cqp;
         * - appends each existing file of {@code files}, using a new compiler per file;
         * - writes the closing txmcorpus tag;
         * - runs cwb-encode, then cwb-makeall, with the lexical and structural
         *   attributes collected while reading the files.
         *
         * @param files the XML files to compile, processed in list order
         * @param binDir the existing directory under which the "cqp", "data" and "registry" paths are resolved
         * @param corpusname the corpus name (lower-cased for file names, upper-cased for cwb-makeall)
         * @param basename the base name written on every text structure
         * @param textAttributes not used by this method
         * @return true, if successful
         */
        public boolean run(ArrayList<File> files, File binDir, String corpusname, String basename, String[] textAttributes)
        {
                anatypes = new ArrayList<String>();// init only 1 time
                sAttribs = new HashMap<String, List<String>>();// init only 1 time
                String rootDir = binDir.getAbsolutePath();

                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false;
                }
                if(!binDir.exists())
                {
                        println ("binary directory does not exists: "+binDir)
                        return false;
                }

                if(files == null || files.size() == 0)
                {
                        println "Error: no file to process"
                        return false;
                }

                File cqpdir = new File(binDir,"cqp");
                File cqpfile = new File(cqpdir, corpusname.toLowerCase()+".cqp");
                cqpdir.mkdirs();
                // createOutput appends to an existing file: a file left by a
                // previous run must not end up in the new corpus
                if (cqpfile.exists() && !cqpfile.delete()) {
                        println "Error: could not delete the previous CQP file: "+cqpfile
                        return false;
                }

                //0 set Lang
                if (!createOutput(cqpfile)) {
                        println "Error: could not write the CQP file: "+cqpfile
                        return false;
                }
                try {
                        output.write("<txmcorpus lang=\""+lang+"\">\n");
                } finally {
                        output.close();
                }

                //1- Transform into CQP file
                for (File f : files) {
                        if (!f.exists()) {
                                println("The file "+f+ " does not exists")
                                continue;
                        }
                        // the text name is the file name without its 4 character extension
                        String filename = f.getName();
                        String txtname = filename.length() > 4 ? filename.substring(0, filename.length()-4) : filename;
                        // toURI() escapes the characters that are illegal in a URL
                        compiler builder = new compiler(f.toURI().toURL(), txtname, basename, "default");
                        builder.setLang(lang);
                        if (!builder.transfomFileCqp(cqpfile))
                                return false;
                }

                //end corpus
                if (!createOutput(cqpfile)) {
                        println "Error: could not write the CQP file: "+cqpfile
                        return false;
                }
                try {
                        output.write("</txmcorpus>\n");
                } finally {
                        output.close();
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                // lexical attributes: the word id, then every annotation type met
                List<String> pargs = ["id"];
                pargs.addAll(anatypes);

                // structural attributes: make sure the ones written by this
                // compiler (text and txmcorpus) are declared
                List<String> sargs = [];
                if (sAttribs.containsKey("text")) {
                        for (String attr : ["id", "base", "project"])
                                if (!sAttribs.get("text").contains(attr))
                                        sAttribs.get("text").add(attr);
                } else {
                        sargs.add("text:0+id+base+project")
                }

                if (sAttribs.containsKey("txmcorpus")) {
                        if (!sAttribs.get("txmcorpus").contains("lang"))
                                sAttribs.get("txmcorpus").add("lang");
                } else {
                        sargs.add("txmcorpus:0+lang")
                }

                for (String tag : sAttribs.keySet())
                {
                        String sAttr = tag;
                        if (sAttribs.get(tag).size() > 0)
                                sAttr += ":";
                        for (String attr : sAttribs.get(tag))
                                sAttr +="+"+attr;
                        sargs.add(sAttr)
                }

                String[] sAttributes = sargs;
                String[] pAttributes = pargs;
                println "Corpus structural attributes: "+sAttributes;
                println "Corpus lexical attributes: "+pAttributes;
                try {
                        String regPath = rootDir + "/registry/"+corpusname.toLowerCase();
                        cwbEn.run(
                                rootDir + "/data/"+corpusname+"/",
                                cqpfile.getAbsolutePath(),
                                regPath, pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(corpusname.toUpperCase(), rootDir + "/registry");

                } catch (Exception ex) {System.out.println(ex); return false;}

                System.out.println("Done.")

                return true;
        }
469
        
470
        /**
         * Switches the debug mode on.
         *
         * {@code run} forwards the flag to the CwbEncode and CwbMakeAll
         * wrappers it creates.
         */
        public void setDebug()
        {
                debug = true;
        }
477
        
478
        /**
         * The main method: compiles the XML files of the "xml/geo" folder of
         * the user home directory into a corpus named "geo".
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                // java.io.File does not expand "~": resolve the home directory explicitly
                File dir = new File(System.getProperty("user.home"), "xml/geo");

                ArrayList<File> files = new ArrayList<File>();
                File[] found = dir.listFiles(); // null when dir is not a directory
                if (found != null)
                        for (File f : found)
                                if (f.isFile() && f.getName().endsWith(".xml"))
                                        files.add(f);
                Collections.sort(files); // listFiles() gives no ordering guarantee

                def c = new compiler();
                c.setDebug();
                // This class has no setCwbPath method: run() checks by itself that
                // the CWB executables are available.
                // NOTE(review): the source folder is also used as binary directory,
                // as the former run(dir, "geo") call suggests - confirm.
                c.run(files, dir, "geo", "geo", new String[0]);
        }
491
}