Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xmltxmpara / compiler.groovy @ 187

History | View | Annotate | Download (11.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2011-06-03 12:37:57 +0200 (Fri, 03 Jun 2011) $
25
// $LastChangedRevision: 1867 $
26
// $LastChangedBy: mdecorde $ 
27
//
28

    
29

    
30
package org.txm.importer.xmltxmpara;
31

    
32
import java.util.ArrayList;
33

    
34
import org.txm.importer.cwb.BuildCwbEncodeArgs;
35
import org.txm.importer.cwb.CwbEncode
36
import org.txm.importer.cwb.CwbMakeAll
37
import org.txm.importer.*;
38
import org.txm.scripts.*;
39
import org.txm.scripts.teitxm.BuildTTSrc;
40
import org.txm.scripts.teitxm.*;
41
import org.txm.utils.treetagger.TreeTagger;
42

    
43
import javax.xml.stream.*;
44
import java.net.URL;
45
import java.io.File;
46
import java.util.HashMap;
47
import java.util.List;
48

    
49
// TODO: Auto-generated Javadoc
50
/**
51
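 * Converts TEI-TXM XML files into a single WTC file (one token per line, structures as
 * tags) and encodes that file as a CWB corpus with cwb-encode and cwb-makeall.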
52
 */
53
class compiler 
54
{
55
        
56
        // ---- configuration -------------------------------------------------

        /** Debug flag forwarded to the CWB encode and makeall wrappers by run(). */
        private boolean debug = false;

        /** Language written as the "lang" attribute of the "txmcorpus" structure. */
        private String lang = "fr";

        /** Directory holding the cwb-encode and cwb-makeall executables. */
        String cwbLoc;

        // ---- per-document input state ---------------------------------------

        /** Location of the XML document being read. */
        private def url;

        /** Stream opened on {@link #url}. */
        private def inputData;

        /** StAX factory used to create {@link #parser}. */
        private def factory;

        /** StAX reader over {@link #inputData}. */
        private XMLStreamReader parser;

        /** Not referenced by the code visible in this class. */
        private def dir;

        // ---- output -----------------------------------------------------------

        /** Writer on the WTC file, opened by createOutput(). */
        private Writer output;

        // ---- values injected into the "text" structure ---------------------------

        /** Text identifier: written as the "id" attribute and used to build default word ids. */
        String text = "";

        /** Value written as the "base" attribute. */
        String base = "";

        /** Value written as the "project" attribute. */
        String project = "";

        /** Not referenced by the code visible in this class. */
        String[] textAttributes = null;

        // ---- state shared by all instances, reset by run() ------------------------

        /** Annotation types met so far, in order of first appearance. */
        private static ArrayList<String> anatypes;

        /** Structure name (lower case) mapped to the attribute names met on it. */
        private static HashMap<String, List<String>> sAttribs;
100
        
101
        /**
         * Creates a compiler that is not bound to any document.
         * No stream is opened and the parser stays null; such an instance is
         * meant to be configured and then used through run().
         */
        public compiler(){}

        /**
         * Creates a compiler bound to one XML document and opens a StAX reader on it.
         * Failures are reported on the console and leave the parser null.
         *
         * @param url the location of the XML document to read
         * @param text the text identifier (written as the "id" of the "text" structure)
         * @param base the value written as the "base" attribute of the "text" structure
         * @param project the value written as the "project" attribute of the "text" structure
         */
        public compiler(URL url,String text,String base, String project)
        {
                this.text = text
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();

                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // the reader could not be created: do not keep the stream open
                        try {
                                inputData.close();
                        } catch (IOException ignored) {
                                // nothing more can be done with a stream that fails to close
                        }
                        inputData = null;
                } catch (IOException ex) {
                        System.err.println("IOException while parsing " + url + ": " + ex);
                }
        }
133
        
134
        /**
         * Sets the language of the corpus.
         *
         * @param lang the language value to store
         * @return the value that was stored
         */
        public setLang(String lang)
        {
                return (this.lang = lang);
        }
144
        
145
        /**
         * Stores the directory of the CWB executables.
         * A warning is printed when the path does not exist; the value is stored anyway.
         *
         * @param path the new cwb path
         */
        public void setCwbPath(String path)
        {
                File location = new File(path);
                if (!location.exists()) {
                        System.err.println("CWB Path : "+path+" does not exists");
                }
                this.cwbLoc = path;
        }
156
        
157
        /**
         * Opens the UTF-8 output writer on the given file.
         * The file is opened in append mode when it already exists.
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if any exception occurred
         */
        private boolean createOutput(File f){
                try {
                        boolean append = f.exists();
                        FileOutputStream stream = new FileOutputStream(f, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                } catch (Exception e) {
                        System.err.println(e);
                        return false;
                }
                return true;
        }
174
        
175
        /**
         * Advances the parser until just after the closing "teiHeader" tag,
         * or to the end of the document when no such tag is met.
         */
        private void GoToText()
        {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean headerClosed = (event == XMLStreamConstants.END_ELEMENT) && parser.getLocalName().equals("teiHeader");
                        if (headerClosed) {
                                return;
                        }
                        event = parser.next();
                }
        }
187
        
188
        /**
         * Writes the content of this compiler's document to the given WTC file.
         *
         * The header is skipped up to the end of "teiHeader". Each "w" element then
         * becomes one line: form, word id, and one tab-separated value per "ana" or
         * non-default "form" child. Every element met outside a word is written as a
         * lower-cased tag with its attributes; the "text" element receives the id,
         * base and project of this compiler. Processing stops after the closing
         * "text" tag. The annotation types and structure attributes met are recorded
         * in the static registries read by run().
         *
         * @param wtcfile the file to write to (appended to when it already exists)
         * @return true if the document was written, false if the output could not be
         *         opened, no parser is available, or a stream/IO error occurred
         */
        public boolean transfomFileWtc(File wtcfile)
        {
                if (parser == null) {
                        System.err.println("No parser available for " + url);
                        return false;
                }
                if (!createOutput(wtcfile))
                        return false;

                // the registries are normally created by run(); allow standalone use
                if (anatypes == null)
                        anatypes = new ArrayList<String>();
                if (sAttribs == null)
                        sAttribs = new HashMap<String, List<String>>();

                String vAna = "";
                String vForm = "";
                String wordid = "";

                boolean flagForm = false;
                boolean flagAna = false;
                boolean inW = false;
                int wcounter = 1;
                boolean success = false;

                try
                {
                        GoToText();

                        boolean stop = false;
                        // hasNext() guards against reading past END_DOCUMENT (e.g. no teiHeader)
                        while (!stop && parser.hasNext())
                        {
                                switch (parser.next())
                                {
                                        case XMLStreamConstants.START_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "w":
                                                                inW = true;
                                                                wordid = parser.getAttributeValue(null, "id");
                                                                if (wordid == null)
                                                                        wordid = "w_"+text+"_"+(wcounter++);
                                                                vAna = "";
                                                                break;

                                                        case "form":
                                                                String formType = parser.getAttributeValue(null, "type");
                                                                if (formType == null || formType.equals("default"))
                                                                {
                                                                        flagForm = true;
                                                                }
                                                                else
                                                                {
                                                                        // a typed form is stored as an extra annotation column
                                                                        flagAna = true;
                                                                        vAna += "\t";
                                                                        if (!anatypes.contains(formType))
                                                                                anatypes << formType;
                                                                }
                                                                vForm = "";
                                                                break;

                                                        case "ana":
                                                                flagAna = true;
                                                                vAna += "\t";
                                                                String anaType = parser.getAttributeValue(null, "type");
                                                                if (anaType != null)
                                                                {
                                                                        if (anaType.startsWith("#"))
                                                                                anaType = anaType.substring(1);
                                                                        if (!anatypes.contains(anaType))
                                                                                anatypes << anaType;
                                                                }
                                                                break;

                                                        default:
                                                                if (!inW)
                                                                        writeStructureStart();
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "w":
                                                                output.write( vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;") +"\t"+wordid+vAna+"\n");
                                                                vAna = "";
                                                                vForm = "";
                                                                inW = false;
                                                                break;

                                                        case "form":
                                                                flagForm = false;
                                                                flagAna = false;
                                                                break;

                                                        case "ana":
                                                                flagAna = false;
                                                                break;

                                                        default:
                                                                if (!inW)
                                                                        output.write("</"+parser.getLocalName().toLowerCase()+">\n");
                                                                if (parser.getLocalName() == "text")
                                                                        stop = true;
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (inW)
                                                {
                                                        String chars = parser.getText().trim();
                                                        // annotation content wins; append it once only
                                                        if (flagAna)
                                                                vAna += chars;
                                                        else if (flagForm)
                                                                vForm += chars;
                                                }
                                                break;
                                }
                        }

                        // closing here so that a failed flush is reported as a failure
                        output.close();
                        success = true;
                }
                catch (XMLStreamException ex) {
                        System.out.println(ex);
                }
                catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
                finally {
                        if (!success) {
                                try {
                                        output.close();
                                } catch (IOException ignored) {
                                        // already failing: nothing more to report
                                }
                        }
                        try {
                                parser.close();
                        } catch (XMLStreamException ignored) {
                                // the reader is discarded anyway
                        }
                        // closing the reader does not close the underlying stream
                        try {
                                if (inputData != null)
                                        inputData.close();
                        } catch (IOException ignored) {
                                // the stream is discarded anyway
                        }
                }
                return success;
        }

        /**
         * Writes the opening tag of the current element (lower-cased name and
         * attributes) and records its attribute names in the structure registry.
         */
        private void writeStructureStart()
        {
                String localname = parser.getLocalName();
                String tag = localname.toLowerCase();
                boolean isText = (localname == "text");

                output.write("<"+tag);
                // same lower-cased key for lookup and insertion, so a known list is never reset
                if (!sAttribs.containsKey(tag))
                        sAttribs.put(tag, []);
                List<String> known = sAttribs.get(tag);

                for (int i = 0 ; i < parser.getAttributeCount() ; i++)
                {
                        String attrname = parser.getAttributeLocalName(i).toLowerCase();
                        String attrvalue = parser.getAttributeValue(i);
                        // the id of "text" is replaced by the one of this compiler, see below
                        if (!(isText && attrname == "id"))
                                output.write(" "+attrname+"=\""+attrvalue.replace("\"", "'")+"\"");

                        if (!known.contains(attrname))
                                known.add(attrname);
                }

                if (isText)
                { // add some infos
                        output.write(" id=\""+text+"\" base=\""+base+"\" project=\""+project+"\"");
                }
                output.write(">\n");
        }
353
        
354
        
355
        
356
        /**
         * Builds the WTC file of a corpus from the given files and encodes it with CWB.
         *
         * The WTC file is "wtc/&lt;corpusname lower-cased&gt;.wtc" under binDir. It is
         * rebuilt from scratch: a "txmcorpus" opening tag carrying the language, the
         * content of every existing input file, then the closing tag. cwb-encode and
         * cwb-makeall are then run with the positional and structural attributes
         * collected while writing; data and registry go under binDir.
         *
         * @param files the XML files to process; missing files are reported and skipped
         * @param binDir the existing directory receiving the wtc, data and registry files
         * @param corpusname the corpus name
         * @param basename the value written as the "base" attribute of each text
         * @param textAttributes not used by this method
         * @return true, if successful
         */
        public boolean run(ArrayList<File> files, File binDir, String corpusname, String basename, String[] textAttributes)
        {
                anatypes = new ArrayList<String>();// init only 1 time
                sAttribs = new HashMap<String, List<String>>();// init only 1 time
                String rootDir = binDir.getAbsolutePath();

                if(cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if(!new File(cwbLoc).exists())
                {
                        println ("CWB path error: "+cwbLoc)
                        return false;
                }
                if(!binDir.exists())
                {
                        println ("binary directory does not exists: "+binDir)
                        return false;
                }

                if(files == null || files.size() == 0)
                {
                        println "Error: no file to process"
                        return false;
                }

                File wtcdir = new File(binDir,"wtc");
                File wtcfile = new File(wtcdir, corpusname.toLowerCase()+".wtc");
                wtcdir.mkdirs();
                // createOutput() appends to an existing file: start from a clean one
                if (wtcfile.exists() && !wtcfile.delete())
                {
                        println "Error: could not remove the previous WTC file: "+wtcfile
                        return false;
                }

                //0 set Lang
                if (!createOutput(wtcfile))
                {
                        println "Error: could not write the WTC file: "+wtcfile
                        return false;
                }
                output.write("<txmcorpus lang=\""+lang+"\">\n");
                output.close();

                //1- Transform into WTC file
                def builder = null;
                for (File f : files) {
                        if (!f.exists()) {
                                println("The file "+f+ " does not exists")
                                continue;
                        }
                        // the text id is the file name without its last 4 characters (e.g. ".xml")
                        String txtname = f.getName().substring(0,f.getName().length()-4);
                        // toURI() escapes the characters that File.toURL() leaves as is
                        builder = new compiler(f.toURI().toURL(), txtname, basename, "default");
                        builder.setLang(lang);
                        if (!builder.transfomFileWtc(wtcfile))
                                return false;
                }

                //end corpus
                if (!createOutput(wtcfile))
                {
                        println "Error: could not write the WTC file: "+wtcfile
                        return false;
                }
                output.write("</txmcorpus>\n");
                output.close();

                //2- Import into CWB
                def outDirTxm = rootDir;

                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                // positional attributes: the word id, then every annotation type met
                List<String> pargs = ["id"];
                for(String ana : anatypes)
                        pargs.add(ana);

                // structural attributes: "text" and "txmcorpus" always carry the injected attributes
                List<String> sargs = [];
                //println "Found Sattributes "+this.sAttribs;
                if (sAttribs.containsKey("text")) {
                        if (!sAttribs.get("text").contains("id"))
                                sAttribs.get("text").add("id");
                        if (!sAttribs.get("text").contains("base"))
                                sAttribs.get("text").add("base");
                        if (!sAttribs.get("text").contains("project"))
                                sAttribs.get("text").add("project");
                } else {
                        sargs.add("text:0+id+base+project")
                }

                if(sAttribs.containsKey("txmcorpus"))
                {
                        if(!sAttribs.get("txmcorpus").contains("lang"))
                                sAttribs.get("txmcorpus").add("lang");
                } else {
                        sargs.add("txmcorpus:0+lang")
                }

                for(String tag : sAttribs.keySet())
                {
                        String sAttr = tag;
                        if(sAttribs.get(tag).size() > 0)
                                sAttr += ":";
                        for(String attr : sAttribs.get(tag))
                                sAttr +="+"+attr;
                        sargs.add(sAttr)
                }

                String[] sAttributes = sargs;
                String[] pAttributes = pargs;
                println "Corpus structural attributes: "+sAttributes;
                println "Corpus lexical attributes: "+pAttributes;
                try {
                        String regPath = outDirTxm + "/registry/"+corpusname.toLowerCase();
                        cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(),
                                outDirTxm + "/data/"+corpusname+"/",
                                wtcfile.getAbsolutePath(),
                                regPath, pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), corpusname.toUpperCase(), outDirTxm + "/registry");

                } catch (Exception ex) {System.out.println(ex); return false;}

                System.out.println("Done.")

                return true;
        }
489
        
490
        /**
         * Turns the debug flag on; there is no way to turn it off again.
         */
        public void setDebug()
        {
                debug = true;
        }
497
        
498
        /**
         * Development entry point: compiles the XML files of "xml/geo" in the
         * user's home directory into a corpus named "geo".
         *
         * NOTE(review): the layout used here (sources directly in the directory,
         * binaries written to the same directory) is an assumption; adjust it to
         * the local setup.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                // java.io.File does not expand "~": resolve against the home directory
                File home = new File(System.getProperty("user.home"));
                File dir = new File(home, "xml/geo");

                ArrayList<File> files = new ArrayList<File>();
                File[] children = dir.listFiles();
                if (children != null) {
                        for (File child : children) {
                                if (child.isFile() && child.getName().toLowerCase().endsWith(".xml"))
                                        files.add(child);
                        }
                }
                // listFiles() gives no ordering guarantee
                Collections.sort(files);

                // run() writes its WTC file in the "wtc" sub-directory
                new File(dir, "wtc").mkdirs();

                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getAbsolutePath());
                c.run(files, dir, "geo", "geo", null);
        }
511
}