Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / RGAQCJ / importRGAQCJ.groovy @ 187

History | View | Annotate | Download (10.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.RGAQCJ
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32

    
33
import javax.xml.stream.*;
34
import java.net.URL;
35

    
36
import org.txm.importer.filters.*;
37
// TODO: Auto-generated Javadoc
38

    
39
/**
 * The Class BuildXmlRGAQCJ: simple import in CWB of the RGAQCJ corpus.
 * <ul>
 * <li>structural units: initial, s, lb ...</li>
 * <li>word properties: form, pos and n</li>
 * </ul>
 *
 * @author mdecorde
 */
47

    
48
public class BuildXmlRGAQCJ
49
{
50
        
51
        /** Stream opened on {@code url}; the StAX reader is created on it. */
        private def inputData;

        /** StAX factory used to create {@code parser}. */
        private def factory;

        /** StAX reader over the source document. */
        private def parser;

        /** Output directory, set when the output file is opened. */
        private def dir;

        /** Writer on the output file. */
        private def output;

        /** Location of the source document. */
        private def url;

        /** Values of the ana elements of the word being read, keyed by type name. */
        private HashMap<String, String> anahash = new HashMap<String, String>();

        /** Type names written as word property columns, in column order. */
        private ArrayList<String> types;

        /** Type name of the ana element being read. */
        private String currentType;

        /** Form of the text: "prose", "vers" or "mixte"; empty until a catRef element is read. */
        private String catRef = "";

        /** Value written to the initiale attribute of the text element. */
        private String initial;
83
        
84
        /**
         * Opens the source document and creates a StAX reader on it.
         *
         * Failures are reported on standard output and leave {@code parser} unset.
         * When the reader cannot be created, the stream already opened on
         * {@code url} is closed so that it does not leak.
         *
         * @param url the location of the XML document to read
         * @param initial the value written to the initiale attribute of the text element
         * @param types the ana type names written as word property columns
         */
        public BuildXmlRGAQCJ(URL url,String initial,ArrayList<String> types){
                this.url = url;
                this.types = types;
                this.initial = initial;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                        // no reader will ever consume the stream: release it now
                        try {
                                inputData.close();
                        } catch (IOException ignored) {
                                // nothing more can be done with a stream that fails to close
                        }
                        inputData = null;
                } catch (IOException ex) {
                        // name the document and the cause, otherwise the failure cannot be traced
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
        }
105
        
106
        /**
         * Resets anahash: drops the stored values and maps every known type
         * to the placeholder "-".
         */
        private void fillanaHash()
        {
                anahash.clear();
                for (String typeName in types) {
                        anahash.put(typeName, "-");
                }
        }
115
        
116
        /**
         * Opens the UTF-8 writer on the output file.
         * The file is appended to when it already exists, created otherwise.
         *
         * @param dirPathName output directory
         * @param fileName output file name
         * @return true if the writer was opened, false if any exception occurred
         */
        private boolean createOutput(String dirPathName, String fileName){
                try {
                        dir = new File(dirPathName);
                        File target = new File(dir, fileName);
                        boolean append = target.exists();
                        FileOutputStream stream = new FileOutputStream(target, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
                return true;
        }
135
        
136
        /**
         * Tells whether an element name is that of the body element.
         *
         * @param name the element local name, may be null
         * @return true if name is exactly "body", false otherwise (including null)
         */
        private static boolean isBody(String name) {
                // constant first: a null name yields false instead of an exception
                return "body".equals(name);
        }
145
        
146
        /**
         * Moves the reader forward to the start of the body element.
         *
         * Each catRef element met on the way sets catRef from its target
         * attribute: "#forme_prose" gives "prose", "#forme_vers" gives "vers",
         * anything else gives "mixte".
         *
         * @return true if a body start tag was reached, false if the document ended first
         */
        private boolean findBody(){
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.START_ELEMENT) {
                                String elementName = parser.getLocalName();
                                if (elementName == "catRef") {
                                        String target = parser.getAttributeValue(null,"target");
                                        switch (target) {
                                                case "#forme_prose":
                                                        catRef = "prose";
                                                        break;
                                                case "#forme_vers":
                                                        catRef = "vers";
                                                        break;
                                                default:
                                                        catRef = "mixte";
                                        }
                                }
                                if (isBody(elementName)) {
                                        return true;
                                }
                        }
                        event = parser.next();
                }
                return false;
        }
170
        
171
        /**
         * Converts the body of the source document to WTC and writes it to the
         * output file (appending when the file already exists).
         *
         * The text is wrapped in a text element carrying the initiale and forme
         * attributes. p and s elements are copied with their n attribute. Each w
         * element produces one line: the form, one tab-separated column per type
         * (the placeholder "-" when the word has no ana of that type), then the n
         * of the last lb when the text form is "vers", of the last pb otherwise.
         *
         * The writer, the reader and the input stream are released before
         * returning, whatever the outcome.
         *
         * @param dirPathName the output directory
         * @param fileName the output file name
         * @return true if the whole body was written, false if the source could
         *         not be read, no body was found, the output could not be opened
         *         or an error occurred while converting
         */
        public boolean transfomFileWtc(String dirPathName, String fileName)
        {
                if (parser == null) {
                        // the constructor could not open the source document
                        return false;
                }
                if (!(findBody() && createOutput(dirPathName, fileName))) {
                        closeInput();
                        return false;
                }

                String idPb = "";
                String idLb = "";
                boolean flagForm = false;
                boolean flagAna = false;
                String vAna = "";
                String vWord = "";
                String vForm = "";
                boolean success = false;

                try
                {
                        output.write("<text initiale=\""+initial+"\" forme=\""+catRef+"\">\n");
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event)
                                {
                                        case XMLStreamConstants.START_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "p":
                                                                // the attribute value must be quoted on both sides
                                                                output.write("<p n=\"" + parser.getAttributeValue(null,"n") + "\">\n");
                                                                break;

                                                        case "s":
                                                                output.write("<s n=\"" + parser.getAttributeValue(null,"n") + "\">\n");
                                                                break;

                                                        case "pb":
                                                                idPb = parser.getAttributeValue(null,"n");
                                                                break;

                                                        case "lb":
                                                                idLb = parser.getAttributeValue(null,"n");
                                                                break;

                                                        case "w":
                                                                fillanaHash();
                                                                vWord = "";
                                                                break;

                                                        case "form":
                                                                flagForm = true;
                                                                vForm = "";
                                                                break;

                                                        case "ana":
                                                                vAna = "";
                                                                currentType = parser.getAttributeValue(null,"ref");
                                                                // an ana without a usable ref cannot be attached to a type: ignore it
                                                                flagAna = (currentType != null && currentType.length() > 0);
                                                                if (flagAna) {
                                                                        // drop the leading character of the reference
                                                                        currentType = currentType.substring(1);
                                                                }
                                                                break;
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "p":
                                                                output.write("</p>\n");
                                                                break;

                                                        case "s":
                                                                output.write("</s>\n");
                                                                break;

                                                        case "w":
                                                                for (String type : types) {
                                                                        vWord += "\t" + anahash.get(type);
                                                                }
                                                                if (catRef.equals("vers")) {
                                                                        output.write(vWord + "\t" + idLb + "\n");
                                                                } else {
                                                                        output.write(vWord + "\t" + idPb + "\n");
                                                                }
                                                                vWord = "";
                                                                break;

                                                        case "form":
                                                                flagForm = false;
                                                                vWord += vForm;
                                                                break;

                                                        case "ana":
                                                                if (flagAna) {
                                                                        anahash.put(currentType, vAna);
                                                                }
                                                                flagAna = false;
                                                                break;
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (flagForm) {
                                                        vForm += parser.getText().trim();
                                                }
                                                if (flagAna) {
                                                        vAna += parser.getText().trim();
                                                }
                                                break;
                                }
                        }
                        output.write("</text>\n");
                        success = true;
                }
                catch (XMLStreamException ex) {
                        System.out.println(ex);
                }
                catch (IOException ex) {
                        System.out.println("IOException while parsing " + url + ": " + ex);
                }
                finally {
                        try {
                                output.close();
                        } catch (IOException ex) {
                                // buffered text may not have reached the file
                                System.out.println("IOException while closing the output of " + url + ": " + ex);
                                success = false;
                        }
                        closeInput();
                }
                return success;
        }

        /**
         * Releases the StAX reader and the stream it reads from.
         * Closing the reader does not close the stream, so both are closed;
         * failures are only reported.
         */
        private void closeInput()
        {
                try {
                        if (parser != null) {
                                parser.close();
                        }
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                }
                try {
                        if (inputData != null) {
                                inputData.close();
                        }
                } catch (IOException ex) {
                        System.out.println(ex);
                }
        }
299
        
300
        /**
         * Collects the type names referenced by the ana elements of a set of files.
         *
         * Each file is read from the anainline sub-directory of rootDir. The type
         * name is the ref attribute of an ana element without its first character.
         * ana elements with no ref, or a ref too short to hold a name, are ignored.
         *
         * @param rootDir the root dir
         * @param xmltxmfiles the names of the files to scan
         * @return the distinct type names as an ArrayList of String, in order of first occurrence
         */
        public static def getAnatypes(String rootDir, List<String> xmltxmfiles)
        {
                // a linked set keeps first-occurrence order and makes the duplicate check cheap
                LinkedHashSet<String> found = new LinkedHashSet<String>();
                XMLInputFactory factory = XMLInputFactory.newInstance();
                for (String fileName : xmltxmfiles)
                {
                        //println("look in "+fileName);
                        URL url = new File(rootDir+"/anainline/", fileName).toURI().toURL();
                        InputStream inputData = url.openStream();
                        XMLStreamReader parser = null;
                        try {
                                parser = factory.createXMLStreamReader(inputData);
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                                {
                                        if (event != XMLStreamConstants.START_ELEMENT) {
                                                continue;
                                        }
                                        if (!parser.getLocalName().equals("ana")) {
                                                continue;
                                        }
                                        String ref = parser.getAttributeValue(null, "ref");
                                        if (ref == null || ref.length() < 2) {
                                                // nothing left once the leading character is removed
                                                continue;
                                        }
                                        found.add(ref.substring(1));
                                }
                        } finally {
                                // closing the reader does not close the stream: release both
                                if (parser != null) {
                                        parser.close();
                                }
                                inputData.close();
                        }
                }
                return new ArrayList<String>(found);
        }
339
                
340
        /**
         * Builds the RGAQCJ corpus: converts the source files to a single WTC
         * file, then encodes and indexes it with CWB.
         *
         * The source files are read from the anainline sub-directory of rootDir.
         * The WTC file is written to wtc/RGAQCJ.wtc, the CWB data to data/RGAQCJ
         * and the registry entry to registry/rgaqcj, all under rootDir.
         * Each text receives as initial the letter of "RGAQCJ" at its position
         * in files, or "-" for files beyond the sixth.
         *
         * @param files the names of the source files, in corpus order
         * @param rootDir the root dir, with or without a trailing separator
         * @param cwbLoc the directory of the CWB executables, with its trailing separator
         */
        public static void process(List<String> files, String rootDir,String cwbLoc) 
        {
                // every path below is built by appending to rootDir
                if (!rootDir.endsWith("/") && !rootDir.endsWith(File.separator)) {
                        rootDir = rootDir + "/";
                }
                new File(rootDir+"wtc/").mkdir();
                new File(rootDir+"registry/").mkdir();

                ArrayList<String> types = BuildXmlRGAQCJ.getAnatypes(rootDir, files);
                println(types);

                //1- Transform into WTC file
                File f = new File(rootDir+"wtc/","RGAQCJ.wtc");

                // not opened in append mode: a file left by a previous run is overwritten
                def output = new OutputStreamWriter(new FileOutputStream(f) , "UTF-8");
                try {
                        output.write("<corpus name=\"RGAQCJ\">\n");
                } finally {
                        output.close();
                }

                // one letter per text of the corpus, in the order of the files
                String initiales = "RGAQCJ";
                for (int i = 0; i < files.size(); i++)
                {
                        String file = files[i];
                        println("process file "+file)
                        // more files than letters must not abort the import
                        String initiale = (i < initiales.length()) ? "" + initiales.charAt(i) : "-";
                        def builder = new BuildXmlRGAQCJ(new File(rootDir+"/anainline/",file).toURI().toURL(), initiale, types);
                        builder.transfomFileWtc(rootDir+"wtc","RGAQCJ.wtc");
                }

                output = new OutputStreamWriter(new FileOutputStream(f,true) , "UTF-8");
                try {
                        output.write("</corpus>\n");
                } finally {
                        output.close();
                }

                //2- Import into CWB
                CwbEncode cwbEn = new CwbEncode();
                CwbMakeAll cwbMa = new CwbMakeAll();

                // the last column of each word line is the page or line identifier
                types.add("line");
                String[] pAttributes = types;
                String[] sAttributes = ["corpus:0+name","text:0+initiale+forme", "s:0+n","p:0+n"];

                // only the executable names differ between platforms
                String exe = System.getProperty("os.name").contains("Windows") ? ".exe" : "";
                try
                {
                        cwbEn.run(cwbLoc + "cwb-encode" + exe, rootDir + "data/"+"RGAQCJ", rootDir + "wtc/"+"RGAQCJ.wtc", rootDir + "registry/"+"rgaqcj",pAttributes, sAttributes);
                        cwbMa.run(cwbLoc + "cwb-makeall" + exe, "RGAQCJ", rootDir + "registry");
                } catch (Exception ex) {
                        System.out.println(ex);
                }
                System.out.println("Done.")
        }
411
        
412
        /**
         * Runs the import on the six RGAQCJ source files, using directories
         * located under the home directory of the current user.
         *
         * @param args the arguments (unused)
         */
        public static void main(String[] args)
        {
                List<String> files = ["roland-ana.xml","qgraal_cm-ana.xml","artu-ana.xml","qjm-ana.xml","commyn1-ana.xml","jehpar-ana.xml"];
                // "~" is expanded by shells, never by java.io.File: resolve the home directory explicitly
                String home = System.getProperty("user.home");
                String rootDir = home + "/xml/rgaqcj/";
                def cwbLoc = home + "/Bureau/textometrie/CWB/cwb-3.0/utils/"; // path to the cqp executable
                BuildXmlRGAQCJ.process(files, rootDir, cwbLoc);
        }
424
}