Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / graal / GraalImportCqp.groovy @ 187

History | View | Annotate | Download (10.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.graal
29

    
30
import javax.xml.stream.*;
31
import java.net.URL;
32
import java.lang.Boolean
33
import java.lang.System
34

    
35
// TODO: Auto-generated Javadoc
36
/**
 * Parses a Graal XML file with a StAX pull parser and writes a ".wtc" text
 * file: structural tags ({@code <p>}, {@code <s>}, {@code <q>}) on their own
 * lines and one tab-separated line per {@code <w>} element.
 *
 * According to the original author, the next step after this conversion is to
 * call the importInCWB script.
 *
 * An instance wraps one parser over one URL: once a transform method has run,
 * the parser and the streams are closed and the instance cannot be reused.
 *
 * @author ayepdieu
 */
public class GraalImportCqp {

	/** The stream opened on the source URL; null if it could not be opened or once closed. */
	private InputStream inputData;

	/** The StAX factory used to create the parser. */
	private XMLInputFactory factory;

	/** The pull parser reading {@link #inputData}; null if the constructor failed or once closed. */
	private XMLStreamReader parser;

	/** The output directory, set by {@link #createOutput(String, String)}. */
	private File dir;

	/** The UTF-8 writer on the output file, set by {@link #createOutput(String, String)}. */
	private Writer output;

	/**
	 * Opens the XML document and creates the parser.
	 *
	 * Failures are reported on standard output and not rethrown; the
	 * transform methods then return false.
	 *
	 * @param url the url to the xml file to transform
	 */
	public GraalImportCqp(URL url) {
		try {
			inputData = url.openStream();
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			// the stream was opened but no parser will ever read it
			closeAll();
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + url + ": " + ex);
		}
	}

	/**
	 * Creates the UTF-8 writer on the output file.
	 *
	 * @param dirPathName the dir path name
	 * @param fileName the file name
	 * @return true, if successful; false (after printing the error message) otherwise
	 */
	private boolean createOutput(String dirPathName, String fileName) {
		try {
			dir = new File(dirPathName);
			output = new OutputStreamWriter(new FileOutputStream(new File(dir, fileName)), "UTF-8");
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Closes the writer, the parser and the source stream, whichever are open.
	 * Errors raised while closing are printed and otherwise ignored, so that
	 * every resource gets its chance to be released.
	 */
	private void closeAll() {
		if (output != null) {
			try {
				output.close();
			} catch (IOException ex) {
				System.out.println(ex);
			}
			output = null;
		}
		if (parser != null) {
			try {
				parser.close();
			} catch (XMLStreamException ex) {
				System.out.println(ex);
			}
			parser = null;
		}
		// XMLStreamReader.close() does not close the underlying stream
		if (inputData != null) {
			try {
				inputData.close();
			} catch (IOException ex) {
				System.out.println(ex);
			}
			inputData = null;
		}
	}

	/**
	 * Parses an attribute value as an integer, ignoring surrounding
	 * whitespace (same behaviour as Groovy's {@code String.toInteger()}).
	 *
	 * @param value the attribute value; must not be null
	 * @return the parsed integer
	 */
	private static int toInt(String value) {
		return Integer.parseInt(value.trim());
	}

	/**
	 * Transforms the content of the body element into a wtc file.
	 *
	 * Written lines:
	 * <ul>
	 * <li>{@code <p n=N>} / {@code </p>}, {@code <s n=N id="...">} / {@code </s>}
	 * and {@code <q n="N">} / {@code </q>} for the matching elements;</li>
	 * <li>for each {@code w} element, tab separated: normalized form, first
	 * attribute of w, q nesting level, first character of the supplied nesting
	 * level, column id, line number, word id, diplomatic form, facsimile
	 * form.</li>
	 * </ul>
	 * Attributes are read by position, so the attribute order of the source
	 * document matters. Text found directly under w goes to the normalized form.
	 *
	 * Any error is printed on standard output with the parser location.
	 * The parser, the output file and the source stream are always closed.
	 *
	 * @param dirPathName the output directory
	 * @param fileName the outfile name
	 * @return true, if successful; false if the parser is not available, no
	 *         body element is found, the output cannot be created or an error
	 *         occurs while converting
	 */
	public boolean transfomFileWtc(String dirPathName, String fileName) {
		try {
			if (!findBody() || !createOutput(dirPathName, fileName)) {
				return false;
			}

			// left null on purpose: "null" is written until a column milestone is met
			String idColumn = null;
			String idWord = null;
			String lineRef;
			int idParagraph;
			int idSentence = 0;
			int idLine = 0;
			int qId = 0;
			boolean flagVersion = false;
			boolean flagWord = false;
			boolean flagNorm = false;
			boolean flagDipl = false;
			boolean flagFacs = false;
			int levelSupplied = 0;
			int levelq = 0;
			String typeWord = "";
			String vWordDipl = "";
			String vWordFacs = "";
			String vWordNorm = "";

			while (parser.hasNext()) {
				int event = parser.next();
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						switch (parser.getLocalName()) {
							case "milestone":
								if ("column".equals(parser.getAttributeValue(0))) {
									// drop the first 4 characters of the column reference
									idColumn = parser.getAttributeValue(1).substring(4);
								}
								break;
							case "p":
								idParagraph = toInt(parser.getAttributeValue(0));
								output.write("<p n=" + idParagraph + ">\n");
								// from the first paragraph on, the third attribute of w
								// (when present) is used as word id
								flagVersion = true;
								break;
							case "s":
								// sentences are renumbered; the source number is not used
								idSentence++;
								output.write("<s n=" + idSentence + " id=\"" + parser.getAttributeValue(1) + "\">\n");
								break;
							case "q":
								output.write("<q n=\"" + qId + "\">\n");
								qId++;
								levelq++;
								break;
							case "supplied":
								levelSupplied++;
								break;
							case "lb":
								lineRef = parser.getAttributeValue(0);
								if (!"facs".equals(lineRef)) {
									idLine = toInt(lineRef);
								}
								break;
							case "w":
								typeWord = parser.getAttributeValue(0);
								if (flagVersion && parser.getAttributeValue(2) != null) {
									idWord = parser.getAttributeValue(2);
								} else {
									idWord = parser.getAttributeValue(1);
								}
								flagWord = true;
								// text directly under w counts as normalized form
								flagNorm = true;
								break;
							case "norm":
								flagNorm = true;
								break;
							case "dipl":
								flagDipl = true;
								break;
							case "facs":
								flagFacs = true;
								break;
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						switch (parser.getLocalName()) {
							case "p":
								output.write("</p>\n");
								break;
							case "s":
								output.write("</s>\n");
								break;
							case "q":
								output.write("</q>\n");
								levelq--;
								break;
							case "supplied":
								levelSupplied--;
								break;
							case "w":
								output.write(vWordNorm + "\t" + typeWord + "\t" + levelq + "\t"
										+ Integer.toString(levelSupplied).substring(0, 1) + "\t" + idColumn + "\t"
										+ idLine + "\t" + idWord + "\t" + vWordDipl + "\t" + vWordFacs + "\n");
								vWordNorm = "";
								vWordDipl = "";
								vWordFacs = "";
								flagNorm = false;
								flagDipl = false;
								flagFacs = false;
								flagWord = false;
								break;
							case "norm":
								flagNorm = false;
								break;
							case "dipl":
								flagDipl = false;
								break;
							case "facs":
								flagFacs = false;
								break;
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (flagWord) {
							if (flagNorm) {
								vWordNorm += parser.getText().trim();
							} else if (flagDipl) {
								vWordDipl += parser.getText().trim();
							} else if (flagFacs) {
								vWordFacs += parser.getText().trim();
							}
						}
						break;
				}
			}
			// closed here so that a failing final flush is reported as an error
			output.close();
			return true;
		} catch (Exception ex) {
			if (parser != null) {
				System.out.println("Xml Location: " + parser.getLocation());
			}
			System.out.println(ex);
			return false;
		} finally {
			closeAll();
		}
	}

	/**
	 * A simpler version of {@link #transfomFileWtc(String, String)} to process
	 * a non-facette xml.
	 *
	 * Paragraphs are opened by {@code milestone} elements whose first
	 * attribute is "p"; each new paragraph closes the previous one. A
	 * {@code div} element whose first attribute is "note" writes
	 * {@code </s>} and {@code </p>} and stops the conversion. Each {@code w}
	 * element produces one line, tab separated: word form, type, word id,
	 * column id.
	 *
	 * XML and I/O errors are printed on standard output; other runtime errors
	 * (for instance a non numeric paragraph number) are propagated. The
	 * parser, the output file and the source stream are always closed.
	 *
	 * @param dirPathName the dir path name
	 * @param fileName the file name
	 * @return true, if successful; false if the parser is not available, no
	 *         body element is found, the output cannot be created or an XML
	 *         or I/O error occurs
	 */
	public boolean transfomFileWtcBrut(String dirPathName, String fileName) {
		try {
			if (!findBody() || !createOutput(dirPathName, fileName)) {
				return false;
			}

			String idColumn = "";
			String idWord = null;
			int idParagraph = 0;
			int idSentence = 0;
			boolean flagWord = false;
			boolean flagCont = true;
			// true until the first paragraph has been opened
			boolean flagStart = true;
			String typeWord = "";
			String vWord = "";

			while (flagCont && parser.hasNext()) {
				int event = parser.next();
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						switch (parser.getLocalName()) {
							case "milestone":
								if ("p".equals(parser.getAttributeValue(0))) {
									idParagraph = toInt(parser.getAttributeValue(1));
									if (flagStart) {
										output.write("<p n=" + idParagraph + ">\n");
										flagStart = false;
									} else {
										output.write("</p>\n<p n=" + idParagraph + ">\n");
									}
								} else if ("column".equals(parser.getAttributeValue(0))) {
									// drop the first 4 characters of the column reference
									idColumn = parser.getAttributeValue(1).substring(4);
								} else if (!"s".equals(parser.getAttributeValue(0))) {
									idColumn = "";
								}
								break;
							case "s":
								idSentence = toInt(parser.getAttributeValue(0));
								output.write("<s n=" + idSentence + ">\n");
								break;
							case "w":
								if (parser.getAttributeValue(1) != null) {
									typeWord = parser.getAttributeValue(0);
									idWord = parser.getAttributeValue(1);
								} else {
									// a single attribute: it is the word id, there is no type
									idWord = parser.getAttributeValue(0);
									typeWord = "";
								}
								flagWord = true;
								break;
							case "div":
								if ("note".equals(parser.getAttributeValue(0))) {
									output.write("</s>\n</p>");
									flagCont = false;
								}
								break;
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						// NOTE(review): the previous code had a 'case "s"' writing "</s>\n" in the
						// switch over the event codes, where it could never match. It looks like it
						// was meant to close sentences here; it is left out until it is confirmed
						// whether s elements wrap their words in this format.
						switch (parser.getLocalName()) {
							case "w":
								output.write(vWord + "\t" + typeWord + "\t" + idWord + "\t" + idColumn + "\n");
								// do not leak this form into a following empty w
								vWord = "";
								flagWord = false;
								break;
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (flagWord) {
							// the parser may deliver the text of one word in several chunks
							vWord += parser.getText().trim();
						}
						break;
				}
			}
			// closed here so that a failing final flush is reported as an error
			output.close();
			return true;
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			return false;
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + ex);
			return false;
		} finally {
			closeAll();
		}
	}

	/**
	 * Tests if the given element name is the body.
	 *
	 * @param name the element name, may be null
	 * @return true, if name is "body"
	 */
	private static boolean isBody(String name) {
		return "body".equals(name);
	}

	/**
	 * Brings the parser to the start of the body element.
	 *
	 * @return true, if a body element was found; false if the document ends
	 *         before one is met or if no parser is available
	 * @throws XMLStreamException if the document cannot be parsed
	 */
	private boolean findBody() throws XMLStreamException {
		if (parser == null) {
			// the constructor could not open or parse the source
			return false;
		}
		while (parser.hasNext()) {
			if (parser.next() == XMLStreamConstants.START_ELEMENT && isBody(parser.getLocalName())) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Command line entry point.
	 *
	 * Without argument, converts a hard coded default file into a hard coded
	 * directory. With three arguments, converts the document at the given url
	 * into the given directory and file name.
	 *
	 * @param args none, or: source url, output directory, output file name
	 * @throws MalformedURLException if the source url is not a valid URL
	 */
	public static void main(String[] args) throws MalformedURLException {
		String inDir = "/home/txm/src";
		String inFile = "qgraal_cm_2009-07-d.xml";
		System.out.println("file://" + inDir + "/" + inFile);

		if (args.length == 0) {
			GraalImportCqp traitTxt = new GraalImportCqp(new URL("file://" + inDir + "/" + inFile));
			traitTxt.transfomFileWtc("/home/ayepdieu/srcQuete/result", "graal.wtc");
		} else if (args.length == 3) {
			GraalImportCqp traitTxt = new GraalImportCqp(new URL(args[0]));
			traitTxt.transfomFileWtc(args[1], args[2]);
		} else {
			System.err.println("Usage: GraalImportCqp <url> <outputDir> <outputFileName>");
		}
	}

}