Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / hyperprince / compiler.groovy @ 187

History | View | Annotate | Download (9.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.hyperprince
29
;
30

    
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.*;
34
import org.txm.scripts.*;
35
import org.txm.scripts.teitxm.BuildTTSrc;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38

    
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import java.io.File;
42
import java.net.URL;
43
import java.util.HashMap;
44
import java.util.List;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
 * Builds the CWB "hyperprince" corpus.
 *
 * An instance created with {@link #compiler(URL, String, String, String)}
 * reads one XML file with a StAX parser and appends its content to a ".wtc"
 * file (see {@link #transfomFileWtc}). {@link #run(String)} drives the whole
 * process: it converts every file of the "txm" sub-directory and then calls
 * cwb-encode and cwb-makeall.
 *
 * An instance built from a URL is single-use: the parser and the streams are
 * closed at the end of {@link #transfomFileWtc}.
 */
class compiler
{
    /** The input stream opened on {@link #url}. */
    private def inputData;

    /** The StAX factory used to create {@link #parser}. */
    private def factory;

    /** The StAX stream reader positioned on {@link #inputData}. */
    private def parser;

    /** The dir (not used by this class). */
    private def dir;

    /** The writer on the ".wtc" file, set by {@link #createOutput}. */
    private def output;

    /** The url of the XML file to convert. */
    private def url;

    /** The anahash, only touched by {@link #fillanaHash}. */
    private HashMap<String,String> anahash = new HashMap<String,String>();

    /** The text. */
    String text = "";

    /** The base, written in the "base" attribute of each text. */
    String base = "";

    /** The project, written in the "project" attribute of each text. */
    String project = "";

    /** The directory containing the cwb-encode and cwb-makeall executables. */
    String cwbLoc;

    /** The language written in the "lang" attribute of the txmcorpus element. */
    private String lang = "fr";

    /**
     * Instantiates a new compiler without input; use it to call {@link #run(String)}.
     */
    public compiler(){}

    /**
     * Instantiates a compiler bound to one XML file and opens a StAX parser on it.
     *
     * Failures are printed, not thrown: when the stream or the parser cannot be
     * created, {@link #parser} stays null and {@link #transfomFileWtc} returns false.
     *
     * @param url the url of the XML file to read
     * @param text the text
     * @param base the base
     * @param project the project
     */
    public compiler(URL url, String text, String base, String project)
    {
        this.text = text;
        this.base = base;
        this.project = project;
        this.url = url;

        try {
            inputData = url.openStream();

            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // the stream was opened but is useless without a parser: do not leak it
            closeQuietly(inputData);
            inputData = null;
        } catch (IOException ex) {
            System.out.println("IOException while parsing " + url + ": " + ex);
        }
    }

    /**
     * Sets the cwb path.
     *
     * A warning is printed on stderr when the path is null or does not exist;
     * the value is stored anyway.
     *
     * @param path the new cwb path
     */
    public void setCwbPath(String path)
    {
        // guard against null: new File(null) would throw a NullPointerException
        if (path == null || !new File(path).exists()) {
            System.err.println("CWB Path : "+path+" does not exists")
        }
        cwbLoc = path;
    }

    /**
     * set the language of the corpus.
     *
     * @param lang the lang
     * @return the assigned language (Groovy returns the last evaluated expression)
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Opens {@link #output} as a UTF-8 writer on the given file.
     *
     * The file is opened in append mode when it already exists, so successive
     * calls on the same file accumulate their content.
     *
     * @param dirPathName the dir path name
     * @param fileName the file name
     * @return true, if successful
     */
    private boolean createOutput(String dirPathName, String fileName)
    {
        try {
            File target = new File(dirPathName, fileName)
            boolean append = target.exists()
            output = new OutputStreamWriter(new FileOutputStream(target, append), "UTF-8")
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Clears the anaHash variable and resets every entry to "-".
     *
     * NOTE(review): "types" is not declared anywhere in this class, so calling
     * this method would fail at runtime. The method is not called by this class.
     */
    private void fillanaHash()
    {
        anahash.clear();
        for (String s : types) {
            anahash.put(s, "-");
        }
    }

    /**
     * Closes a resource and prints, instead of propagating, any failure.
     *
     * @param resource an object with a close() method, may be null
     */
    private void closeQuietly(def resource)
    {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    /**
     * Transfom file wtc: reads the XML events of the parser and appends the
     * corresponding ".wtc" content to the output file.
     *
     * Written content: a txmcorpus element carrying the language set with
     * {@link #setLang}, one text element (lang, auteur, date, id, base, project),
     * the front/back/head/seg/body/s structures, and one line per "w" element
     * made of the form, the tab-separated "ana" values and the word id.
     *
     * The parser, the input stream and the output writer are always closed
     * when this method returns.
     *
     * @param dirPathName the dir path name
     * @param fileName the file name
     * @param idtext the idtext, written in the "id" attribute of the text element
     * @return true, if successful; false when the parser or the output could not
     *         be opened or when an XML/IO error interrupted the conversion
     */
    public boolean transfomFileWtc(String dirPathName, String fileName, idtext)
    {
        if (parser == null) {
            // the constructor failed (or the no-arg constructor was used)
            System.out.println("Cannot transform " + url + ": the XML parser is not available");
            closeQuietly(inputData);
            return false;
        }
        if (!createOutput(dirPathName, fileName)) {
            closeQuietly(parser);
            closeQuietly(inputData);
            return false;
        }

        String author = "";
        String date = "";
        // language of the current text, read from the "monogr" element;
        // kept distinct from the corpus language stored in this.lang
        String textLang = "";
        String vAna = "";
        String vForm = "";
        String wordid = "";

        boolean flagAuthor = false;
        boolean flagDate = false;
        boolean flagForm = false;
        boolean flagAna = false;

        boolean success = false;
        try
        {
            // use the field: a local variable used to shadow it and the header
            // was always written with an empty language
            output.write("<txmcorpus lang=\""+this.lang+"\">\n");

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                switch (event)
                {
                    case XMLStreamConstants.START_ELEMENT:
                        switch (parser.getLocalName())
                        {
                            case "monogr"://get attr lang
                                textLang = parser.getAttributeValue(null,"lang");
                                break;

                            case "author"://get text
                                flagAuthor = true;
                                author = "";
                                break;

                            case "date"://get text
                                flagDate = true;
                                date = "";
                                break;

                            case "text"://write tag+ lang + date + author
                                output.write("<text lang=\""+textLang+"\" auteur=\""+author+"\" date=\""+date+"\" id=\""+idtext+"\" base=\""+base+"\" project=\""+project+"\">\n");
                                break;

                            case "front"://get type,id, write tag
                            case "back"://get type,id, write tag
                            case "head"://get type,id, write tag
                            case "seg"://get type,id, write tag
                                output.write("<"+parser.getLocalName()+" id=\""+parser.getAttributeValue(null,"id")+"\" type=\""+parser.getAttributeValue(null,"type")+"\">\n");
                                break;

                            case "body"://get type,id, write tag BODY
                                output.write("<body id=\""+parser.getAttributeValue(null,"id")+"\" type=\""+parser.getAttributeValue(null,"type")+"\">\n");
                                break;

                            case "s":
                                output.write("<s>\n");
                                break;

                            case "w":
                                wordid = parser.getAttributeValue(null,"id")
                                break;

                            case "form":
                                flagForm = true;
                                vForm = "";
                                vAna = "";
                                break;

                            case "ana":
                                flagAna = true;
                                break;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        switch (parser.getLocalName())
                        {
                            case "author":
                                flagAuthor = false;
                                break;

                            case "date":
                                flagDate = false;
                                break;

                            case "text":
                                output.write("</text>\n");
                                break;

                            case "front":
                            case "back":
                            case "head":
                            case "seg":
                                output.write("</"+parser.getLocalName()+">\n");
                                break;

                            // must mirror the START_ELEMENT case: it used to test
                            // "div0", which left every opened <body> unclosed
                            case "body":
                                output.write("</body>\n");
                                break;

                            case "s":
                                output.write("</s>\n");
                                break;

                            case "w":
                                // one token per line: form, ana values (each prefixed by a tab), word id
                                output.write(vForm + vAna + "\t" + wordid + "\n");
                                vAna = "";
                                vForm = "";
                                break;

                            case "form":
                                flagForm = false;
                                break;

                            case "ana":
                                flagAna = false;
                                break;
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        // the text goes to whichever element is currently open
                        if (flagForm)
                            vForm += parser.getText().trim();
                        if (flagAna)
                            vAna += "\t" + parser.getText().trim();
                        if (flagDate)
                            date += parser.getText().trim();
                        if (flagAuthor)
                            author += parser.getText().trim();
                        break;
                }
            }
            output.write("</txmcorpus>");
            // flush here so that a write failure is reported as a failure
            output.flush();
            success = true;
        }
        catch (XMLStreamException ex) {
            System.out.println(ex);
        }
        catch (IOException ex) {
            System.out.println("IOException while parsing " + inputData);
        }
        finally {
            // release everything even when the conversion was interrupted
            closeQuietly(output);
            closeQuietly(parser);
            closeQuietly(inputData);
        }

        return success;
    }

    /**
     * Run: converts every file of rootDir/txm into rootDir/wtc/hyperprince.wtc,
     * then calls cwb-encode and cwb-makeall to build the HYPERPRINCE corpus in
     * rootDir/data with its registry in rootDir/registry.
     *
     * @param rootDir the root dir
     * @return true if the registry file was created and cwb-makeall was run
     *         without exception, false otherwise
     */
    public boolean run(String rootDir)
    {
        if (cwbLoc == null)
            cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

        //cleaning&preparing
        new File(rootDir,"wtc/hyperprince.wtc").delete();
        new File(rootDir,"wtc/").deleteDir();
        new File(rootDir,"wtc/").mkdir();
        new File(rootDir,"registry/").mkdir();

        //1- Transform into WTC file
        File txmDir = new File(rootDir,"txm");
        File[] listed = txmDir.listFiles();
        if (listed == null) {
            // listFiles() returns null when the directory is missing or unreadable
            println("cannot list the source directory "+txmDir)
            return false;
        }
        // listFiles() gives no ordering guarantee: sort to get a reproducible corpus
        List<File> files = (listed as List<File>).sort { it.getName() };
        for (File f : files) {
            if (!f.isFile()) {
                println("skipping "+f+": not a regular file")
                continue;
            }

            println("process file "+f)
            // strip the extension whatever its length (was: a fixed 4 characters)
            String fileName = f.getName();
            int dot = fileName.lastIndexOf('.');
            String txtname = dot > 0 ? fileName.substring(0, dot) : fileName;

            // File.toURL() is deprecated and does not escape special characters
            def builder = new compiler(f.toURI().toURL(), txtname, "hyperprince", "default");
            builder.setLang lang
            if (!builder.transfomFileWtc(rootDir,"wtc/hyperprince.wtc",f.getName())) {
                println("the file "+f+" could not be fully converted")
            }
        }

        //2- Import into CWB
        def outDir = rootDir;
        def outDirTxm = rootDir;

        CwbEncode cwbEn = new CwbEncode();
        cwbEn.setDebug true;
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbMa.setDebug true;

        String[] pAttributes = ["pos","lemme","id"];
        String[] sAttributes = ["txmcorpus:0+lang", "text:0+auteur+date+id+lang+base+project","front:0+type+id","body:0+type+id","back:0+type+id","head:0+type+id", "seg:0+type+n+id"];

        try {
            String regPath = outDirTxm + "/registry/"+"hyperprince"
            // add the separator only when rootDir does not already end with one:
            // the data path used to be rootDir + "data"
            boolean endsWithSeparator = outDirTxm.endsWith("/") || outDirTxm.endsWith(File.separator);
            String dataPath = endsWithSeparator ? outDirTxm + "data" : outDirTxm + "/data";
            cwbEn.run(cwbLoc + "/cwb-encode", dataPath, outDir + "/wtc/"+"hyperprince.wtc", regPath, pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(cwbLoc + "/cwb-makeall", "HYPERPRINCE", outDirTxm + "/registry");
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        }

        System.out.println("Done.")
        return true;
    }
}