Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / fleurs / compiler.groovy @ 479

History | View | Annotate | Download (7.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.fleurs;
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32
import org.txm.importer.*;
33
import org.txm.scripts.*;
34
import org.txm.scripts.teitxm.BuildTTSrc;
35
import org.txm.scripts.teitxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37

    
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.net.URL;
42
import java.util.HashMap;
43
import java.util.List;
44

    
45
// TODO: Auto-generated Javadoc
46
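/**
 * Converts the XML files of the "fleurs" corpus into a single CWB input
 * file (wtc/fleurs.wtc) and indexes it with cwb-encode and cwb-makeall.
 */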
49
class compiler{

    /** Stream opened on {@link #url} by the constructor; null when opening or parser creation failed. */
    private def inputData;

    /** StAX factory used to create {@link #parser}. */
    private def factory;

    /** StAX reader positioned on {@link #inputData}; null when the constructor failed. */
    private def parser;

    /** Not read or written by any code in this class. */
    private def dir;

    /** Writer on the .wtc file being produced, opened by {@link #createOutput}. */
    private def output;

    /** Location of the XML document to convert. */
    private def url;

    /** Value written in the {@code id} attribute of the {@code text} structure. */
    String text="";

    /** Base name received by the constructor. */
    String base="";

    /** Project name received by the constructor. */
    String project="";

    /** Directory containing the cwb-encode and cwb-makeall executables. */
    String cwbLoc;

    /** Language written in the {@code lang} attribute of the {@code txmcorpus} structure. */
    private String lang ="fr";

    /**
     * Creates a compiler that is only usable through {@link #run}: no source
     * document is opened, so {@link #transfomFileWtc} returns false on it.
     */
    public compiler(){}

    /**
     * Opens the document at {@code url} and prepares a StAX reader on it.
     * Failures are printed on standard output and leave the parser null; they are
     * not rethrown, so callers detect them through the result of
     * {@link #transfomFileWtc}.
     *
     * @param url the location of the XML document to convert
     * @param text the identifier written in the {@code text} structure
     * @param base the base name
     * @param project the project name
     */
    public compiler(URL url,String text,String base, String project)
    {
        this.text = text
        this.base = base;
        this.project = project;
        this.url = url;

        try {
            inputData = url.openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // The stream was opened but cannot be parsed: release it now,
            // nothing else will ever close it.
            closeQuietly(inputData)
            inputData = null
        } catch (IOException ex) {
            System.out.println("IOException while parsing "+url+": "+ex);
        }
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the language code written in the {@code txmcorpus} structure
     * @return the value of the assignment (Groovy implicit return)
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Sets the directory containing the CWB executables. A warning is printed
     * on standard error when the path does not exist, but the value is stored anyway.
     *
     * @param path the new cwb path
     */
    public void setCwbPath(String path)
    {
        if (!new File(path).exists()) {
            System.err.println("CWB Path : "+path+" does not exists")
        }
        cwbLoc = path;
    }

    /**
     * Opens {@link #output} as a UTF-8 writer on the given file. The file is
     * opened in append mode when it already exists, which lets successive
     * source documents accumulate in the same .wtc file.
     *
     * @param dirPathName the parent directory of the file
     * @param fileName the file name, relative to {@code dirPathName}
     * @return true if the writer was opened, false otherwise
     */
    private boolean createOutput(String dirPathName, String fileName){
        try {
            File target = new File(dirPathName, fileName)
            boolean append = target.exists()
            output = new OutputStreamWriter(new FileOutputStream(target, append), "UTF-8")
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Closes a stream, writer or StAX reader, ignoring null and any exception
     * raised by close(): used on cleanup paths where the primary error has
     * already been reported.
     *
     * @param resource any object with a close() method, or null
     */
    private static void closeQuietly(def resource) {
        if (resource == null) return
        try {
            resource.close()
        } catch (Exception ignored) {
            // best-effort cleanup only
        }
    }

    /**
     * Converts the document opened by the constructor to the CWB vertical
     * format and appends it to the given file.
     *
     * Each {@code w} element becomes one tab-separated line
     * (form, id, l, lemme, pos, x, z); {@code recueil} becomes a {@code text}
     * structure; every other element is copied as a tag with its attributes.
     * The writer, the parser and the input stream are always closed on exit.
     *
     * @param dirPathName the directory containing the output file
     * @param fileName the output file name, relative to {@code dirPathName}
     * @return true if the whole document was converted and written, false if
     *         the source could not be opened, the output could not be created,
     *         or a parse/write error occurred
     */
    public boolean transfomFileWtc(String dirPathName, String fileName)
    {
        if (parser == null) {
            System.out.println("No XML parser available for "+url)
            return false
        }
        if (!createOutput(dirPathName, fileName)) {
            closeQuietly(parser)
            closeQuietly(inputData)
            return false
        }

        String vForm = "";
        String wordid= "";
        String w_l= "";
        String w_lemme= "";
        String w_pos= "";
        String w_x= "";
        String w_z= "";

        // true while the parser is between <w> and </w>
        boolean flagForm = false;
        boolean success = false;
        try
        {
            output.write("<txmcorpus lang=\""+lang+"\">\n");
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                switch (event)
                {
                    case XMLStreamConstants.START_ELEMENT:
                        switch (parser.getLocalName())
                        {
                            case "recueil":
                                // NOTE(review): base and project are hard-coded here although the
                                // constructor receives both values; run() passes the same
                                // literals, so they were left unchanged - confirm the intent.
                                output.write("<text id=\""+text+"\" base=\"fleurs\" project=\"default\">\n");
                                break;
                            case "w":
                                flagForm= true;
                                // attributes absent from the element come back as null and
                                // are concatenated as-is when the line is written
                                w_l = parser.getAttributeValue(null,"l");
                                w_lemme = parser.getAttributeValue(null,"lemme");
                                w_pos = parser.getAttributeValue(null,"pos");
                                w_x= parser.getAttributeValue(null,"x");
                                w_z= parser.getAttributeValue(null,"z");
                                wordid= parser.getAttributeValue(null,"id");
                                vForm ="";
                                break;
                            default:
                                // NOTE(review): attribute values are copied without XML escaping.
                                output.write("<"+parser.getLocalName())
                                for (int i = 0; i < parser.getAttributeCount(); i++) {
                                    output.write(" "+parser.getAttributeLocalName(i)+"=\""+parser.getAttributeValue(i)+"\"")
                                }
                                output.write(">\n");
                                break;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        switch (parser.getLocalName())
                        {
                            case "recueil":
                                output.write("</text>\n");
                                break;
                            case "w":
                                flagForm= false;
                                output.write(vForm+"\t"+wordid+"\t"+w_l+"\t"+w_lemme+"\t"+w_pos+"\t"+w_x+"\t"+w_z+"\n")
                                break;
                            default:
                                output.write("</"+parser.getLocalName()+">\n");
                                break;
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        // the parser may deliver the text of one word in several chunks
                        if (flagForm) {
                            vForm += parser.getText().trim();
                        }
                        break;
                }
            }
            output.write("</txmcorpus>");
            // flush explicitly so that a write error is reported here and not
            // hidden by the quiet close in the finally block
            output.flush();
            success = true;
        }
        catch (XMLStreamException ex) {
            System.out.println(ex);
        }
        catch (IOException ex) {
            System.out.println("IOException while parsing " + inputData);
        }
        finally {
            closeQuietly(output)
            closeQuietly(parser)
            closeQuietly(inputData)
        }

        return success;
    }

    /**
     * Builds the CWB corpus "FLEURS" from the XML files found in
     * {@code rootDir/txm}: recreates {@code rootDir/wtc}, converts every file
     * into {@code wtc/fleurs.wtc}, then calls cwb-encode and cwb-makeall.
     *
     * @param rootDir the working directory containing the "txm" source
     *        directory; "wtc" and "registry" are created in it
     * @return true if the registry file was produced and cwb-makeall ran
     *         without exception, false otherwise
     */
    public boolean run(String rootDir)
    {
        //String rootDir = "~/xml/Hyperprince/";
        if (cwbLoc == null) {
            cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;
        }

        new File(rootDir,"wtc/fleurs.wtc").delete();//cleaning&preparing
        new File(rootDir,"wtc/").deleteDir();
        new File(rootDir,"wtc/").mkdir();
        new File(rootDir,"registry/").mkdir();

        //1- Transform into WTC file
        File srcDir = new File(rootDir,"txm");
        File[] files = srcDir.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a readable directory
            println("Error: cannot list the source directory: "+srcDir)
            return false;
        }
        for (File f : files) {
            if (!f.exists()) {
                println("file "+f+ " does not exists")
                continue;
            }
            if (f.isDirectory()) {
                println("skip directory "+f)
                continue;
            }

            println("process file "+f)
            // text id = file name without its last extension, whatever its length
            String fname = f.getName();
            int dot = fname.lastIndexOf('.');
            String txtname = (dot > 0) ? fname.substring(0, dot) : fname;

            def builder = new compiler(f.toURI().toURL(), txtname, "fleurs", "default");
            builder.setLang(lang)
            if (!builder.transfomFileWtc(rootDir,"wtc/fleurs.wtc")) {
                println("Warning: the conversion of "+f+" failed")
            }
        }

        //2- Import into CWB
        CwbEncode cwbEn = new CwbEncode();
        cwbEn.setDebug true;
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbMa.setDebug true;

        String[] pAttributes = ["id","l","lemme","pos","x","z"];
        String[] sAttributes = ["txmcorpus:0+lang", "text:0+id+base+project","poeme:0+titre+genre+type+section+tranche_chrono+date+annee+titre_abr+preorig+schema1+schema2+n","strophe:0+ordre+nb_vers+structure","div:0+type+structure","vers:0+numvers+tphon+stru+n_syl+n_pied+cesure+rime"];

        try {
            String regPath = rootDir + "/registry/"+"fleurs"
            // Built with File so that the "data" directory stays inside rootDir
            // whether or not rootDir ends with a separator.
            String dataPath = new File(rootDir, "data").getPath()
            cwbEn.run(cwbLoc + "/cwb-encode", dataPath, rootDir + "/wtc/"+"fleurs.wtc", regPath,pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(cwbLoc + "/cwb-makeall", "FLEURS", rootDir + "/registry");

        } catch (Exception ex) {System.out.println(ex); return false;}

        System.out.println("Done.")
        return true;
    }
}