Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / hyperbase / compiler.groovy @ 187

History | View | Annotate | Download (7.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (Thu, 26 May 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.hyperbase;
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32
import org.txm.importer.*;
33
import org.txm.scripts.*;
34
import org.txm.scripts.teitxm.BuildTTSrc;
35
import org.txm.scripts.teitxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37

    
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.util.HashMap;
42
import java.util.List;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds a CWB corpus from a directory of TXM-XML files.
 *
 * {@link #run(File, File, String)} concatenates every file of the source
 * directory into a single <code>wtc/&lt;corpusname&gt;.wtc</code> file wrapped in a
 * <code>txmcorpus</code> element (through XMLTXM2WTC), then calls
 * <code>cwb-encode</code> and <code>cwb-makeall</code> on it.
 */
class compiler
{
    /** Debug flag forwarded to the CwbEncode and CwbMakeAll wrappers. */
    private boolean debug = false;

    /** Stream opened on {@link #url} by the URL constructor. */
    private def inputData;

    /** StAX factory used to create {@link #parser}. */
    private def factory;

    /** StAX reader over {@link #inputData}; stays null with the no-arg constructor. */
    private XMLStreamReader parser;

    /** Not read or written by the code of this class. */
    private def dir;

    /** Writer most recently opened by {@link #createOutput(File)}. */
    private def output;

    /** URL given to the URL constructor. */
    private def url;

    /** Not read or written by the code of this class. */
    private HashMap<String,String> anahash = new HashMap<String,String>();

    /** Text name given to the URL constructor. */
    String text = "";

    /** Base name given to the URL constructor. */
    String base = "";

    /** Project name given to the URL constructor. */
    String project = "";

    /** Text attributes; never assigned by this class. */
    String[] textAttributes = null;

    /** Directory containing the cwb-encode and cwb-makeall executables. */
    String cwbLoc;

    /** Language written in the <code>lang</code> attribute of the corpus and given to XMLTXM2WTC. */
    private String lang = "fr";

    /** Annotation status flag; only stored by this class. */
    boolean annotationSuccess = false;

    /**
     * Creates a compiler with no input stream; enough to call {@link #run(File, File, String)}.
     */
    public compiler() {}

    /**
     * Creates a compiler and opens a StAX reader on the given URL.
     *
     * Failures are reported on the console and leave {@link #parser} null.
     *
     * @param url the url of the XML document to read
     * @param text the text
     * @param base the base
     * @param project the project
     */
    public compiler(URL url, String text, String base, String project)
    {
        this.text = text;
        this.base = base;
        this.project = project;
        // the former "this.textAttributes = textAttributes" was a self-assignment
        // (no such parameter exists) and has been removed
        this.url = url;
        try {
            inputData = url.openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // the stream was opened but no reader will ever consume it
            try {
                if (inputData != null) inputData.close();
            } catch (IOException ignored) {
                // nothing more can be done for a stream that is being discarded
            }
        } catch (IOException ex) {
            System.err.println("IOException while parsing " + url + ": " + ex);
        }
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the lang
     * @return the assigned value
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Sets the annotation success flag.
     *
     * @param value the value
     * @return the assigned value
     */
    public setAnnotationSuccess(boolean value)
    {
        this.annotationSuccess = value
    }

    /**
     * Sets the directory of the CWB executables.
     *
     * A warning is printed when the path does not exist, but the value is
     * stored anyway; {@link #run(File, File, String)} checks it again.
     *
     * @param path the new cwb path
     */
    public void setCwbPath(String path)
    {
        if (!new File(path).exists()) {
            System.err.println("CWB Path : "+path+" does not exists")
        }
        cwbLoc = path;
    }

    /**
     * Opens {@link #output} as a UTF-8 writer on the given file.
     *
     * The file is opened in append mode when it already exists, so that the
     * corpus header, the texts and the corpus footer can be written by
     * successive openings of the same file.
     *
     * @param f the file to write
     * @return true, if successful
     */
    private boolean createOutput(File f)
    {
        try {
            output = new OutputStreamWriter(new FileOutputStream(f, f.exists()), "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Appends a string to the given file and closes the writer in every case.
     *
     * @param f the file to append to
     * @param content the text to append
     * @return true if the content was written and the writer closed without error
     */
    private boolean appendToFile(File f, String content)
    {
        if (!createOutput(f)) return false;
        try {
            output.write(content);
            output.close();
            return true;
        } catch (IOException e) {
            System.err.println(e);
            try {
                output.close();
            } catch (IOException ignored) {
                // the write failure has already been reported
            }
            return false;
        }
    }

    /**
     * Advances {@link #parser} until the end tag of <code>teiHeader</code>
     * or the end of the document.
     */
    private void GoToText()
    {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
                return;
            }
        }
    }

    /**
     * Builds the WTC file from the files of <code>txmDir</code> and encodes it with CWB.
     *
     * The <code>wtc</code> and <code>data</code> sub-directories of
     * <code>binDir</code> are deleted and recreated; <code>registry</code> is
     * created if missing.
     *
     * @param binDir the binary directory of the corpus (must exist)
     * @param txmDir the directory containing the source files to compile
     * @param corpusname the corpus name, used for the WTC, data and registry names
     * @return true, if successful
     */
    public boolean run(File binDir, File txmDir, String corpusname)
    {
        String rootDir = binDir.getAbsolutePath();

        if (cwbLoc == null)
            cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

        if (!new File(cwbLoc).exists()) {
            println ("CWB path error: "+cwbLoc)
            return false;
        }
        if (!binDir.exists()) {
            println ("binary directory does not exists: "+rootDir)
            return false;
        }

        // File.listFiles() returns null when the path is not a readable directory;
        // check it before the previous results are deleted
        File[] listed = txmDir.listFiles();
        if (listed == null) {
            println ("source directory cannot be listed: "+txmDir)
            return false;
        }
        List<File> files = new ArrayList<File>(Arrays.asList(listed));
        Collections.sort(files);

        File wtcFile = new File(binDir, "wtc/"+corpusname+".wtc");
        new File(binDir, "wtc").deleteDir();
        new File(binDir, "wtc").mkdir();
        new File(binDir, "data").deleteDir();
        new File(binDir, "data").mkdir();
        new File(binDir, "registry").mkdir();

        //start corpus
        if (!appendToFile(wtcFile, "<txmcorpus lang=\""+lang+"\">\n")) {
            println ("Failed to write the corpus header in "+wtcFile)
            return false;
        }

        //1- Transform into WTC file
        XMLTXM2WTC wtcbuilder = null;
        println("Compiling "+files.size()+" files")
        for (File f : files) {
            print "."
            if (!f.exists()) {
                println("file "+f+ " does not exists")
                continue;
            }

            wtcbuilder = new XMLTXM2WTC(f.toURI().toURL());
            // text name = file name without its 4-character extension (".xml");
            // names too short to carry one are used as is instead of throwing
            String filename = f.getName();
            String txtname = filename.length() > 4 ? filename.substring(0, filename.length()-4) : filename;
            // NOTE(review): the literal "project" is passed, not the 'project' field - confirm this is intended
            wtcbuilder.setTextInfo(txtname, corpusname, "project");

            wtcbuilder.setBalisesToKeep(["text","p","s"]);
            wtcbuilder.setSendToPAttributes(["s":["n"]]);
            wtcbuilder.setLang(lang);
            if (!wtcbuilder.transformFile(wtcFile)) {
                println("Failed to compile "+f)
            }
        }
        println ""

        //end corpus
        if (!appendToFile(wtcFile, "</txmcorpus>\n")) {
            println ("Failed to write the corpus footer in "+wtcFile)
            return false;
        }

        // no file was processed: there are no attributes to declare to CWB
        if (wtcbuilder == null) return false;

        //2- Import into CWB
        def outDir = rootDir;

        CwbEncode cwbEn = new CwbEncode();
        cwbEn.setDebug(debug);
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbMa.setDebug(debug);

        // attribute lists are taken from the builder of the last processed file
        List<String> pAttributesList = wtcbuilder.getpAttributs();
        List<String> sAttributesList = wtcbuilder.getsAttributs();
        println "word properties : "+pAttributesList
        println "structures : "+sAttributesList
        String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
        String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

        try {
            String regPath = outDir + "/registry/"+corpusname.toLowerCase();
            cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(), outDir + "/data/$corpusname", outDir + "/wtc/"+corpusname+".wtc", regPath, pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), corpusname, outDir + "/registry");
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        }

        return true;
    }

    /**
     * Turns the debug mode on.
     */
    public void setDebug()
    {
        this.debug = true;
    }

    /**
     * Manual test entry point: compiles the "geo" corpus found under the
     * user's home directory.
     *
     * @param args the arguments (ignored)
     */
    public static void main(String[] args)
    {
        // java.io.File does not expand "~": resolve the home directory explicitly
        String home = System.getProperty("user.home");
        File dir = new File(home, "xml/geo");
        def c = new compiler();
        c.setDebug();
        c.setCwbPath(new File(home, "TXM/cwb/bin").getAbsolutePath());
        // the former call run(dir, "geo") matched no existing method;
        // NOTE(review): the source files are assumed to be in the "txm" sub-directory - confirm
        c.run(dir, new File(dir, "txm"), "geo");
    }
}