Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / alceste / compiler.groovy @ 187

History | View | Annotate | Download (7.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (Thu, 26 May 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.alceste;
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32
import org.txm.importer.*;
33
import org.txm.scripts.*;
34
import org.txm.scripts.teitxm.BuildTTSrc;
35
import org.txm.scripts.teitxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37

    
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.util.Collection;
42
import java.util.HashMap;
43
import java.util.List;
44

    
45
// TODO: Auto-generated Javadoc
46
/**
47
 * build wtc
48
 * build CWB indexes
49
 * create registry file.
50
 *
51
 * @author mdecorde
52
 */
53
class compiler 
54
{
55
        /** Debug flag: switched on by setDebug() and forwarded to CwbEncode and CwbMakeAll in run(). */
56
        private boolean debug = false;
57
        
58
        /** Input stream opened on the url by the 4-argument constructor. */
59
        private def inputData;
60
        
61
        /** XMLInputFactory instance used by the 4-argument constructor to create the parser. */
62
        private def factory;
63
        
64
        /** StAX reader created over inputData by the 4-argument constructor. */
65
        private XMLStreamReader parser;
66
        
67
        /** Not referenced by any method visible in this class. */
68
        private def dir;
69
        
70
        /** UTF-8 writer (re)opened by createOutput(); run() uses it to write the corpus start and end tags. */
71
        private def output;
72
        
73
        /** Source URL received by the 4-argument constructor. */
74
        private def url;
75
        
76
        /** Not referenced by any method visible in this class. */
77
        private HashMap<String, String> anahash = new HashMap<String, String>() ;
78
        
79
        /** Value received by the 4-argument constructor; not read elsewhere in the visible code. */
80
        String text = "";
81
        
82
        /** Value received by the 4-argument constructor; not read elsewhere in the visible code. */
83
        String base = "";
84
        
85
        /** Value received by the 4-argument constructor; not read elsewhere in the visible code. */
86
        String project = "";
87
        
88
        /** Never given a non-null value by the visible code (run() has a parameter of the same name but does not store it). */
89
        String[] textAttributes = null;
90
        
91
        /** Path under which run() looks for the cwb-encode and cwb-makeall executables. */
92
        String cwbLoc;
93
        
94
        /** Language code written in the txmcorpus start tag and given to the WTC builder; defaults to "fr". */
95
        String lang = "fr";
96
        
97
        /**
         * Creates a compiler without any input: no URL is opened and every
         * field keeps its declared default value.
         */
        public compiler() {
        }
102
        
103
        
104
        /**
         * Instantiates a new compiler reading from the given URL.
         *
         * A stream is opened on the url and a StAX reader is created over it.
         * Any failure is reported on stderr and is NOT rethrown: the instance
         * is then left without a parser and without an open stream.
         * The textAttributes field is left untouched (this constructor has no
         * such parameter).
         *
         * @param url the url to open
         * @param text the text
         * @param base the base
         * @param project the project
         */
        public compiler(URL url, String text, String base, String project)
        {
                this.text = text;
                this.base = base;
                this.project = project;
                this.url = url;
                try {
                        inputData = url.openStream();
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (Exception ex) {
                        // any kind of exception ends here (null url, I/O error, XML stream error)
                        System.err.println("Error while opening "+url+": "+ex);
                        // do not leak the stream when the reader could not be created
                        if (inputData != null) {
                                try {
                                        inputData.close();
                                } catch (Exception ignored) {
                                        // nothing more can be done: the failure was already reported
                                }
                                inputData = null;
                        }
                }
        }
128
        
129
        /** Flag stored by setAnnotationSuccess(); false until that method is called. */
        boolean annotationSuccess = false;

        /**
         * Stores the given value in the annotationSuccess flag.
         *
         * @param val the new annotation success
         */
        public void setAnnotationSuccess(boolean val) {
                annotationSuccess = val;
        }
141
        
142
        /**
         * Sets the CWB path, i.e. the location where run() looks for the
         * cwb-encode and cwb-makeall executables.
         * A warning is printed on stderr when the path does not exist, but the
         * value is stored in any case.
         *
         * @param path the new cwb path
         */
        public void setCwbPath(String path) {
                File candidate = new File(path);
                boolean missing = !candidate.exists();
                if (missing) {
                        System.err.println("CWB Path : "+path+" does not exists");
                }
                this.cwbLoc = path;
        }
153
        
154
        /**
         * Opens the UTF-8 writer "output" on the given file.
         * If the file already exists the writer appends to it, otherwise the
         * file is created. Any exception is printed on stderr and turned into
         * a false result.
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if an exception occurred
         */
        private boolean createOutput(File f) {
                boolean opened = false;
                try {
                        // append only when there is already something to append to
                        boolean append = f.exists();
                        FileOutputStream stream = new FileOutputStream(f, append);
                        output = new OutputStreamWriter(stream, "UTF-8");
                        opened = true;
                } catch (Exception e) {
                        System.err.println(e);
                }
                return opened;
        }
169
        
170
        /**
         * Sets the language code used by run() (written in the txmcorpus start
         * tag and given to the WTC builder).
         *
         * @param lang the lang
         * @return the value that was just assigned (the method is declared
         *         without a return type, so the assignment is its result)
         */
        public setLang(String lang) {
                return (this.lang = lang);
        }
180
        
181
        /**
         * Builds the CWB corpus from the files of a directory.
         *
         * Steps:
         * 1- every file of txmDir (in sorted order) is transformed with
         *    XMLTXM2WTC and appended to binDir/wtc/corpusname.wtc, between a
         *    txmcorpus start tag and end tag;
         * 2- cwb-encode then cwb-makeall are run to create the indexes in
         *    binDir/data/corpusname and the registry file in binDir/registry.
         *
         * The "wtc" and "data/corpusname" subdirectories of binDir are deleted
         * and recreated; "registry" is created if needed.
         * If the CWB path was not set, it is read from the Toolbox parameters.
         *
         * @param binDir the binary directory of the corpus, must exist
         * @param txmDir the directory containing the files to compile
         * @param corpusname the corpus name (lowercased for the registry file, uppercased for cwb-makeall)
         * @param textAttributes not used by this method
         * @return true, if successful
         */
        public boolean run(File binDir, File txmDir, String corpusname, String[] textAttributes)
        {
                if (cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if (!new File(cwbLoc).exists()) {
                        println ("CWB path error: "+cwbLoc)
                        return false;
                }
                if (!binDir.exists()) {
                        println ("binary directory does not exists: "+binDir)
                        return false;
                }
                File wtcDir = new File(binDir, "wtc");
                File dataDir = new File(binDir, "data/$corpusname");
                File wtcFile = new File(wtcDir, corpusname+".wtc");
                wtcDir.deleteDir();
                wtcDir.mkdir();
                dataDir.deleteDir();
                // mkdirs() and not mkdir(): the parent "data" directory may not exist yet
                dataDir.mkdirs();
                new File(binDir, "registry").mkdir();

                List<File> files = txmDir.listFiles();
                if (files == null || files.size() == 0) {
                        println "no file to compile"
                        return false;
                }

                //start corpus
                if (!appendToWtc(wtcFile, "<txmcorpus lang=\""+lang+"\">\n")) {
                        // without its start tag the WTC file would be malformed
                        println "Failed to write the corpus start tag in "+wtcFile
                        return false;
                }

                Collections.sort(files);
                //1- Transform into WTC file
                XMLTXM2WTC wtcbuilder = null;
                println("process "+files.size()+" files")
                for (File f : files) {
                        print "."
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                        } else {
                                wtcbuilder = new XMLTXM2WTC(f.toURI().toURL());
                                // text name = file name without its last 4 characters (e.g. ".xml")
                                String txtname = f.getName().substring(0, f.getName().length()-4);
                                wtcbuilder.setTextInfo(txtname, corpusname, "default");

                                wtcbuilder.setBalisesToKeep(["text", "s", "p", "lb"]);
                                wtcbuilder.setSendToPAttributes(["lb":["n"], "p":["n"], "s":["n"]])
                                wtcbuilder.setLang(lang);
                                if (!wtcbuilder.transformFile(wtcFile)) {
                                        println("Failed to compile "+f)
                                }
                        }
                }
                println ""

                //end corpus
                if (!appendToWtc(wtcFile, "</txmcorpus>\n")) {
                        println "Failed to write the corpus end tag in "+wtcFile
                        return false;
                }

                // the last builder is needed below to get the attribute lists
                if (wtcbuilder == null) {
                        println "there was no files in bin dir : "+txmDir
                        return false;
                }

                //2- Import into CWB
                String rootDir = binDir.getAbsolutePath();
                CwbEncode cwbEn = new CwbEncode();
                cwbEn.setDebug(debug);
                CwbMakeAll cwbMa = new CwbMakeAll();
                cwbMa.setDebug(debug);

                List<String> pAttributesList = wtcbuilder.getpAttributs();
                List<String> sAttributesList = wtcbuilder.getsAttributs();
                println "pAttrs : "+pAttributesList
                println "sAttrs : "+sAttributesList
                String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
                String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

                try {
                        String regPath = rootDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lowercase registry name
                        cwbEn.run(new File(cwbLoc,"cwb-encode").getAbsolutePath(), rootDir + "/data/${corpusname}", rootDir + "/wtc/"+corpusname+".wtc", regPath,pAttributes, sAttributes);
                        if (!new File(regPath).exists()) {
                                println "The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(new File(cwbLoc,"cwb-makeall").getAbsolutePath(), corpusname.toUpperCase(), rootDir + "/registry");
                } catch (Exception ex) {System.out.println(ex); return false;}

                return true;
        }

        /**
         * Appends a piece of text to the WTC file through createOutput(); the
         * writer is closed even if the write fails.
         *
         * @param wtcFile the file to append to
         * @param content the text to append
         * @return false if the writer could not be opened, true otherwise
         */
        private boolean appendToWtc(File wtcFile, String content)
        {
                if (!createOutput(wtcFile)) {
                        return false;
                }
                try {
                        output.write(content);
                } finally {
                        output.close();
                }
                return true;
        }
284
        
285
        /**
         * Switches the debug flag on. The flag is forwarded to CwbEncode and
         * CwbMakeAll by run(); this method offers no way to switch it off.
         */
        public void setDebug() {
                debug = true;
        }
292
        
293
        /**
         * Sample entry point: compiles the "geo" corpus located in the
         * xml/geo directory of the user's home, with the CWB binaries taken
         * from TXM/cwb/bin in the same home directory.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args)
        {
                // java.io.File does not expand "~": resolve the paths against user.home
                File home = new File(System.getProperty("user.home"));
                File binDir = new File(home, "xml/geo");
                // NOTE(review): assumes the files to compile are in a "txm" subdirectory of binDir - confirm
                File txmDir = new File(binDir, "txm");
                def c = new compiler();
                c.setDebug();
                c.setCwbPath(new File(home, "TXM/cwb/bin").getAbsolutePath());
                // run() expects (binDir, txmDir, corpusname, textAttributes)
                c.run(binDir, txmDir, "geo", new String[0]);
        }
306
}