Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / alceste / compiler.groovy @ 1094

History | View | Annotate | Download (7.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.alceste;
29

    
30
import org.txm.objects.*
31
import org.txm.importer.cwb.CwbAlign;
32
import org.txm.importer.cwb.CwbEncode
33
import org.txm.importer.cwb.CwbMakeAll
34
import org.txm.scripts.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.importer.scripts.xmltxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38
import org.txm.core.preferences.TXMPreferences
39
import org.txm.libs.cqp.CQPLibPreferences
40
import org.txm.searchengine.cqp.corpus.*
41
import javax.xml.stream.*;
42

    
43
import java.net.URL;
44
import java.io.File;
45
import java.util.Collection;
46
import java.util.HashMap;
47
import java.util.List;
48

    
49
// TODO: Auto-generated Javadoc
50
/**
 * Compiles the files found in a project's txm/&lt;corpus name&gt; directory
 * into a single CQP file, then builds the CWB indexes and the registry
 * file from it.
 *
 * @author mdecorde
 */
57
class compiler {
58
        /** Debug flag; run() forwards it to CwbEncode.setDebug and CwbMakeAll.setDebug. */
59
        boolean debug = false;
60
        
61
        /** Stream opened on the URL by the four-argument constructor. */
62
        private def inputData;
63
        
64
        /** XMLInputFactory instance created by the four-argument constructor. */
65
        private def factory;
66
        
67
        /** StAX reader created over inputData by the four-argument constructor; no method of this class reads it. */
68
        private XMLStreamReader parser;
69
        
70
        /** Not assigned or read by any method of this class. */
71
        private def dir;
72
        
73
        /** Writer assigned by createOutput(File); run() writes the opening and closing corpus tags through it. */
74
        private def output;
75
        
76
        /** URL given to the four-argument constructor. */
77
        private def url;
78
        
79
        /** Not referenced by any method of this class. */
80
        private HashMap<String, String> anahash = new HashMap<String, String>() ;
81
        
82
        /** Value given to the four-argument constructor; empty by default. */
83
        String text = "";
84
        
85
        /** Value given to the four-argument constructor; empty by default. */
86
        String base = "";
87
        
88
        /** Language code written into the lang attribute of the txmcorpus element and passed to XMLTXM2CQP.setLang; "fr" by default. */
89
        String lang = "fr";
90
        
91
        /**
         * Creates a compiler without opening any input: every field keeps
         * the default value given at its declaration.
         */
        public compiler() {
        }
96
        
97
        
98
        /**
         * Instantiates a new compiler and opens a StAX reader on an XML source.
         *
         * A failure to open the stream or to create the reader is reported on
         * System.err and not rethrown; in that case the stream is closed again
         * and the parser field stays null.
         *
         * @param url the location of the XML document to open
         * @param text the value stored in the text property
         * @param base the value stored in the base property
         * @param projectName not used by this constructor
         */
        public compiler(URL url, String text, String base, String projectName)
        {
                this.text = text
                this.base = base
                this.url = url
                try {
                        inputData = url.openStream()
                        factory = XMLInputFactory.newInstance()
                        parser = factory.createXMLStreamReader(inputData)
                } catch (Exception ex) {
                        // The error can come from the URL, the stream or the StAX factory,
                        // so it is not necessarily an IOException: report it with its source.
                        System.err.println("Error while opening "+url+" for XML parsing: "+ex)
                        // Do not leave the stream open when the reader could not be created
                        try {
                                inputData?.close()
                        } catch (IOException ignored) {
                                // nothing more to do, the initial error was already reported
                        }
                        inputData = null
                }
        }
120
        
121
        /** Flag set through setAnnotationSuccess(boolean); no method of this class reads it. */
122
        boolean annotationSuccess = false;
123
        
124
        /**
         * Records whether the annotation step succeeded.
         *
         * @param val the value to store in the annotationSuccess flag
         */
        public void setAnnotationSuccess(boolean val) {
                this.annotationSuccess = val
        }
133
        
134
        /**
         * Opens a UTF-8 writer on a file and stores it in the output field.
         *
         * The file is opened in append mode when it already exists, and is
         * created otherwise. Any exception is printed on System.err.
         *
         * @param f the file to write to
         * @return true if the writer was opened, false if an exception occurred
         */
        private boolean createOutput(File f) {
                try {
                        // append only when there is already something to append to
                        boolean appendMode = f.exists()
                        def rawStream = new FileOutputStream(f, appendMode)
                        output = new OutputStreamWriter(rawStream, "UTF-8")
                        return true
                } catch (Exception e) {
                        System.err.println(e)
                        return false
                }
        }
149
        
150
        /**
         * Sets the language code of the corpus.
         *
         * @param lang the language code to store in the lang property
         * @return the value of the assignment, since the method declares no
         *         return type and Groovy returns its last expression
         */
        public setLang(String lang) {
                this.lang = lang
        }
160
        
161
        /**
         * Builds the CQP file of a project and indexes it with CWB.
         *
         * Steps, in order:
         * 1. check that CwbEncode and CwbMakeAll report their executable as available;
         * 2. delete the corpus already known to the project under its name, if any,
         *    and create a new MainCorpus;
         * 3. write &lt;project dir&gt;/cqp/&lt;corpus name&gt;.cqp: an opening txmcorpus tag,
         *    then each file of &lt;project dir&gt;/txm/&lt;corpus name&gt; handed to
         *    XMLTXM2CQP.transformFile (expected to append to the CQP file), then
         *    the closing tag;
         * 4. run CwbEncode, check that the registry file exists, run CwbMakeAll.
         *
         * @param project the project to compile; gives the working directory and the corpus name
         * @return true if the corpus was encoded and indexed; false if the CWB executables
         *         are not available, if there is no file to compile, if the CQP file cannot
         *         be written, if the registry file is missing or if a CWB step throws
         */
        public boolean run(Project project)
        {
                File binDir = project.getProjectDirectory()
                String corpusname = project.getName()
                File txmDir = new File(binDir, "txm/"+corpusname)
                if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
                        println ("Error: CWB executables not well set.")
                        return false
                }

                CorpusBuild corpus = project.getCorpusBuild(corpusname)
                if (corpus != null) {
                        println "CLEAN PREVIOUS CORPUS"
                        corpus.delete() // remove old files
                }

                // make new one
                corpus = new MainCorpus(project)
                corpus.setID(corpusname)
                corpus.setName(corpusname)
                corpus.setDescription("Build the alceste import module")

                File cqpFile = new File(binDir, "cqp/"+corpusname+".cqp")
                new File(binDir, "cqp").mkdirs()
                new File(binDir, "data").mkdirs()
                new File(binDir, "registry").mkdirs()

                List<File> files = txmDir.listFiles()
                if (files == null || files.size() == 0) {
                        println "no file to compile"
                        return false
                }

                // start corpus
                // createOutput opens an existing file in append mode, so a CQP file left
                // over from a previous build must be removed before the opening tag is written
                if (cqpFile.exists() && !cqpFile.delete()) {
                        println "Error: could not remove the previous CQP file: "+cqpFile
                        return false
                }
                if (!appendToCqpFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
                        println "Error: could not write to the CQP file: "+cqpFile
                        return false
                }

                Collections.sort(files)
                //1- Transform into CQP file
                XMLTXM2CQP cqpbuilder = null
                println("process "+files.size()+" files")
                for (File f : files) {
                        print "."
                        if (!f.exists()) {
                                println("file "+f+ " does not exists")
                                continue
                        }

                        cqpbuilder = new XMLTXM2CQP(f.toURI().toURL())
                        // The text name is the file name without its 4-character extension;
                        // names too short to carry one are used unchanged instead of failing
                        String fileName = f.getName()
                        String txtname = fileName.length() > 4 ? fileName.substring(0, fileName.length()-4) : fileName
                        cqpbuilder.setTextInfo(txtname, corpusname, "default")

                        cqpbuilder.setBalisesToKeep(["text", "s", "p", "lb"])
                        cqpbuilder.setSendToPAttributes(["lb":["n"], "p":["n"], "s":["n"]])
                        cqpbuilder.setLang(lang)
                        if (!cqpbuilder.transformFile(cqpFile)) {
                                println("Failed to compile "+f)
                        }
                }
                println ""

                // end corpus
                if (!appendToCqpFile(cqpFile, "</txmcorpus>\n")) {
                        println "Error: could not write to the CQP file: "+cqpFile
                        return false
                }

                if (cqpbuilder == null) {
                        println "there was no files in bin dir : "+txmDir
                        return false
                }

                //2- Import into CWB
                def outDir = binDir.getAbsolutePath()
                CwbEncode cwbEn = new CwbEncode()
                cwbEn.setDebug(debug)
                CwbMakeAll cwbMa = new CwbMakeAll()
                cwbMa.setDebug(debug)

                // The attribute lists come from the last XMLTXM2CQP instance used in the loop
                List<String> pAttributesList = cqpbuilder.getpAttributs()
                List<String> sAttributesList = cqpbuilder.getsAttributs()
                println "pAttrs : "+pAttributesList
                println "sAttrs : "+sAttributesList
                String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
                String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

                try {
                        String regPath = outDir + "/registry/"+corpusname.toLowerCase() // CQP wants lowercase registry name
                        cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath, pAttributes, sAttributes)
                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false
                        }
                        cwbMa.run(corpusname.toUpperCase(), outDir + "/registry")
                } catch (Exception ex) {
                        System.out.println(ex)
                        return false
                }

                return true
        }

        /**
         * Writes a fragment to the CQP file through createOutput(File) and
         * closes the writer even when the write fails.
         *
         * @param cqpFile the CQP file to write to
         * @param fragment the text to write
         * @return false if the file could not be opened, true once the fragment is written
         */
        private boolean appendToCqpFile(File cqpFile, String fragment)
        {
                if (!createOutput(cqpFile)) {
                        return false
                }
                try {
                        output.write(fragment)
                } finally {
                        output.close()
                }
                return true
        }
271
        
272
        /**
         * Switches the debug flag on. This no-argument method can only
         * enable the flag, it never disables it.
         */
        public void setDebug() {
                this.debug = true
        }
279
        
280
        /**
         * Stand-alone entry point. It only reports that the compiler cannot be
         * started from the command line.
         *
         * The only run method of this class is run(Project), and the class has
         * no way to build a Project from a path, so there is nothing useful to
         * execute here.
         *
         * @param args the command line arguments (ignored)
         */
        public static void main(String[] args)
        {
                System.err.println("compiler: stand-alone execution is not supported, call run(Project) with a TXM project instead.")
        }
293
}