Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / hyperbase / compiler.groovy @ 1000

History | View | Annotate | Download (6.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.hyperbase;
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32
import org.txm.scripts.importer.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.scripts.xmltxm.*;
35
import org.txm.utils.treetagger.TreeTagger;
36

    
37
import javax.xml.stream.*;
38
import java.net.URL;
39
import java.io.File;
40
import java.util.HashMap;
41
import java.util.List;
42

    
43
/**
 * Compiler step of the Hyperbase import.
 *
 * {@link #run(File, File, String)} asks an XMLTXM2CQP builder to transform
 * every file of the XML-TXM directory into one shared ".cqp" file wrapped in
 * a "txmcorpus" element, then runs CwbEncode and CwbMakeAll on the result.
 */
class compiler
{
    /** When true, the debug flag is forwarded to CwbEncode and CwbMakeAll. */
    private boolean debug = false;

    /** Input stream opened on {@link #url} by the 4-argument constructor. */
    private def inputData;

    /** StAX factory used to create {@link #parser}. */
    private def factory;

    /** StAX reader over {@link #inputData}; only read by {@link #GoToText()}. */
    private XMLStreamReader parser;

    /** The dir (not used by the code of this class). */
    private def dir;

    /** Writer over the file currently being written, set by {@link #createOutput(File)}. */
    private def output;

    /** URL given to the 4-argument constructor. */
    private def url;

    /** The anahash (not used by the code of this class). */
    private HashMap<String,String> anahash = new HashMap<String,String>();

    /** The text. */
    String text = "";

    /** The base. */
    String base = "";

    /** The project. */
    String project = "";

    /** The text attributes; never assigned by this class. */
    String[] textAttributes = null;

    /** Language written in the "lang" attribute of the corpus element. */
    private String lang = "fr";

    /** The annotation success flag. */
    boolean annotationSuccess = false;

    /**
     * Creates a compiler without input stream; enough to call
     * {@link #run(File, File, String)}.
     */
    public compiler() {}

    /**
     * Instantiates a new compiler and opens a StAX parser on the given URL.
     *
     * Failures to open or parse the URL are reported on the console and leave
     * the parser unset; they are not rethrown.
     *
     * @param url the url of the XML document to read
     * @param text the text
     * @param base the base
     * @param project the project
     */
    public compiler(URL url, String text, String base, String project)
    {
        this.text = text;
        this.base = base;
        this.project = project;
        this.url = url;
        try {
            inputData = url.openStream();

            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // the stream was opened but cannot be parsed: do not leak it
            if (inputData != null) {
                try {
                    inputData.close();
                } catch (IOException ignored) {
                    // nothing more can be done for a stream we are discarding
                }
            }
        } catch (IOException ex) {
            System.err.println("IOException while parsing " + url + ": " + ex);
        }
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the language, written as-is in the "lang" attribute of the
     *        corpus element and given to the CQP builder
     * @return the assigned value (the method has no declared return type, so
     *         Groovy returns the value of its last expression)
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Sets the annotation success flag.
     *
     * @param value the new value
     * @return the assigned value (the method has no declared return type, so
     *         Groovy returns the value of its last expression)
     */
    public setAnnotationSuccess(boolean value)
    {
        this.annotationSuccess = value
    }

    /**
     * Opens {@link #output} as a UTF-8 writer on the given file.
     * The file is opened in append mode when it already exists.
     *
     * @param f the file to write
     * @return true if the writer was opened, false if an exception occurred
     */
    private boolean createOutput(File f)
    {
        try {
            output = new OutputStreamWriter(new FileOutputStream(f, f.exists()), "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Writes the given content with {@link #createOutput(File)} (so it is
     * appended when the file exists) and always closes the writer.
     *
     * @param f the file to write
     * @param content the text to write
     * @return true if the file could be opened and written
     */
    private boolean appendToFile(File f, String content)
    {
        if (!createOutput(f)) {
            return false;
        }
        try {
            output.write(content);
        } finally {
            // close even when write() throws so the file handle is not leaked
            output.close();
        }
        return true;
    }

    /**
     * Advances the parser until the end tag of the "teiHeader" element, or
     * until the end of the document if there is none.
     */
    private void GoToText()
    {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT) {
                if (parser.getLocalName().equals("teiHeader")) {
                    return;
                }
            }
        }
    }

    /**
     * Builds the CQP file from the XML-TXM files, then encodes it with CWB.
     *
     * The "cqp" and "data" sub-directories of binDir are deleted and created
     * again; "registry" is created if needed.
     *
     * @param binDir the binary directory of the corpus; must exist
     * @param txmDir the directory containing the XML-TXM files to compile
     * @param corpusname the corpus name, used for the ".cqp" file, the data
     *        directory and (lower-cased) the registry file
     * @return true, if successful
     */
    public boolean run(File binDir, File txmDir, String corpusname)
    {
        String rootDir = binDir.getAbsolutePath();

        if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
            println ("Error: CWB executables not well set.")
            return false;
        }
        if (!binDir.exists()) {
            println ("binary directory does not exists: "+rootDir)
            return false;
        }
        // checked before anything is deleted: an unusable source directory
        // must not wipe the previous "cqp" and "data" directories
        if (txmDir == null || !txmDir.isDirectory()) {
            println ("Error: XML-TXM directory does not exist or is not a directory: "+txmDir)
            return false;
        }

        File cqpFile = new File(binDir, "cqp/"+corpusname+".cqp");
        new File(binDir, "cqp").deleteDir();
        new File(binDir, "cqp").mkdir();
        new File(binDir, "data").deleteDir();
        new File(binDir, "data").mkdir();
        new File(binDir, "registry").mkdir();

        //start corpus
        if (!appendToFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
            println ("Error: could not write the corpus header in "+cqpFile)
            return false;
        }

        // listFiles() returns null on I/O error, not an empty array
        File[] listing = txmDir.listFiles();
        if (listing == null) {
            println ("Error: could not list the files of "+txmDir)
            return false;
        }
        List<File> files = new ArrayList<File>(Arrays.asList(listing));
        Collections.sort(files);

        //1- Transform into CQP file
        XMLTXM2CQP cqpbuilder = null;
        println("Compiling "+files.size()+" files")
        for (File f : files) {
            print "."
            if (!f.exists()) {
                println("file "+f+ " does not exists")
                continue;
            }

            cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());

            // text name = file name without its extension; names without
            // extension are kept whole instead of being blindly cut by 4 chars
            String fileName = f.getName();
            int dot = fileName.lastIndexOf('.');
            String txtname = (dot > 0) ? fileName.substring(0, dot) : fileName;

            // NOTE(review): the literal "project" is passed here, not the
            // 'project' field of this class - confirm this is intended
            cqpbuilder.setTextInfo(txtname, corpusname, "project");

            cqpbuilder.setBalisesToKeep(["text","p","s"]);
            cqpbuilder.setSendToPAttributes(["s":["n"]]);
            cqpbuilder.setLang(lang);
            if (!cqpbuilder.transformFile(cqpFile)) {
                println("Failed to compile "+f)
            }
        }
        println ""

        //end corpus
        if (!appendToFile(cqpFile, "</txmcorpus>\n")) {
            println ("Error: could not write the corpus footer in "+cqpFile)
            return false;
        }

        // no file was processed: nothing to encode
        if (cqpbuilder == null) return false;

        //2- Import into CWB
        def outDir = rootDir;

        CwbEncode cwbEn = new CwbEncode();
        cwbEn.setDebug(debug);
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbMa.setDebug(debug);

        // the attribute lists are read from the builder of the last file
        List<String> pAttributesList = cqpbuilder.getpAttributs();
        List<String> sAttributesList = cqpbuilder.getsAttributs();
        println "word properties : "+pAttributesList
        println "structures : "+sAttributesList
        String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
        String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

        try {
            // Locale.ROOT keeps the registry name independent from the
            // platform locale (e.g. Turkish dotless i)
            String regPath = outDir + "/registry/"+corpusname.toLowerCase(Locale.ROOT);
            cwbEn.run(outDir + "/data/$corpusname", outDir + "/cqp/"+corpusname+".cqp", regPath, pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(corpusname, outDir + "/registry");
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        }

        return true;
    }

    /**
     * Enables the debug mode.
     */
    public void setDebug()
    {
        this.debug = true;
    }

    /**
     * Manual test entry point.
     *
     * @param args optional: [0] binary directory (default: "xml/geo" in the
     *        user home), [1] XML-TXM directory (default: "txm" inside the
     *        binary directory - assumed layout, adjust if needed),
     *        [2] corpus name (default: "geo")
     */
    public static void main(String[] args)
    {
        // java.io.File does not expand "~": resolve the home directory explicitly
        File home = new File(System.getProperty("user.home"));
        File binDir = (args.length > 0) ? new File(args[0]) : new File(home, "xml/geo");
        File txmDir = (args.length > 1) ? new File(args[1]) : new File(binDir, "txm");
        String corpusname = (args.length > 2) ? args[2] : "geo";

        def c = new compiler();
        c.setDebug();
        if (!c.run(binDir, txmDir, corpusname)) {
            System.err.println("Compilation of corpus "+corpusname+" failed.");
        }
    }
}