// Source: root/tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/hyperbase/compiler.groovy @ revision 1804 (6.6 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
// $LastChangedRevision: 3219 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.importer.hyperbase;
29

    
30
import org.txm.importer.cwb.CwbEncode
31
import org.txm.importer.cwb.CwbMakeAll
32
import org.txm.scripts.importer.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.scripts.xmltxm.*;
35
import org.txm.utils.treetagger.TreeTagger;
36
import org.txm.objects.*
37
import org.txm.searchengine.cqp.corpus.*
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.util.HashMap;
42
import java.util.List;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Compiler step of the Hyperbase import module.
 *
 * Concatenates the XML-TXM files of a directory into a single CQP file
 * wrapped in a <txmcorpus> element, then indexes that file with the CWB
 * tools (CwbEncode followed by CwbMakeAll).
 */
class compiler
{
    /** Debug flag forwarded to CwbEncode and CwbMakeAll. */
    private boolean debug = false;

    /** The input data (not used by the code of this class). */
    private def inputData;

    /** The factory (not used by the code of this class). */
    private def factory;

    /** The StAX parser read by GoToText(); nothing in this class assigns it. */
    private XMLStreamReader parser;

    /** The dir (not used by the code of this class). */
    private def dir;

    /** The writer opened by createOutput(File). */
    private def output;

    /** The url (not used by the code of this class). */
    private def url;

    /** The anahash (not used by the code of this class). */
    private HashMap<String,String> anahash = new HashMap<String,String>();

    /** The text. */
    String text = "";

    /** The base. */
    String base = "";

    /** The text attributes. */
    String[] textAttributes = null;

    /** The language written in the "lang" attribute of <txmcorpus> and given to XMLTXM2CQP. */
    private String lang = "fr";

    /** The annotation success. */
    boolean annotationSuccess = false;

    /**
     * Creates a compiler with the default settings (lang "fr", debug off).
     */
    public compiler() {}

    /**
     * Sets the language of the corpus.
     *
     * @param lang the lang
     * @return the assigned value
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Sets the annotation success.
     *
     * @param value the value
     * @return the assigned value
     */
    public setAnnotationSuccess(boolean value)
    {
        this.annotationSuccess = value
    }

    /**
     * Opens a UTF-8 writer on f and stores it in the output field.
     * The file is opened in append mode when it already exists, and is
     * created otherwise.
     *
     * @param f the file to write to
     * @return true if the writer was opened, false if opening failed
     *         (the exception is printed on the standard error stream)
     */
    private boolean createOutput(File f)
    {
        try {
            output = new OutputStreamWriter(new FileOutputStream(f, f.exists()), "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Opens cqpFile through createOutput(File), writes fragment and always
     * closes the writer, even when the write fails.
     *
     * @param cqpFile the file to append to
     * @param fragment the text to write
     * @return true only if the fragment was written and the writer closed without error
     */
    private boolean appendToCqpFile(File cqpFile, String fragment)
    {
        if (!createOutput(cqpFile)) {
            return false;
        }
        boolean written = false;
        try {
            output.write(fragment);
            output.flush();
            written = true;
        } catch (IOException e) {
            System.err.println(e);
        } finally {
            try {
                output.close();
            } catch (IOException e) {
                System.err.println(e);
                written = false;
            }
        }
        return written;
    }

    /**
     * Returns the name of f without its last extension. A name with no
     * extension (or starting with its only dot) is returned unchanged.
     *
     * @param f the source file
     * @return the text name derived from the file name
     */
    private static String textNameOf(File f)
    {
        String name = f.getName();
        int dot = name.lastIndexOf(".");
        return dot > 0 ? name.substring(0, dot) : name;
    }

    /**
     * Advances the parser until the closing tag of "teiHeader" has been
     * read, or until the end of the document.
     * NOTE(review): the parser field is never assigned in this class, so it
     * must be set before this method is called - confirm against callers.
     */
    private void GoToText()
    {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
                return;
            }
        }
    }

    /**
     * Builds the CQP file from the XML-TXM files and indexes it with CWB.
     *
     * @param project the project owning the corpus to (re)build
     * @param binDir the binary directory; receives the "cqp", "data" and "registry" sub-directories
     * @param txmDir the directory containing the XML-TXM files to compile
     * @param corpusname the name of the corpus, used for the CQP, data and registry file names
     * @return true if the corpus was encoded and indexed, false on any failure
     */
    public boolean run(Project project, File binDir, File txmDir, String corpusname)
    {
        String rootDir = binDir.getAbsolutePath();

        if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
            println ("Error: CWB executables not well set.")
            return false;
        }

        CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
        if (corpus != null) {
            if (project.getDoUpdate()) {
                corpus.clean(); // remove old files
            } else {
                corpus.delete(); // remove old files and TXMResult children
            }
        } else {
            corpus = new MainCorpus(project);
            corpus.setID(project.getName());
            corpus.setName(project.getName());
        }
        corpus.setDescription("Built with the Hyperbase import module");

        File cqpFile = new File(binDir, "cqp/" + corpusname + ".cqp");
        // createOutput() appends to an existing file: a stale CQP file that
        // cannot be removed would end up prefixed to the new corpus.
        if (cqpFile.exists() && !cqpFile.delete()) {
            println("Error: could not delete the previous CQP file: " + cqpFile)
            return false;
        }
        new File(binDir, "cqp").mkdirs()
        new File(binDir, "data").mkdirs()
        new File(binDir, "registry").mkdirs()

        //start corpus
        if (!appendToCqpFile(cqpFile, "<txmcorpus lang=\"" + lang + "\">\n")) {
            println("Error: could not write the corpus header in " + cqpFile)
            return false;
        }

        // listFiles() returns null when txmDir is not a readable directory
        File[] listing = (txmDir == null) ? null : txmDir.listFiles();
        if (listing == null) {
            println("Error: could not list the files of " + txmDir)
            return false;
        }
        List<File> files = new ArrayList<File>(Arrays.asList(listing));
        Collections.sort(files);

        //1- Transform into CQP file
        XMLTXM2CQP cqpbuilder = null;
        println("Compiling " + files.size() + " files")
        for (File f : files) {
            print "."
            if (!f.exists()) {
                println("file "+f+ " does not exists")
                continue;
            }

            cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
            cqpbuilder.setTextInfo(textNameOf(f), corpusname, "project");

            cqpbuilder.setBalisesToKeep(["text","p","s"]);
            cqpbuilder.setSendToPAttributes(["s":["n"]]);
            cqpbuilder.setLang(lang);
            if (!cqpbuilder.transformFile(cqpFile)) {
                println("Failed to compile "+f)
            }
        }
        println ""

        //end corpus
        if (!appendToCqpFile(cqpFile, "</txmcorpus>\n")) {
            println("Error: could not write the corpus footer in " + cqpFile)
            return false;
        }

        // no file was processed: the attribute lists below come from the last builder
        if (cqpbuilder == null) return false;

        //2- Import into CWB
        String outDir = rootDir;

        CwbEncode cwbEn = new CwbEncode();
        cwbEn.setDebug(debug);
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbMa.setDebug(debug);

        List<String> pAttributesList = cqpbuilder.getpAttributs();
        List<String> sAttributesList = cqpbuilder.getsAttributs();
        println "word properties : "+pAttributesList
        println "structures : "+sAttributesList
        String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
        String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

        try {
            String regPath = outDir + "/registry/" + corpusname.toLowerCase();
            cwbEn.run(outDir + "/data/" + corpusname, outDir + "/cqp/" + corpusname + ".cqp", regPath, pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(corpusname, outDir + "/registry");
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        }

        return true;
    }

    /**
     * Turns the debug mode on.
     */
    public void setDebug()
    {
        this.debug = true;
    }

    /**
     * The main method.
     *
     * The compiler cannot be run standalone: run(Project, File, File, String)
     * needs a Project, which cannot be built from command line arguments
     * here. The previous body called methods that do not exist in this class
     * (setCwbPath(String) and run(File, String)) and always failed.
     *
     * @param args the arguments (ignored)
     */
    public static void main(String[] args)
    {
        System.err.println("The Hyperbase compiler cannot be run standalone: call run(Project, File, File, String) from the Hyperbase import module.");
    }
}