Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / quick / compiler.groovy @ 187

History | View | Annotate | Download (6.8 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
// $LastChangedRevision: 3400 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.importer.quick
29

    
30
import org.txm.Toolbox;
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.cwb.PatchCwbRegistry;
34
import org.txm.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.scripts.teitxm.BuildTTSrc;
37
import org.txm.scripts.teitxm.*;
38
import org.txm.utils.treetagger.TreeTagger;
39

    
40
import javax.xml.stream.*;
41
import java.net.URL;
42
import java.io.File;
43
import java.util.HashMap;
44
import java.util.List;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
 * Builds a CWB index from a directory of TXM files.
 *
 * {@link #run(File, File, String)} works in two steps:
 * <ol>
 * <li>every file of the TXM directory is handed to an {@code XMLTXM2WTC}
 * builder which writes into {@code <binDir>/wtc/<corpusname>.wtc}; the
 * content is wrapped in a {@code <txmcorpus lang="...">} element written by
 * this class;</li>
 * <li>the resulting file is passed to the {@code cwb-encode} and
 * {@code cwb-makeall} executables through {@code CwbEncode} and
 * {@code CwbMakeAll}.</li>
 * </ol>
 */
class compiler {

	/** When true, the CWB wrappers are run with their debug flag set. */
	boolean debug = false;

	/** Not read or written by this class. */
	private def inputData;

	/** Not read or written by this class. */
	private def factory;

	/** Not read or written by this class. */
	private XMLStreamReader parser;

	/** Not read or written by this class. */
	private def dir;

	/** Writer opened by {@link #createOutput(File)}; the caller must close it. */
	private def output;

	/** URL given to the constructor; stored only. */
	private def url;

	/** Not read or written by this class. */
	private HashMap<String,String> anahash = new HashMap<String,String>() ;

	/** Text value given to the constructor; stored only. */
	String text="";

	/** Base name forwarded to the builder with each text. */
	String base="";

	/** Project name forwarded to the builder with each text. */
	String project="";

	/** Directory holding the CWB executables; resolved from the Toolbox when null. */
	String cwbLoc;

	/** Language written in the corpus element and forwarded to the builder. */
	private String lang ="fr";

	/**
	 * Creates a compiler.
	 *
	 * @param url the url (stored, not used by this class)
	 * @param text the text (stored, not used by this class)
	 * @param base the base name passed to the builder for each text
	 * @param project the project name passed to the builder for each text
	 */
	public compiler(URL url,String text,String base, String project)
	{
		this.text = text
		this.base = base;
		this.project = project;
		this.url = url;
	}

	/**
	 * Switches debug mode on.
	 *
	 * @return the value assigned to the debug flag (true)
	 */
	public setDebug()
	{
		debug =true;
	}

	/** Annotation status; only stored by this class. */
	boolean annotationSuccess = false;

	/**
	 * Sets the annotation success flag.
	 *
	 * @param val the new annotation success
	 */
	public void setAnnotationSuccess(boolean val)
	{
		this.annotationSuccess = val;
	}

	/** Metadata structural attributes declaration; only stored by this class. */
	String metadataSAttributes = "";

	/**
	 * Sets the metadata attributes declaration.
	 *
	 * @param attrsdecl the new metadata attributes
	 */
	public void setMetadataAttributes(String attrsdecl)
	{
		metadataSAttributes = attrsdecl;
	}

	/**
	 * Sets the language of the corpus.
	 *
	 * @param lang the lang
	 * @return the value assigned
	 */
	public setLang(String lang)
	{
		this.lang = lang;
	}

	/**
	 * Sets the directory containing the CWB executables. A missing directory
	 * is only reported on the error stream; the path is stored anyway.
	 *
	 * @param path the new cwb path
	 */
	public void setCwbPath(String path)
	{
		if (!new File(path).exists())
			System.err.println("CWB Path : "+path+" does not exists")
		cwbLoc = path;
	}

	/**
	 * Opens a UTF-8 writer on the given file and stores it in {@code output}.
	 * The file is opened in append mode when it already exists.
	 *
	 * @param f the file to write to
	 * @return true, if the writer could be opened
	 */
	private boolean createOutput(File f) {
		try {
			output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f,f.exists())) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Appends a piece of text to a file and always releases the writer.
	 *
	 * @param f the file to append to
	 * @param content the text to append
	 * @return true, if the text was written and flushed
	 */
	private boolean appendText(File f, String content) {
		if (!createOutput(f))
			return false;
		try {
			output.write(content);
			output.flush();
			return true;
		} catch (IOException e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		} finally {
			// the content is already flushed (or the failure already reported)
			try { output.close(); } catch (IOException ignored) { }
		}
	}

	/**
	 * Removes the extension of a file name.
	 *
	 * @param name the file name
	 * @return the name up to its last dot, or the whole name when it has no extension
	 */
	private static String stripExtension(String name) {
		int dot = name.lastIndexOf('.');
		return dot > 0 ? name.substring(0, dot) : name;
	}

	/**
	 * Compiles the files of txmDir into a CWB corpus stored in binDir.
	 * The "wtc" and "data" sub-directories of binDir are emptied first.
	 *
	 * @param binDir the directory receiving the "wtc", "data" and "registry" directories
	 * @param txmDir the directory containing the files to compile
	 * @param corpusname the name of the corpus to build
	 * @return true, if successful
	 */
	public boolean run(File binDir, File txmDir, String corpusname)
	{
		if (cwbLoc == null)
			cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

		if (!new File(cwbLoc).exists()) {
			println ("CWB path error: "+cwbLoc)
			return false;
		}
		if (!binDir.exists()) {
			println ("Binary directory does not exists: "+binDir)
			return false;
		}

		// listFiles() returns null when txmDir is not a readable directory;
		// check it before any previous result gets deleted
		File[] txmFiles = txmDir.listFiles();
		if (txmFiles == null) {
			println ("TXM directory cannot be listed: "+txmDir)
			return false;
		}
		Arrays.sort(txmFiles);

		File wtcFile = new File(binDir,"wtc/"+corpusname+".wtc");
		new File(binDir, "wtc").deleteDir();
		new File(binDir, "wtc").mkdir();
		new File(binDir, "data").deleteDir();
		new File(binDir, "data").mkdir();
		new File(binDir, "registry").mkdir();

		//start corpus
		if (!appendText(wtcFile, "<txmcorpus lang=\""+lang+"\">\n")) {
			println ("Failed to write the corpus header in "+wtcFile)
			return false;
		}

		//1- Transform into WTC file
		// NOTE(review): transformFile() presumably appends to wtcFile, the
		// header above being written first - confirm against XMLTXM2WTC
		println("Compiling "+txmFiles.length+" files")
		XMLTXM2WTC wtcbuilder = null;
		for (File txmfile : txmFiles) {
			print "."
			wtcbuilder = new XMLTXM2WTC(txmfile.toURI().toURL());
			String txtname = stripExtension(txmfile.getName());
			wtcbuilder.setTextInfo(txtname, this.base, this.project);

			wtcbuilder.setBalisesToKeep(["text","s"]);
			wtcbuilder.setSendToPAttributes(["lb":["n"]]);
			wtcbuilder.setLang(lang);
			if (!wtcbuilder.transformFile(wtcFile)) {
				println("Failed to compile "+txmfile)
			}
		}
		println ""

		//end corpus
		if (!appendText(wtcFile, "</txmcorpus>\n")) {
			println ("Failed to write the corpus footer in "+wtcFile)
			return false;
		}

		// no file was processed: there are no attributes to declare
		if(wtcbuilder == null)
			return false;

		//2- Import into CWB
		def outDir = binDir.getAbsolutePath()+"/";

		CwbEncode cwbEn = new CwbEncode();
		CwbMakeAll cwbMa = new CwbMakeAll();
		cwbEn.setDebug(debug);
		cwbMa.setDebug(debug);

		List<String> pAttributesList = wtcbuilder.getpAttributs();
		List<String> sAttributesList = wtcbuilder.getsAttributs();
		println "pAttrs : "+pAttributesList
		println "sAttrs : "+sAttributesList
		String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
		String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

		// setCwbPath() stores the path as given: make sure the executable
		// name is not glued to the last directory name
		String cwbDir = cwbLoc;
		if (!cwbDir.endsWith("/") && !cwbDir.endsWith(File.separator))
			cwbDir += File.separator;

		try {
			String regPath = outDir + "/registry/"+corpusname.toLowerCase();
			cwbEn.run(cwbDir + "cwb-encode", outDir + "/data/$corpusname", outDir + "/wtc/"+corpusname+".wtc", regPath, pAttributes, sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run(cwbDir + "cwb-makeall", corpusname, outDir + "/registry");
		} catch (Exception ex) {System.out.println("CWB error: "+ex); return false;}

		return true;
	}

	/**
	 * Command line entry point.
	 *
	 * @param args binDir, txmDir and corpusName, optionally followed by the
	 * corpus language and the directory of the CWB executables
	 */
	public static void main(String[] args)
	{
		if (args == null || args.length < 3) {
			println "Usage: compiler <binDir> <txmDir> <corpusName> [lang] [cwbDir]"
			return;
		}

		// same values as the field defaults: no url, empty text, base and project
		compiler c = new compiler(null, "", "", "");
		if (args.length > 3)
			c.setLang(args[3]);
		if (args.length > 4)
			c.setCwbPath(args[4]);

		if (!c.run(new File(args[0]), new File(args[1]), args[2]))
			println "Compilation failed"
	}
}