Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / txt / compiler.groovy @ 479

History | View | Annotate | Download (6.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.txt
29

    
30
import org.txm.Toolbox;
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.cwb.PatchCwbRegistry;
34
import org.txm.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38

    
39
import javax.xml.stream.*;
40

    
41
import java.net.URL;
42
import java.io.File;
43
import java.util.HashMap;
44
import java.util.List;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
 * Builds a CWB corpus from a directory of TXM XML files.
 *
 * {@link #run(File, File, String)} hands every file of the source directory to
 * {@link XMLTXM2WTC#transformFile} with the same <code>.wtc</code> target, surrounds the
 * result with a <code>txmcorpus</code> element, then calls {@link CwbEncode} and
 * {@link CwbMakeAll} on that file.
 */
class compiler {

    /** Debug flag forwarded to the CWB wrappers by run(). */
    boolean debug = false;

    /** Not referenced anywhere in this class. */
    private def inputData;

    /** Not referenced anywhere in this class. */
    private def factory;

    /** Not referenced anywhere in this class. */
    private XMLStreamReader parser;

    /** Not referenced anywhere in this class. */
    private def dir;

    /** Writer on the file most recently opened by createOutput(). */
    private def output;

    /** URL given to the constructor; only stored. */
    private def url;

    /** Not referenced anywhere in this class. */
    private HashMap<String,String> anahash = new HashMap<String,String>() ;

    /** Text name given to the constructor; only stored. */
    String text="";

    /** Base name passed to XMLTXM2WTC.setTextInfo() for every text. */
    String base="";

    /** Project name passed to XMLTXM2WTC.setTextInfo() for every text. */
    String project="";

    /** Directory holding the cwb-encode and cwb-makeall executables. */
    String cwbLoc;

    /** Language written in the txmcorpus element and given to XMLTXM2WTC. */
    private String lang ="fr";

    /** Annotation status; only stored by this class. */
    boolean annotationSuccess = false;

    /** Structural attributes declaration; only stored by this class. */
    String metadataSAttributes = "";

    /**
     * Creates a compiler.
     *
     * @param url the url (stored only)
     * @param text the text name (stored only)
     * @param base the base name given to each text
     * @param project the project name given to each text
     */
    public compiler(URL url,String text,String base, String project)
    {
        this.text = text
        this.base = base;
        this.project = project;
        this.url = url;
    }

    /**
     * Switches debug mode on.
     *
     * @return the value of the assignment, since the method is declared without a return type
     */
    public setDebug()
    {
        debug =true;
    }

    /**
     * Sets the annotation success.
     *
     * @param val the new annotation success
     */
    public void setAnnotationSuccess(boolean val)
    {
        this.annotationSuccess = val;
    }

    /**
     * Sets the metadata attributes.
     *
     * @param attrsdecl the new metadata attributes
     */
    public void setMetadataAttributes(String attrsdecl)
    {
        metadataSAttributes = attrsdecl;
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the lang
     * @return the value of the assignment, since the method is declared without a return type
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Sets the directory of the CWB executables. A warning is printed when the
     * path does not exist, but the value is stored anyway; run() checks it again.
     *
     * @param path the new cwb path
     */
    public void setCwbPath(String path)
    {
        if (!new File(path).exists())
            System.err.println("CWB Path : "+path+" does not exists")
        cwbLoc = path;
    }

    /**
     * Opens {@link #output} as a UTF-8 writer on the given file. The file is
     * opened in append mode when it already exists.
     *
     * @param f the file to open
     * @return true, if successful
     */
    private boolean createOutput(File f) {
        try {
            output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f,f.exists())) , "UTF-8");
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Writes content to the given file through createOutput() and always
     * closes the writer, even when the write fails.
     *
     * @param f the file to write to
     * @param content the text to write
     * @return true if the content was written and flushed
     */
    private boolean appendToFile(File f, String content) {
        if (!createOutput(f))
            return false;
        try {
            output.write(content);
            output.close(); // flushes the buffered content; a failure here means data loss
            return true;
        } catch (IOException e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        } finally {
            // closing an already closed writer has no effect
            try { output.close(); } catch (IOException ignored) { }
        }
    }

    /**
     * Builds the .wtc file from the files of txmDir, then encodes it with CWB.
     * The "wtc" and "data" sub-directories of binDir are deleted and recreated.
     *
     * @param binDir the existing directory receiving the "wtc", "data" and "registry" sub-directories
     * @param txmDir the directory containing the files to compile
     * @param corpusname the corpus name, used for the .wtc, data and registry names
     * @return true, if successful
     */
    public boolean run(File binDir, File txmDir, String corpusname)
    {
        if (cwbLoc == null)
            cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

        if (!new File(cwbLoc).exists()) {
            println ("CWB path error: "+cwbLoc)
            return false;
        }
        if (!binDir.exists()) {
            println ("Binary directory does not exists: "+binDir)
            return false;
        }

        // listFiles() returns null when txmDir is not a readable directory;
        // check before any existing output gets deleted below
        File[] listing = txmDir.listFiles();
        if (listing == null) {
            println ("TXM directory cannot be listed: "+txmDir)
            return false;
        }
        List<File> files = new ArrayList<File>(Arrays.asList(listing));
        Collections.sort(files);

        File wtcFile = new File(binDir,"wtc/"+corpusname+".wtc");
        new File(binDir, "wtc").deleteDir();
        new File(binDir, "wtc").mkdir();
        new File(binDir, "data").deleteDir();
        new File(binDir, "data").mkdir();
        new File(binDir, "registry").mkdir();

        //start corpus
        if (!appendToFile(wtcFile, "<txmcorpus lang=\""+lang+"\">\n")) {
            println ("Failed to write the corpus header in "+wtcFile)
            return false;
        }

        //1- Transform into WTC file
        println("Compiling "+files.size()+" files")
        XMLTXM2WTC wtcbuilder = null;
        for (File txmfile : files) {
            print "."
            wtcbuilder = new XMLTXM2WTC(txmfile.toURI().toURL());
            // drops the last 4 characters of the file name (presumably the ".xml" extension)
            String txtname = txmfile.getName().substring(0,txmfile.getName().length()-4);
            wtcbuilder.setTextInfo(txtname, this.base, this.project);

            wtcbuilder.setBalisesToKeep(["text","s"]);
            wtcbuilder.setSendToPAttributes(["lb":["n"]]);
            wtcbuilder.setLang(lang);
            if (!wtcbuilder.transformFile(wtcFile)) {
                println("Failed to compile "+txmfile)
            }
        }
        println ""

        //end corpus
        if (!appendToFile(wtcFile, "</txmcorpus>\n")) {
            println ("Failed to write the corpus footer in "+wtcFile)
            return false;
        }

        if (wtcbuilder == null) {
            println ("No file to compile in "+txmDir)
            return false;
        }

        //2- Import into CWB
        def outDir = binDir.getAbsolutePath()+"/";

        CwbEncode cwbEn = new CwbEncode();
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbEn.setDebug(debug);
        cwbMa.setDebug(debug);

        List<String> pAttributesList = wtcbuilder.getpAttributs(); // use last text s and p attributes
        List<String> sAttributesList = wtcbuilder.getsAttributs();
        println "pAttrs : "+pAttributesList
        println "sAttrs : "+sAttributesList
        String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
        String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

        // joined through File so that cwbLoc works with or without a trailing separator
        String cwbEncodePath = new File(cwbLoc, "cwb-encode").getPath();
        String cwbMakeallPath = new File(cwbLoc, "cwb-makeall").getPath();

        try {
            String regPath = outDir + "/registry/"+corpusname.toLowerCase();
            cwbEn.run(cwbEncodePath, outDir + "/data/$corpusname", outDir + "/wtc/"+corpusname+".wtc", regPath, pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(cwbMakeallPath, corpusname, outDir + "/registry");
        } catch (Exception ex) {System.out.println("CWB error: "+ex); return false;}

        return true;
    }

    /**
     * Command line entry point.
     *
     * @param args binDir, txmDir and corpusname, optionally followed by the
     *             base and project names (both default to the empty string)
     */
    public static void main(String[] args)
    {
        if (args == null || args.length < 3) {
            System.err.println("Usage: compiler <binDir> <txmDir> <corpusname> [base] [project]")
            return;
        }
        File binDir = new File(args[0]);
        File txmDir = new File(args[1]);
        String corpusname = args[2];
        String base = args.length > 3 ? args[3] : "";
        String project = args.length > 4 ? args[4] : "";

        compiler c = new compiler(txmDir.toURI().toURL(), "", base, project);
        if (!c.run(binDir, txmDir, corpusname))
            System.err.println("Compilation of "+corpusname+" failed")
    }
}