// Source: root/tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/txt/compiler.groovy @ revision 1094
// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
// $LastChangedRevision: 3426 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.importer.txt
29

    
30
import org.txm.Toolbox;
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.cwb.PatchCwbRegistry;
34
import org.txm.scripts.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.importer.scripts.xmltxm.*;
37
import org.txm.objects.Project
38
import org.txm.utils.treetagger.TreeTagger;
39

    
40
import javax.xml.stream.*;
41

    
42
import java.net.URL;
43
import java.io.File;
44
import java.util.HashMap;
45
import java.util.List;
46

    
47
// TODO: Auto-generated Javadoc
48
/**
 * Builds the CWB indexes of a corpus from the XML-TXM files of a project.
 *
 * {@link #run(Project)} hands every file of the project's
 * <code>txm/&lt;corpus name&gt;</code> directory to an XMLTXM2CQP transformer
 * targeting <code>cqp/&lt;corpus name&gt;.cqp</code>, wraps the result in a
 * <code>txmcorpus</code> element, then runs CwbEncode and CwbMakeAll on it.
 */
class compiler {

	/** When true, the CWB wrappers are run in debug mode. */
	boolean debug = false;

	/** The input data (not used by the code of this class). */
	private def inputData;

	/** The factory (not used by the code of this class). */
	private def factory;

	/** The parser (not used by the code of this class). */
	private XMLStreamReader parser;

	/** The dir (not used by the code of this class). */
	private def dir;

	/** Writer on the file currently opened by {@link #createOutput(File)}. */
	private def output;

	/** The url given to the constructor. */
	private def url;

	/** The anahash (not used by the code of this class). */
	private HashMap<String, String> anahash = new HashMap<String,String>() ;

	/** The text given to the constructor. */
	String text="";

	/** The base name forwarded to every XMLTXM2CQP transformer. */
	String base="";

	/** Language written in the <code>lang</code> attribute of the corpus root. */
	private String lang ="fr";

	/**
	 * Project name given to the constructor and forwarded to the transformers;
	 * when null, {@link #run(Project)} uses the name of the project it receives.
	 */
	private String projectName = null;

	/**
	 * Creates a compiler.
	 *
	 * @param url the url
	 * @param text the text
	 * @param base the base name given to each text
	 * @param projectName the project name given to each text
	 */
	public compiler(URL url,String text,String base, String projectName)
	{
		this.text = text
		this.base = base;
		this.url = url;
		// The name used to be dropped here although run() needs it.
		this.projectName = projectName;
	}

	/**
	 * Switches the debug mode on.
	 *
	 * @return the new value of the debug flag
	 */
	public setDebug()
	{
		debug =true;
	}

	/** The annotation success flag (only stored by this class). */
	boolean annotationSuccess = false;

	/**
	 * Sets the annotation success flag.
	 *
	 * @param val the new annotation success
	 */
	public void setAnnotationSuccess(boolean val)
	{
		this.annotationSuccess = val;
	}

	/** The metadata structural attributes declaration (only stored by this class). */
	String metadataSAttributes = "";

	/**
	 * Sets the metadata attributes declaration.
	 *
	 * @param attrsdecl the new metadata attributes
	 */
	public void setMetadataAttributes(String attrsdecl)
	{
		metadataSAttributes = attrsdecl;
	}

	/**
	 * Sets the language of the corpus.
	 *
	 * @param lang the lang
	 * @return the new language
	 */
	public setLang(String lang)
	{
		this.lang = lang;
	}

	/**
	 * Opens {@link #output} as a UTF-8 writer on the given file.
	 * The file is opened in append mode when it already exists, so that
	 * successive calls add content at the end of the same file.
	 *
	 * @param f the file to write in
	 * @return true if the writer was opened, false if an exception occurred
	 */
	private boolean createOutput(File f) {
		try {
			output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f,f.exists())) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Appends a piece of text to a file and always closes the writer.
	 *
	 * @param f the file to append to
	 * @param content the text to append
	 * @return true if the text was written, false otherwise
	 */
	private boolean appendToFile(File f, String content) {
		if (!createOutput(f)) {
			return false;
		}
		try {
			output.write(content);
			output.flush();
			return true;
		} catch (IOException e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		} finally {
			try {
				output.close();
			} catch (IOException ignored) {
				// content was already flushed, or the failure was reported above
			}
		}
	}

	/**
	 * Compiles the XML-TXM files of the project and builds the CWB indexes.
	 *
	 * @param project the project to compile; its directory receives the
	 *        <code>cqp</code>, <code>data</code> and <code>registry</code> directories
	 * @return true if successful, false if the CWB executables are not available,
	 *         if no file could be processed or if a CWB step failed
	 */
	public boolean run(Project project)
	{
		if (project == null) {
			println ("Error: no project given to the compiler.")
			return false;
		}

		File binDir = project.getProjectDirectory()
		String corpusname = project.getName();
		File txmDir = new File(binDir, "txm/"+corpusname)

		if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
			println ("Error: CWB executables not well set.")
			return false;
		}

		CorpusBuild corpus = project.getCorpusBuild(project.getName());
		if (corpus != null) {
			println "CLEAN PREVIOUS CORPUS"
			corpus.delete(); // remove old files
		}

		// make new one
		corpus = new MainCorpus(project);
		corpus.setID(project.getName());
		corpus.setName(project.getName());
		// NOTE(review): the description mentions the alceste module although this
		// is the txt importer - looks like a copy/paste, confirm before changing.
		corpus.setDescription("Build the alceste import module");

		File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
		new File(binDir,"cqp").mkdirs()
		new File(binDir,"data").mkdirs()
		new File(binDir,"registry").mkdirs()

		// listFiles() returns null when the directory is missing or unreadable
		File[] files = txmDir.listFiles();
		if (files == null) {
			println("Error: cannot list the files of "+txmDir)
			return false;
		}
		Arrays.sort(files);

		// createOutput() appends to an existing file: a file left by a previous
		// run must be removed, otherwise the new corpus is added after the old one
		if (cqpFile.exists() && !cqpFile.delete()) {
			println("Error: cannot delete the previous CQP file: "+cqpFile)
			return false;
		}

		//start corpus
		if (!appendToFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
			println("Error: cannot write in "+cqpFile)
			return false;
		}

		//1- Transform into CQP file
		println("Compiling "+files.length+" files")
		// presumably setTextInfo expects the project name as a String, as the
		// constructor receives it - verify against XMLTXM2CQP
		String textProject = (this.projectName != null) ? this.projectName : corpusname;
		XMLTXM2CQP cqpbuilder = null;
		for (File txmfile : files) {
			print "."
			cqpbuilder = new XMLTXM2CQP(txmfile.toURI().toURL());
			// the text name is the file name without its 4 last characters
			// (assumes a ".xml" extension)
			String txtname = txmfile.getName().substring(0,txmfile.getName().length()-4);
			cqpbuilder.setTextInfo(txtname, this.base, textProject);

			cqpbuilder.setBalisesToKeep(["text","s"]);
			cqpbuilder.setSendToPAttributes(["lb":["n"]]);
			cqpbuilder.setLang(lang);
			if (!cqpbuilder.transformFile(cqpFile)) {
				println("Failed to compile "+txmfile)
			}
		}
		println ""

		//end corpus
		if (!appendToFile(cqpFile, "</txmcorpus>\n")) {
			println("Error: cannot write in "+cqpFile)
			return false;
		}

		// no file processed: no attribute list to give to cwb-encode
		if (cqpbuilder == null) {
			return false;
		}

		//2- Import into CWB
		def outDir = binDir.getAbsolutePath()+"/";

		CwbEncode cwbEn = new CwbEncode();
		CwbMakeAll cwbMa = new CwbMakeAll();
		cwbEn.setDebug(debug);
		cwbMa.setDebug(debug);

		List<String> pAttributesList = cqpbuilder.getpAttributs(); // use last text s and p attributes
		List<String> sAttributesList = cqpbuilder.getsAttributs();
		println "pAttrs : "+pAttributesList
		println "sAttrs : "+sAttributesList
		String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
		String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

		try {
			String regPath = outDir + "/registry/"+corpusname.toLowerCase();
			cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath, pAttributes, sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run(corpusname, outDir + "/registry");
		} catch (Exception ex) {
			System.out.println("CWB error: "+ex);
			if (debug) {
				ex.printStackTrace();
			}
			return false;
		}

		return true;
	}

	/**
	 * The main method. The compiler needs a Project and cannot be started
	 * standalone: the former test code called a constructor and a run()
	 * variant that do not exist, so it could only fail at runtime.
	 *
	 * @param args the arguments (ignored)
	 */
	public static void main(String[] args)
	{
		System.err.println("The txt compiler cannot be run standalone: create it with (url, text, base, projectName) and call run(Project).");
	}
}