Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / txt / compiler.groovy @ 2554

History | View | Annotate | Download (7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.importer.txt
29

    
30
import org.txm.Toolbox;
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.cwb.PatchCwbRegistry;
34
import org.txm.scripts.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.importer.scripts.xmltxm.*;
37
import org.txm.objects.*
38
import org.txm.utils.*
39
import org.txm.utils.treetagger.TreeTagger;
40
import org.txm.searchengine.cqp.corpus.*
41
import javax.xml.stream.*;
42

    
43
import java.net.URL;
44
import java.io.File;
45
import java.util.HashMap;
46
import java.util.List;
47

    
48
// TODO: Auto-generated Javadoc
49
/**
 * Second step of the TXT import: turns the XML-TXM files of a project into a
 * single CQP source file, then hands that file to {@link CwbEncode} and
 * {@link CwbMakeAll} to build the CWB corpus.
 */
class compiler {

	/** Debug flag forwarded to the CwbEncode and CwbMakeAll instances created by {@link #run}. */
	boolean debug = false;

	/** The input data. Not read by any method of this class. */
	private def inputData;

	/** The factory. Not read by any method of this class. */
	private def factory;

	/** The parser. Not read by any method of this class. */
	private XMLStreamReader parser;

	/** The dir. Not read by any method of this class. */
	private def dir;

	/** Writer opened by {@link #createOutput} on the CQP file. */
	private def output;

	/** The url given to the constructor. Not read by any method of this class. */
	private def url;

	/** The anahash. Not read by any method of this class. */
	private HashMap<String, String> anahash = new HashMap<String,String>() ;

	/** The text given to the constructor. Not read by any method of this class. */
	String text="";

	/** The base name forwarded to XMLTXM2CQP.setTextInfo for every text. */
	String base="";

	/** Language written in the txmcorpus header and forwarded to XMLTXM2CQP.setLang. */
	private String lang ="fr";

	/** The annotation success flag; only stored, never read here. */
	boolean annotationSuccess = false;

	/** The metadata structural attributes declaration; only stored, never read here. */
	String metadataSAttributes = "";

	/**
	 * Creates a compiler.
	 *
	 * @param url the url, stored as-is
	 * @param text the text, stored as-is
	 * @param base the base name passed to every XMLTXM2CQP builder
	 * @param projectName currently ignored
	 */
	public compiler(URL url, String text, String base, String projectName)
	{
		this.text = text
		this.base = base;
		this.url = url;
	}

	/**
	 * Switches the debug flag on.
	 *
	 * @return the value of the assignment (true)
	 */
	public setDebug()
	{
		debug = true;
	}

	/**
	 * Sets the annotation success.
	 *
	 * @param val the new annotation success
	 */
	public void setAnnotationSuccess(boolean val)
	{
		this.annotationSuccess = val;
	}

	/**
	 * Sets the metadata attributes.
	 *
	 * @param attrsdecl the new metadata attributes
	 */
	public void setMetadataAttributes(String attrsdecl)
	{
		metadataSAttributes = attrsdecl;
	}

	/**
	 * Sets the language of the corpus.
	 *
	 * @param lang the lang
	 * @return the value of the assignment (the given lang)
	 */
	public setLang(String lang)
	{
		this.lang = lang;
	}

	/**
	 * Opens {@link #output} as a UTF-8 writer on the given file. The file is
	 * opened in append mode when it already exists, and created otherwise.
	 *
	 * @param f the file to write to
	 * @return true if the writer was opened; false (after printing the error
	 *         message) if opening failed
	 */
	private boolean createOutput(File f) {
		try {
			output = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(f, f.exists())), "UTF-8");
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Appends a piece of text to the given file through {@link #createOutput}
	 * and always closes the writer, even when the write fails.
	 *
	 * @param f the file to append to
	 * @param content the text to write
	 * @return true if the writer could be opened, false otherwise
	 */
	private boolean appendToFile(File f, String content) {
		if (!createOutput(f)) {
			return false;
		}
		try {
			output.write(content);
		} finally {
			output.close(); // do not leak the stream if write() throws
		}
		return true;
	}

	/**
	 * Builds the CQP file of the project and encodes it with CWB.
	 *
	 * Steps: check that the CWB executables are available, prepare (or reset)
	 * the MainCorpus of the project, convert every file of
	 * <code>txm/&lt;project name&gt;</code> with XMLTXM2CQP into
	 * <code>cqp/&lt;project name&gt;.cqp</code>, then call CwbEncode and
	 * CwbMakeAll.
	 *
	 * @param project the project to compile
	 * @return true if successful; false if the CWB executables are not
	 *         available, the XML-TXM directory cannot be listed or is empty,
	 *         the registry file is not created or a CWB step throws
	 */
	public boolean run(Project project)
	{
		File binDir = project.getProjectDirectory()
		String corpusname = project.getName();
		File txmDir = new File(binDir, "txm/"+corpusname)

		if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
			println ("Error: CWB executable rights are not well setted.")
			return false;
		}

		CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
		if (corpus != null) {
			if (project.getDoUpdate()) {
				corpus.clean(); // remove old files
			} else {
				corpus.delete(); // remove old files and TXMResult children
			}
		} else {
			corpus = new MainCorpus(project);
			corpus.setID(project.getName());
			corpus.setName(project.getName());
		}
		corpus.setDescription("Built with the TXT+CSV import module");

		File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
		cqpFile.delete()
		new File(binDir,"cqp").mkdirs()
		new File(binDir,"data").mkdirs()
		new File(binDir,"registry").mkdirs()

		//start corpus
		appendToFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n");

		//1- Transform into CQP file
		XMLTXM2CQP cqpbuilder = null;
		// NOTE(review): this delete removes the <txmcorpus ...> header written just
		// above, while the closing </txmcorpus> tag is still appended below. Kept
		// as-is because the right fix depends on how XMLTXM2CQP.transformFile and
		// CwbEncode treat that element - confirm and drop one of the two.
		cqpFile.delete()

		// File.listFiles() returns null when the directory is missing or unreadable
		File[] listing = txmDir.listFiles();
		if (listing == null) {
			println("Error: cannot list the XML-TXM directory: "+txmDir)
			return false;
		}
		List<File> files = new ArrayList<File>(Arrays.asList(listing));
		Collections.sort(files);

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		for (File txmfile : files) {
			cpb.tick()
			cqpbuilder = new XMLTXM2CQP(txmfile.toURI().toURL());
			// text name = file name without its last 4 characters (presumably ".xml")
			String txtname = txmfile.getName().substring(0,txmfile.getName().length()-4);
			cqpbuilder.setTextInfo(txtname, this.base, project.getName());

			cqpbuilder.setBalisesToKeep(["text","s"]);
			cqpbuilder.setSendToPAttributes(["lb":["n"]]);
			cqpbuilder.setLang(lang);
			if (!cqpbuilder.transformFile(cqpFile)) {
				println("Failed to compile "+txmfile)
			}
		}
		cpb.done()

		//end corpus
		appendToFile(cqpFile, "</txmcorpus>\n");

		//2- Import into CWB
		def outDir = binDir.getAbsolutePath()+"/";

		CwbEncode cwbEn = new CwbEncode();
		CwbMakeAll cwbMa = new CwbMakeAll();
		cwbEn.setDebug(debug);
		cwbMa.setDebug(debug);

		// no file was processed: nothing to encode
		if (cqpbuilder == null) {
			return false;
		}

		List<String> pAttributesList = cqpbuilder.getpAttributs(); // use last text s and p attributes
		List<String> sAttributesList = cqpbuilder.getsAttributs();
		println "pAttrs : "+pAttributesList
		println "sAttrs : "+sAttributesList
		String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
		String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

		try {
			String regPath = outDir + "/registry/"+corpusname.toLowerCase();
			cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath, pAttributes, sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run(corpusname, outDir + "/registry");
		} catch (Exception ex) {System.out.println("CWB error: "+ex); return false;}

		return true;
	}

	/**
	 * The main method.
	 *
	 * NOTE(review): this entry point cannot work as written - the class declares
	 * no zero-argument constructor, run() expects a Project rather than a list of
	 * files, and "~" is not expanded by java.io.File. Left unchanged because a
	 * Project cannot be built from what is visible here.
	 *
	 * @param args the arguments
	 */
	public static void main(String[] args)
	{
		File dir = new File("~/xml/perrault/txm/");
		List<File> files = dir.listFiles();
		new compiler().run(files);
	}
}