Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / lasla / compiler.groovy @ 187

History | View | Annotate | Download (7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (Thu, 26 May 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $ 
27
//
28

    
29

    
30
package org.txm.importer.lasla;
31

    
32
import java.util.ArrayList;;
33

    
34
import org.txm.importer.cwb.BuildCwbEncodeArgs;
35
import org.txm.importer.cwb.CwbEncode
36
import org.txm.importer.cwb.CwbMakeAll
37
import org.txm.importer.*;
38
import org.txm.scripts.*;
39
import org.txm.scripts.teitxm.BuildTTSrc;
40
import org.txm.scripts.teitxm.*;
41
import org.txm.utils.treetagger.TreeTagger;
42

    
43
import javax.xml.stream.*;
44
import java.net.URL;
45
import java.io.File;
46
import java.util.HashMap;
47
import java.util.List;
48

    
49
// TODO: Auto-generated Javadoc
50
/**
 * Last step of the LASLA import: turns the TXM XML files found in the
 * "txm" sub-directory of a binary corpus directory into a single WTC file,
 * then indexes that file with the CWB tools (cwb-encode and cwb-makeall).
 */
class compiler
{
    /** When true, the CWB tool wrappers are switched to debug mode. */
    private boolean debug = false

    /** Stream opened on {@link #url} by the URL constructor. */
    private def inputData

    /** StAX factory used to create {@link #parser}. */
    private def factory

    /** StAX reader positioned on {@link #inputData}. */
    private XMLStreamReader parser

    /** Not used by the code of this class. */
    private def dir

    /** Writer created by {@link #createOutput(String, String)}. */
    private def output

    /** Location of the XML document given to the URL constructor. */
    private def url

    /** Not used by the code of this class. */
    private HashMap<String, String> anahash = new HashMap<String, String>()

    /** Text name given to the URL constructor. */
    String text = ""

    /** Base name given to the URL constructor. */
    String base = ""

    /** Project name given to the URL constructor. */
    String project = ""

    /** Text attributes; never assigned by the code of this class. */
    String[] textAttributes = null

    /** Directory that contains the cwb-encode and cwb-makeall executables. */
    String cwbLoc

    /** Language code forwarded to the WTC builder. */
    private String lang = "fr"

    /**
     * Creates a compiler without any input document; enough to call
     * {@link #run(File, String, String[])}.
     */
    public compiler() {}

    /**
     * Creates a compiler and opens a StAX reader on the given document.
     * Failures are reported on the console and leave the parser unset;
     * no exception is propagated to the caller.
     *
     * @param url the location of the XML document to read
     * @param text the text name
     * @param base the base name
     * @param project the project name
     */
    public compiler(URL url, String text, String base, String project)
    {
        this.text = text
        this.base = base
        this.project = project
        this.url = url
        try {
            inputData = url.openStream()
            factory = XMLInputFactory.newInstance()
            parser = factory.createXMLStreamReader(inputData)
        } catch (XMLStreamException ex) {
            System.out.println(ex)
        } catch (IOException ex) {
            // Say which document could not be opened and why.
            System.err.println("IOException while parsing " + url + ": " + ex)
        }
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the language code forwarded to the WTC builder
     * @return the value that was set
     */
    public setLang(String lang)
    {
        this.lang = lang
    }

    /**
     * Sets the directory that contains the CWB executables. A missing
     * directory is only reported; the path is stored in every case.
     *
     * @param path the new cwb path
     */
    public void setCwbPath(String path)
    {
        if (!new File(path).exists()) {
            System.err.println("CWB Path : "+path+" does not exists")
        }
        cwbLoc = path
    }

    /**
     * Opens {@link #output} as an UTF-8 writer on the given file. The file
     * is opened in append mode when it already exists.
     *
     * @param dirPathName the directory of the output file
     * @param fileName the name of the output file
     * @return true if the writer was created, false if an exception occurred
     */
    private boolean createOutput(String dirPathName, String fileName)
    {
        try {
            File f = new File(dirPathName, fileName)
            output = new OutputStreamWriter(new FileOutputStream(f, f.exists()), "UTF-8")
            return true
        } catch (Exception e) {
            System.err.println(e)
            return false
        }
    }

    /**
     * Advances {@link #parser} until the end tag of the "teiHeader" element
     * has been read, or until the end of the document.
     */
    private void GoToText()
    {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
                return
            }
        }
    }

    /**
     * Builds the CWB indexes of a corpus.
     * <ol>
     * <li>the "wtc" and "data" directories of rootDirFile are recreated and
     * the "registry" directory is created;</li>
     * <li>every file of the "txm" directory is transformed into
     * wtc/&lt;basename&gt;.wtc;</li>
     * <li>cwb-encode then cwb-makeall are run on that WTC file.</li>
     * </ol>
     *
     * @param rootDirFile the binary directory of the corpus
     * @param basename the corpus name, used for the WTC and registry files
     * @param textAttributes not used by this method
     * @return true if the indexes were built, false if a precondition is
     *         not met or if a CWB tool failed
     */
    public boolean run(File rootDirFile, String basename, String[] textAttributes)
    {
        String rootDir = rootDirFile.getAbsolutePath()

        if (cwbLoc == null) {
            cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator
        }

        if (!new File(cwbLoc).exists()) {
            println("CWB path error: "+cwbLoc)
            return false
        }
        if (!new File(rootDir).exists()) {
            println("binary directory does not exists: "+rootDir)
            return false
        }

        // Start from empty working directories.
        new File(rootDir, "/wtc/").deleteDir()
        new File(rootDir, "/wtc/").mkdir()
        new File(rootDir, "/data/").deleteDir()
        new File(rootDir, "/data/").mkdir()
        new File(rootDir, "registry/").mkdir()

        // listFiles() returns null when the directory is missing or unreadable.
        File txmDir = new File(rootDirFile, "txm")
        File[] files = txmDir.listFiles()
        if (files == null) {
            println("txm directory does not exists: "+txmDir)
            return false
        }

        //1- Transform into WTC file
        for (File f : files) {
            if (!f.exists()) {
                println("file "+f+ " does not exists")
                continue
            }

            // Declared locally: assigning to an undeclared name inside a
            // class is resolved as a property of 'this' and fails at runtime.
            def wtcbuilder = new XMLTXM2WTC(f.toURL())
            String txtname = f.getName().substring(0, f.getName().length()-4)
            wtcbuilder.setTextInfo(txtname, basename, "project")

            wtcbuilder.setBalisesToKeep(["text"])
            wtcbuilder.setSendToPAttributes([])
            wtcbuilder.setLang(lang)
            if (!wtcbuilder.transformFile(new File(rootDir, "wtc/"+basename+".wtc"))) {
                println("Failed to compile "+f)
            }
        }

        //2- Import into CWB
        def outDir = rootDir
        def outDirTxm = rootDir

        CwbEncode cwbEn = new CwbEncode()
        cwbEn.setDebug(debug)
        CwbMakeAll cwbMa = new CwbMakeAll()
        cwbMa.setDebug(debug)

        String[] pAttributes = ["ref","s","sent","lemme","line","pos"]
        String[] sAttributes = ["txmcorpus:0+lang", "text:0+id+base+project"]

        try {
            String regPath = outDirTxm + "/registry/"+basename.toLowerCase()
            cwbEn.run(new File(cwbLoc, "cwb-encode").getAbsolutePath(), outDirTxm + "/data", outDir + "/wtc/"+basename+".wtc", regPath, pAttributes, sAttributes)
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false
            }
            cwbMa.run(new File(cwbLoc, "cwb-makeall").getAbsolutePath(), basename.toUpperCase(), outDirTxm + "/registry")
        } catch (Exception ex) {
            System.out.println(ex)
            return false
        }

        System.out.println("Done.")

        return true
    }

    /**
     * Switches the debug mode on; there is no way to switch it off again.
     */
    public void setDebug()
    {
        this.debug = true
    }

    /**
     * Test entry point: compiles the "geo" corpus stored in the home
     * directory of the current user.
     *
     * @param args the arguments, not used
     */
    public static void main(String[] args)
    {
        // java.io.File does not expand "~", so the home directory is resolved explicitly.
        String home = System.getProperty("user.home")
        File dir = new File(home, "xml/geo")
        def c = new compiler()
        c.setDebug()
        c.setCwbPath(new File(home, "TXM/cwb/bin").getPath())
        c.run(dir, "geo")
    }
}