Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / discours / compiler.groovy @ 187

History | View | Annotate | Download (7.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (Thu, 26 May 2016) $
25
// $LastChangedRevision: 3219 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.discours
29

    
30
import org.txm.importer.cwb.BuildCwbEncodeArgsFromTEITXM;
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.importer.*;
34
import org.txm.scripts.*;
35
import org.txm.scripts.teitxm.BuildTTSrc;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.treetagger.TreeTagger;
38

    
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import java.io.File;
42
import java.util.HashMap;
43
import java.util.List;
44

    
45
// TODO: Auto-generated Javadoc
46
/**
 * Builds the CWB indexes of a corpus from a list of XML-TXM files.
 *
 * {@link #run(List, File, File, String)} concatenates every source file into
 * a single <code>wtc/&lt;corpusname&gt;.wtc</code> file (through XMLTXM2WTC),
 * then calls <code>cwb-encode</code> and <code>cwb-makeall</code> on it.
 */
class compiler {

	/** Stream opened on {@link #url} by the 4-argument constructor. */
	private def inputData;

	/** StAX factory used to create {@link #parser}. */
	private def factory;

	/** StAX reader over {@link #inputData}; only set by the 4-argument constructor. */
	private XMLStreamReader parser;

	/** Not read or written by the code of this class. */
	private def dir;

	/** Writer most recently opened by {@link #createOutput(File)}. */
	private def output;

	/** The URL given to the 4-argument constructor. */
	private def url;

	/** Not read or written by the code of this class. */
	private HashMap<String,String> anahash = new HashMap<String,String>() ;

	/** The text name given to the 4-argument constructor. */
	String text="";

	/** The base name given to the 4-argument constructor. */
	String base="";

	/** The project name given to the 4-argument constructor. */
	String project="";

	/**
	 * Directory holding the CWB executables. When still null at the time
	 * {@link #run(List, File, File, String)} is called, it is read from the Toolbox parameters.
	 */
	String cwbLoc;

	/** Language written in the "lang" attribute of the corpus root element and given to the WTC builder. */
	private String lang ="fr";

	/** The text attributes (not used by the code of this class). */
	static String textAttributes = "";

	/** When true, the CWB wrappers are run in debug mode. */
	boolean debug = false;

	/**
	 * Creates a compiler with no input stream; this is the constructor to use
	 * before calling {@link #run(List, File, File, String)}.
	 */
	public compiler(){
	}

	/**
	 * Instantiates a new compiler reading one XML document.
	 *
	 * Failures to open or parse the URL are reported on the console and leave
	 * the parser unset; they are not rethrown.
	 *
	 * @param url the url of the XML document to read
	 * @param text the text
	 * @param base the base
	 * @param project the project
	 */
	public compiler(URL url,String text,String base, String project) {
		this.text = text
		this.base = base;
		this.project = project;
		this.url = url;
		try {
			inputData = url.openStream();

			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
		} catch (XMLStreamException ex) {
			System.out.println(ex);
			// The stream was opened but no reader owns it: close it here or it leaks.
			if (inputData != null) {
				try {
					inputData.close();
				} catch (IOException ignored) {
					// nothing more can be done with a stream that cannot even be closed
				}
				inputData = null;
			}
		} catch (IOException ex) {
			System.err.println("IOException while parsing " + url + ": " + ex);
		}
	}

	/**
	 * Sets the language of the corpus.
	 *
	 * @param lang the lang
	 * @return the value that was assigned (the method has no declared return type)
	 */
	public setLang(String lang)
	{
		this.lang = lang;
	}

	/**
	 * Sets the directory containing the CWB executables.
	 * A warning is printed when the path does not exist, but the value is stored anyway.
	 *
	 * @param path the new cwb path
	 */
	public void setCwbPath(String path) {
		if (!new File(path).exists()) {
			System.err.println("Wrong CWB directory path: "+path)
		}
		cwbLoc = path;
	}

	/**
	 * Opens {@link #output} as a UTF-8 writer on the given file.
	 * The file is opened in append mode when it already exists.
	 *
	 * @param f the file to write to
	 * @return true, if the writer could be opened
	 */
	private boolean createOutput(File f){
		try {
			output = new OutputStreamWriter(new FileOutputStream(f,f.exists()) , "UTF-8");
			return true;
		} catch (Exception e) {
			System.err.println(e);
			return false;
		}
	}

	/**
	 * Appends a piece of text to the WTC file and closes the writer again.
	 *
	 * @param wtcFile the file to append to
	 * @param content the text to append
	 * @return true, if the text was written and the writer closed without error
	 */
	private boolean appendToWtc(File wtcFile, String content) {
		if (!createOutput(wtcFile)) {
			return false;
		}
		try {
			output.write(content);
			output.close();
			return true;
		} catch (IOException e) {
			System.err.println(e);
			try {
				output.close();
			} catch (IOException ignored) {
				// the write failure above has already been reported
			}
			return false;
		}
	}

	/**
	 * Returns the file name without its last extension
	 * ("a.b.xml" gives "a.b"; a name without a dot is returned unchanged).
	 *
	 * @param name a file name
	 * @return the name up to, but not including, its last dot
	 */
	private static String stripExtension(String name) {
		int dot = name.lastIndexOf('.');
		return dot > 0 ? name.substring(0, dot) : name;
	}

	/**
	 * Moves the parser forward until the end tag of the "teiHeader" element
	 * has been read, or until the end of the document.
	 * Requires the parser created by the 4-argument constructor.
	 */
	private void GoToText() {
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
				return;
			}
		}
	}

	/**
	 * Switches the debug mode on.
	 */
	public void setDebug()
	{
		debug = true;
	}

	/**
	 * Builds the WTC file from the source files, then the CWB indexes.
	 *
	 * The "wtc", "data/&lt;corpusname&gt;" and "registry" directories of binDir are
	 * deleted and recreated. A source file that fails to be transformed is
	 * reported and skipped; it does not stop the run.
	 *
	 * @param files the XML-TXM files to index; the list is sorted in place
	 * @param binDir the existing directory receiving the "wtc", "data" and "registry" directories
	 * @param txmDir not used by this method
	 * @param corpusname the name of the corpus
	 * @return true, if successful
	 */
	public boolean run(List<File> files, File binDir, File txmDir, String corpusname) {
		String binDirPath = binDir.getAbsolutePath()+"/";

		if (cwbLoc == null)
			cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

		if (!new File(cwbLoc).exists() || !binDir.exists()) {
			println ("Wrong CWB directory path: "+cwbLoc+" , or Wrong corpus source path:"+binDirPath)
			return false;
		}

		// Executable paths are built by plain concatenation below, so make sure
		// the directory ends with a separator ("/opt/cwb/bin" would otherwise
		// give "/opt/cwb/bincwb-encode").
		String cwbDir = cwbLoc;
		if (!cwbDir.endsWith("/") && !cwbDir.endsWith(File.separator)) {
			cwbDir += File.separator;
		}

		// Checked before anything is deleted: nothing to index means nothing to rebuild.
		if (files == null || files.isEmpty()) {
			println "there was no files to process: "+files
			return false;
		}

		// Start from empty output directories. The WTC file is always opened in
		// append mode when it exists, so a leftover one would be extended
		// instead of replaced.
		File wtcDir = new File(binDir, "wtc");
		File dataDir = new File(binDir, "data/" + corpusname);
		File registryDir = new File(binDir, "registry");
		for (File d : [wtcDir, dataDir, registryDir]) {
			d.deleteDir();
			d.mkdirs(); // mkdirs, not mkdir: "data" may not exist yet
			if (!d.isDirectory()) {
				println "Failed to create directory: "+d
				return false;
			}
		}
		File wtcFile = new File(wtcDir, corpusname + ".wtc");

		//start corpus
		if (!appendToWtc(wtcFile, "<txmcorpus lang=\""+lang+"\">\n")) {
			println "Failed to write "+wtcFile
			return false;
		}

		//1- Transform into WTC file
		Collections.sort(files);
		XMLTXM2WTC wtcbuilder = null;
		int processed = 0;
		for (File f : files) {
			// progress display: comma separated names, five per line
			if (processed > 0) {
				print(", ")
				if ((processed % 5) == 0) println ""
			}
			print(f.getName().replaceFirst("\\.xml", ""));
			processed++

			wtcbuilder = new XMLTXM2WTC(f.toURI().toURL());
			String txtname = stripExtension(f.getName());
			wtcbuilder.setTextInfo(txtname, corpusname, "project");

			wtcbuilder.setBalisesToKeep(["text","p","s"]);
			wtcbuilder.setSendToPAttributes(["s":["id"], "p":["id"]]);
			wtcbuilder.setLang(lang);
			if (!wtcbuilder.transformFile(wtcFile)) {
				println("Failed to compile "+f)
			}
		}

		//end corpus
		if (!appendToWtc(wtcFile, "</txmcorpus>\n")) {
			println "Failed to write "+wtcFile
			return false;
		}

		//2- Import into CWB
		CwbEncode cwbEn = new CwbEncode();
		CwbMakeAll cwbMa = new CwbMakeAll();

		// The attribute lists are taken from the builder of the last file processed.
		List<String> pAttributesList = wtcbuilder.getpAttributs();
		List<String> sAttributesList = wtcbuilder.getsAttributs();
		println "pAttrs : "+pAttributesList
		println "sAttrs : "+sAttributesList
		String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
		String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

		try {
			cwbEn.setDebug(debug);
			String regPath = binDirPath + "registry/"+corpusname.toLowerCase()
			cwbEn.run(
					cwbDir + "cwb-encode",
					binDirPath + "data/" + corpusname,
					binDirPath + "wtc/"+corpusname+".wtc",
					regPath,
					pAttributes,
					sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.setDebug(debug);
			cwbMa.run(cwbDir + "cwb-makeall", corpusname, binDirPath + "registry");

		} catch (Exception ex) { System.err.println(ex); return false;}

		return true;
	}

	/**
	 * The main method: indexes the files found in ~/xml/discours/txm/
	 * as the "discours" corpus.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args) {
		File dir = new File(System.getProperty("user.home"),"xml/discours/txm/");
		File[] found = dir.listFiles();
		if (found == null) {
			// listFiles() returns null when the path is not a readable directory
			println "Cannot list the source directory: "+dir
			return;
		}
		List<File> files = new ArrayList<File>(Arrays.asList(found));

		def c = new compiler()
		// setCwbLoc is the setter Groovy generates for the cwbLoc property
		c.setCwbLoc(System.getProperty("user.home")+"/TXM/cwb/bin/")

		// NOTE(review): the parent of the "txm" directory is used as the binary
		// directory here; confirm this matches the intended corpus layout.
		File binDir = dir.getParentFile();
		if (!c.run(files, binDir, dir, "discours")) {
			println "Compiler failed"
			return;
		}
	}
}