Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / discours / compiler.groovy @ 1000

History | View | Annotate | Download (6.8 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
// $LastChangedRevision: 3219 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.importer.discours
29

    
30
import org.txm.importer.cwb.BuildCwbEncodeArgsFromTEITXM;
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.scripts.importer.*;
34
import org.txm.scripts.*;
35
import org.txm.importer.scripts.xmltxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37

    
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.util.HashMap;
42
import java.util.List;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds a CWB corpus from a list of XML-TXM files.
 *
 * {@link #run(List, File, File, String)} wraps the per-file output of
 * {@code XMLTXM2CQP} in a {@code <txmcorpus>} element inside a single
 * {@code .cqp} file, then hands that file to {@code CwbEncode} and
 * {@code CwbMakeAll}.
 */
class compiler {

    /** Stream opened from {@link #url} by the URL constructor. */
    private def inputData;

    /** StAX factory used to create {@link #parser}. */
    private def factory;

    /** StAX reader over {@link #inputData}; only set by the URL constructor. */
    private XMLStreamReader parser;

    /** The dir (not used by the code of this class). */
    private def dir;

    /** Writer opened by {@link #createOutput(File)}. */
    private def output;

    /** URL given to the URL constructor. */
    private def url;

    /** The anahash (not used by the code of this class). */
    private HashMap<String,String> anahash = new HashMap<String,String>();

    /** The text. */
    String text = "";

    /** The base. */
    String base = "";

    /** The project. */
    String project = "";

    /** Language written in the {@code lang} attribute of the corpus element. */
    private String lang = "fr";

    /** The text attributes. */
    static String textAttributes = "";

    /** When true, debug mode is forwarded to CwbEncode and CwbMakeAll. */
    boolean debug = false;

    /**
     * Creates a compiler without any input stream; enough to call
     * {@link #run(List, File, File, String)}.
     */
    public compiler() {
    }

    /**
     * Instantiates a new compiler and opens a StAX reader on the given URL.
     *
     * Failures are reported on the console and leave {@link #parser} unset;
     * no exception is propagated for I/O or XML stream errors.
     *
     * @param url the url of the XML document to read
     * @param text the text
     * @param base the base
     * @param project the project
     */
    public compiler(URL url, String text, String base, String project) {
        this.text = text
        this.base = base;
        this.project = project;
        this.url = url;
        try {
            inputData = url.openStream();

            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // the reader could not be created: do not leak the opened stream
            closeInputQuietly();
        } catch (IOException ex) {
            System.err.println("IOException while parsing " + url + ": " + ex);
        }
    }

    /** Closes {@link #inputData} if it was opened, ignoring close failures. */
    private void closeInputQuietly() {
        if (inputData == null) return;
        try {
            inputData.close();
        } catch (IOException ignored) {
            // nothing useful can be done if closing fails
        }
        inputData = null;
    }

    /**
     * Sets the language of the corpus.
     *
     * The method is declared without a return type, so Groovy returns the
     * value of its last expression (the assigned language).
     *
     * @param lang the lang
     * @return the assigned language
     */
    public setLang(String lang) {
        this.lang = lang;
    }

    /**
     * Opens {@link #output} as a UTF-8 writer on the given file.
     * The file is opened in append mode when it already exists.
     *
     * @param f the file to write to
     * @return true, if successful
     */
    private boolean createOutput(File f) {
        try {
            boolean append = f.exists();
            output = new OutputStreamWriter(new FileOutputStream(f, append), "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Appends {@code content} to the CQP file and always closes the writer.
     *
     * @param cqpFile the CQP file
     * @param content the text to append
     * @return true if the content was written and flushed
     */
    private boolean appendToCqpFile(File cqpFile, String content) {
        if (!createOutput(cqpFile)) {
            return false;
        }
        try {
            output.write(content);
            output.close(); // close() flushes, so a failure here means lost data
            return true;
        } catch (IOException e) {
            System.err.println("Error: could not write to " + cqpFile + ": " + e);
            try {
                output.close();
            } catch (IOException ignored) {
                // already reporting the first failure
            }
            return false;
        }
    }

    /**
     * Advances {@link #parser} just past the closing {@code teiHeader} tag,
     * or to the end of the document when there is none.
     * Requires the parser created by the URL constructor.
     */
    private void GoToText() {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
                return;
            }
        }
    }

    /**
     * Sets the debug.
     */
    public void setDebug() {
        debug = true;
    }

    /**
     * Builds the CQP file from the given XML-TXM files and indexes it with CWB.
     *
     * The {@code cqp}, {@code data/<corpusname>} and {@code registry}
     * directories of {@code binDir} are deleted and recreated first.
     *
     * @param files the XML-TXM files; the list is sorted in place
     * @param binDir the directory receiving the cqp, data and registry directories
     * @param txmDir not used by this method
     * @param corpusname the corpus name
     * @return true, if successful
     */
    public boolean run(List<File> files, File binDir, File txmDir, String corpusname) {
        String binDirPath = binDir.getAbsolutePath()+"/";

        if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
            println ("Error: CWB executables not well set.")
            return false;
        }

        File cqpFile = new File(binDir, "cqp/${corpusname}.cqp");
        // The cqp directory must really be removed: createOutput() appends to
        // an existing file, so a stale .cqp file would be extended on re-run.
        new File(binDir, "cqp").deleteDir();
        new File(binDir, "cqp").mkdirs();
        new File(binDir, "data/$corpusname").deleteDir();
        // mkdirs: the parent "data" directory may not exist yet
        new File(binDir, "data/$corpusname").mkdirs();
        new File(binDir, "registry").deleteDir();
        new File(binDir, "registry").mkdir();

        //start corpus
        if (!appendToCqpFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
            println "Error: could not write the corpus header in "+cqpFile
            return false;
        }

        //1- Transform into CQP file
        int processed = 0
        Collections.sort(files);
        XMLTXM2CQP cqpbuilder = null;
        for (File f : files) {
            // progress display: comma separated names, 5 per line
            if (processed > 0) { print(", ") }
            if (processed > 0 && (processed % 5) == 0) println ""
            print(f.getName().replaceFirst("\\.xml", ""));
            processed++

            cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
            // drops the last 4 characters, i.e. assumes a ".xml"-like extension
            String txtname = f.getName().substring(0, f.getName().length()-4);
            // NOTE(review): the literal "project" is passed, not the 'project' field - confirm
            cqpbuilder.setTextInfo(txtname, corpusname, "project");

            cqpbuilder.setBalisesToKeep(["text","p","s"]);
            cqpbuilder.setSendToPAttributes(["s":["id"], "p":["id"]]);
            cqpbuilder.setLang(lang);
            if (!cqpbuilder.transformFile(cqpFile)) {
                println("Failed to compile "+f)
            }
        }

        //end corpus
        if (!appendToCqpFile(cqpFile, "</txmcorpus>\n")) {
            println "Error: could not write the corpus footer in "+cqpFile
            return false;
        }

        if (cqpbuilder == null) {
            println "there was no files to process: "+files
            return false;
        }

        //2- Import into CWB
        CwbEncode cwbEn = new CwbEncode();
        CwbMakeAll cwbMa = new CwbMakeAll();

        // the attribute lists are taken from the builder of the last processed file
        List<String> pAttributesList = cqpbuilder.getpAttributs();
        List<String> sAttributesList = cqpbuilder.getsAttributs();
        println "pAttrs : "+pAttributesList
        println "sAttrs : "+sAttributesList
        String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
        String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

        try {
            cwbEn.setDebug(debug);
            String regPath = binDirPath + "registry/"+corpusname.toLowerCase()
            cwbEn.run(
                    binDirPath + "data/$corpusname",
                    // binDirPath already ends with "/"
                    binDirPath + "cqp/"+corpusname+".cqp",
                    regPath,
                    pAttributes,
                    sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.setDebug(debug);
            cwbMa.run(corpusname, binDirPath + "registry");
        } catch (Exception ex) {
            System.err.println(ex);
            return false;
        }

        return true;
    }

    /**
     * The main method: compiles the files found in
     * {@code ~/xml/discours/txm} into a corpus named "discours".
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args) {
        File txmDir = new File(System.getProperty("user.home"), "xml/discours/txm/");
        File[] found = txmDir.listFiles();
        if (found == null) {
            // listFiles() returns null when the directory is missing or unreadable
            println "Compiler failed: cannot list the files of "+txmDir
            return;
        }
        List<File> files = new ArrayList<File>(Arrays.asList(found));

        // NOTE(review): binDir is assumed to be the parent of the txm directory - confirm
        File binDir = txmDir.getParentFile();

        def c = new compiler()
        if (!c.run(files, binDir, txmDir, "discours")) {
            println "Compiler failed"
            return;
        }
    }
}