// Source (repository viewer export): root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / discours / compiler.groovy @ 1804

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
// $LastChangedRevision: 3219 $
// $LastChangedBy: mdecorde $
//
package org.txm.scripts.importer.discours
29

    
30
import org.txm.importer.cwb.BuildCwbEncodeArgsFromTEITXM;
31
import org.txm.importer.cwb.CwbEncode
32
import org.txm.importer.cwb.CwbMakeAll
33
import org.txm.scripts.importer.*;
34
import org.txm.scripts.*;
35
import org.txm.importer.scripts.xmltxm.*;
36
import org.txm.utils.treetagger.TreeTagger;
37
import org.txm.objects.*
38
import javax.xml.stream.*;
39
import java.net.URL;
40
import java.io.File;
41
import java.util.HashMap;
42
import java.util.List;
43
import org.txm.searchengine.cqp.corpus.*
44

    
45
/**
 * Builds the CQP source file of a corpus from a list of XML files (one
 * XMLTXM2CQP pass per file) and then indexes it by running CwbEncode and
 * CwbMakeAll.
 */
class compiler {

    /** The input data. Not used by any method of this class. */
    private def inputData;

    /** The factory. Not used by any method of this class. */
    private def factory;

    /** The parser read by GoToText(). It is never assigned in this class. */
    private XMLStreamReader parser;

    /** The dir. Not used by any method of this class. */
    private def dir;

    /** The writer opened by createOutput(File). */
    private def output;

    /** The url. Not used by any method of this class. */
    private def url;

    /** The anahash. Not used by any method of this class. */
    private HashMap<String,String> anahash = new HashMap<String,String>() ;

    /** The text. Not used by any method of this class. */
    String text="";

    /** The base. Not used by any method of this class. */
    String base="";

    /** The corpus language, written in the txmcorpus tag and given to the CQP builder. */
    private String lang ="fr";

    /** The text attributes. Not used by any method of this class. */
    static String textAttributes = "";

    /** When true, the CWB steps of run() are executed with their debug flag set. */
    boolean debug = false;

    /**
     * Creates a compiler with the default settings (lang "fr", debug off).
     */
    public compiler(){
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the language code
     * @return the method declares no explicit return type; callers should not rely on the result
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Opens a UTF-8 writer on the given file and stores it in the output field.
     * The file is opened in append mode when it already exists.
     *
     * @param f the file to write to
     * @return true if the writer was opened, false if an exception occurred
     *         (the exception is printed on the standard error stream)
     */
    private boolean createOutput(File f){
        try {
            // second argument of FileOutputStream is the append flag
            output = new OutputStreamWriter(new FileOutputStream(f,f.exists()) , "UTF-8");
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Appends a string to the given file through createOutput(File) and always
     * closes the writer, even if the write fails.
     *
     * @param f the file to append to
     * @param content the text to write
     * @return true if the file was opened and written, false if it could not be opened
     */
    private boolean appendToCqpFile(File f, String content) {
        if (!createOutput(f)) {
            return false;
        }
        try {
            output.write(content);
        } finally {
            // release the file handle whatever happened during the write
            output.close();
        }
        return true;
    }

    /**
     * Advances the parser until the end tag of the teiHeader element has been
     * read, or until the end of the document.
     */
    private void GoToText() {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT) {
                if (parser.getLocalName().equals("teiHeader")) {
                    return;
                }
            }
        }
    }

    /**
     * Turns the debug mode on.
     */
    public void setDebug()
    {
        debug = true;
    }

    /**
     * Builds the CQP file of the corpus from the given files, then runs
     * CwbEncode and CwbMakeAll on it.
     *
     * @param project the project whose main corpus is cleaned, deleted or created
     * @param files the XML files to process; the list is sorted in place
     * @param binDir the directory receiving the "cqp", "data" and "registry" directories
     * @param txmDir not used by this method
     * @param corpusname the corpus name, used for the CQP file, the data directory and the registry file
     * @return true if successful; false if the CWB executables are not available,
     *         the corpus tags could not be written, there was no file to process,
     *         the registry file was not created or an exception occurred during the CWB steps
     */
    public boolean run(Project project, List<File> files, File binDir, File txmDir, String corpusname) {
        String binDirPath = binDir.getAbsolutePath()+"/";

        if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
            println ("Error: CWB executables not well set.")
            return false;
        }
        CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
        if (corpus != null) {
            if (project.getDoUpdate()) {
                corpus.clean(); // remove old files
            } else {
                corpus.delete(); // remove old files and TXMResult children
            }
        } else {
            corpus = new MainCorpus(project);
            corpus.setID(project.getName());
            corpus.setName(project.getName());
        }
        // NOTE(review): the description mentions "CNR+CSV" although this class is in the discours package - confirm
        corpus.setDescription("Built with the CNR+CSV import module");

        File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
        cqpFile.delete()
        new File(binDir,"cqp").mkdirs()
        new File(binDir,"data").mkdirs()
        new File(binDir,"registry").mkdirs()

        //start corpus
        if (!appendToCqpFile(cqpFile, "<txmcorpus lang=\""+lang+"\">\n")) {
            // without the opening tag the CQP file would have no root element
            println("Error: could not write the corpus start tag in "+cqpFile)
            return false;
        }

        def processed = 0
        //1- Transform into CQP file

        Collections.sort(files);
        XMLTXM2CQP cqpbuilder = null;
        for (File f : files) {
            // progress output: names separated by ", ", with a line break every 5 names
            if (processed > 0) { print(", ") }
            if (processed > 0 && (processed % 5) == 0) println ""
            print(f.getName().replaceFirst("\\.xml", ""));
            processed++

            cqpbuilder = new XMLTXM2CQP(f.toURI().toURL());
            // drops the last 4 characters of the file name (presumably the ".xml" extension)
            String txtname = f.getName().substring(0,f.getName().length()-4);
            cqpbuilder.setTextInfo(txtname, corpusname, "project");

            cqpbuilder.setBalisesToKeep(["text","p","s"]);
            cqpbuilder.setSendToPAttributes(["s":["id"], "p":["id"]]);
            cqpbuilder.setLang(lang);
            if (!cqpbuilder.transformFile(cqpFile)) {
                // a failing file is reported but does not stop the import
                println("Failed to compile "+f)
            }

        }

        //end corpus
        if (!appendToCqpFile(cqpFile, "</txmcorpus>\n")) {
            println("Error: could not write the corpus end tag in "+cqpFile)
            return false;
        }

        if (cqpbuilder == null) {
            println "there was no files to process: "+files
            return false;
        }

        //2- Import into CWB
        def outDir = binDirPath;

        CwbEncode cwbEn = new CwbEncode();
        CwbMakeAll cwbMa = new CwbMakeAll();

        // the attribute lists are taken from the builder of the last processed file
        List<String> pAttributesList = cqpbuilder.getpAttributs();
        List<String> sAttributesList = cqpbuilder.getsAttributs();
        println "pAttrs : "+pAttributesList
        println "sAttrs : "+sAttributesList
        String[] pAttributes = pAttributesList.toArray(new String[pAttributesList.size()])
        String[] sAttributes = sAttributesList.toArray(new String[sAttributesList.size()])

        try {
            //println "Indexing "+compiler.textAttributes
            cwbEn.setDebug(debug);
            // the registry file name is the lower-cased corpus name
            String regPath = outDir + "registry/"+corpusname.toLowerCase()
            cwbEn.run(
                    outDir + "data/$corpusname",
                    outDir + "/cqp/"+corpusname+".cqp",
                    regPath,
                    pAttributes,
                    sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            //println "Binding all indexes"
            cwbMa.setDebug(debug);
            cwbMa.run(corpusname, outDir + "registry");

        } catch (Exception ex) { System.err.println(ex); return false;}

        return true;
    }

    /**
     * The main method: runs the compiler on the files of the
     * "xml/discours/txm/" directory of the user home.
     *
     * NOTE(review): this entry point calls setCwbLoc(String) and
     * run(List, String), neither of which is declared in this class; it looks
     * stale and should be checked against run(Project, List, File, File, String)
     * before being used.
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args) {
        File dir = new File(System.getProperty("user.home"),"xml/discours/txm/");
        List<File> files = dir.listFiles();
        def c = new compiler()
        c.setCwbLoc(System.getProperty("user.home")+"/TXM/cwb/bin/")
        if (!c.run(files,"discours")) {
            println "Compiler failed"
            return;
        }
    }
}