Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / cqp / compiler.groovy @ 1804

History | View | Annotate | Download (5.7 kB)

1

    
2

    
3
// Copyright © 2010-2013 ENS de Lyon.
4
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
5
// Lyon 2, University of Franche-Comté, University of Nice
6
// Sophia Antipolis, University of Paris 3.
7
//
8
// The TXM platform is free software: you can redistribute it
9
// and/or modify it under the terms of the GNU General Public
10
// License as published by the Free Software Foundation,
11
// either version 2 of the License, or (at your option) any
12
// later version.
13
//
14
// The TXM platform is distributed in the hope that it will be
15
// useful, but WITHOUT ANY WARRANTY; without even the implied
16
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17
// PURPOSE. See the GNU General Public License for more
18
// details.
19
//
20
// You should have received a copy of the GNU General
21
// Public License along with the TXM platform. If not, see
22
// http://www.gnu.org/licenses.
23
//
24
//
25
//
26
// $LastChangedDate: 2012-01-05 14:27:34 +0100 (jeu., 05 janv. 2012) $
27
// $LastChangedRevision: 2096 $
28
// $LastChangedBy: mdecorde $
29
//
30
package org.txm.scripts.importer.cqp
31

    
32
import org.txm.Toolbox;
33
import org.txm.importer.cwb.*
34
import org.txm.scripts.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.objects.*;
37
import org.txm.searchengine.cqp.corpus.*;
38
import org.txm.importer.scripts.xmltxm.*;
39
import org.txm.utils.io.FileCopy;
40
import org.txm.utils.treetagger.TreeTagger;
41
import java.util.ArrayList;
42
import javax.xml.stream.*;
43
import java.net.URL;
44
import java.io.File;
45
import java.util.HashMap;
46
import java.util.List;
47

    
48
/**
 * Builds the CWB indexes of a TXM corpus from an existing CQP source file.
 *
 * The source CQP file is copied into the "cqp" sub-directory of the binary
 * directory, then handed to the CwbEncode and CwbMakeAll wrappers. The word and
 * structure attributes are read from the source registry file when one is
 * given; otherwise they are guessed from the CQP file itself and an 'id'
 * column is appended to every word line.
 */
class compiler {
	/** When true, debug mode is forwarded to the CwbEncode and CwbMakeAll wrappers. */
	boolean debug = false;

	/** The dir. Not read by any code visible in this class. */
	private def dir;

	/** The CQP file to import and its registry file; the registry file may be null. */
	File srcCQPFile, srcRegistryFile;

	/** Names of the word (positional) attributes passed to CwbEncode, 'word' excluded; filled by run(). */
	public def pAttributesList = [];
	/** Structural attributes passed to CwbEncode; filled by run(). */
	public def sAttributesList = [];

	/**
	 * Instantiates a new compiler.
	 *
	 * @param cqpFile the source CQP file
	 * @param registryFile the source registry file, or null to guess the attributes from the CQP file
	 */
	public compiler(File cqpFile, File registryFile) {
		this.srcCQPFile = cqpFile;
		this.srcRegistryFile = registryFile
	}

	/**
	 * Enables the debug mode.
	 */
	public setDebug()
	{
		debug = true;
	}

	/**
	 * Copies the source CQP file into the binary directory and builds the CWB indexes.
	 *
	 * @param project the project owning the corpus; its name is used as corpus ID and name
	 * @param binDir the binary directory receiving the "cqp", "data" and "registry" sub-directories
	 * @param corpusname the corpus name used for the CQP file, data directory and registry file names
	 * @return true if the indexes were built; false if the CWB executables are not available,
	 *         the attributes could not be determined, the registry file was not created
	 *         or an exception was raised by the CWB steps
	 */
	public boolean run(Project project, File binDir, String corpusname)
	{
		if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
			println ("Error: CWB executables not well set.")
			return false;
		}

		// reuse the existing corpus build when there is one, otherwise create it
		CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
		if (corpus != null) {
			if (project.getDoUpdate()) {
				corpus.clean(); // remove old files
			} else {
				corpus.delete(); // remove old files and TXMResult children
			}
		} else {
			corpus = new MainCorpus(project);
			corpus.setID(project.getName());
			corpus.setName(project.getName());
		}
		corpus.setDescription("Built with the CQP import module");

		//1- Copy the source CQP file into the binary directory
		File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp");
		cqpFile.delete()
		new File(binDir,"cqp").mkdirs()
		new File(binDir,"data").mkdirs()
		new File(binDir,"registry").mkdirs()

		FileCopy.copy(srcCQPFile, cqpFile);

		//2- Import into CWB
		def outDir = binDir.getAbsolutePath()+"/";

		CwbEncode cwbEn = new CwbEncode();
		CwbMakeAll cwbMa = new CwbMakeAll();
		cwbEn.setDebug(debug);
		cwbMa.setDebug(debug);

		if (srcRegistryFile == null) {
			println "WARNING: No registry file found in source directory"
			println "We'll search for positional attributes and structural attributes in the CQP file"

			// s attributes
			BuildCwbEncodeArgsFromCQP argsgetter = new BuildCwbEncodeArgsFromCQP(); // XML stream
			argsgetter.process(cqpFile); //$NON-NLS-1$
			sAttributesList = argsgetter.getSAttributes();

			// p attributes: every line not starting with "<" is a word line and gets a
			// "w_<n>" id column; the column count of the first word line gives the
			// number of word properties
			int nbAttr = -1;
			File tmp = File.createTempFile("txm", ".cqp", cqpFile.getParentFile());
			int wcounter = 1;
			println "Adding the 'id' property to the CQP file and getting word properties number."
			tmp.withWriter("UTF-8") { writer ->
				// no per-line flush: withWriter flushes and closes the writer on exit
				cqpFile.eachLine("UTF-8") { line ->
					if (!line.startsWith("<")) {
						if (nbAttr == -1) nbAttr = line.split("\t").size();
						writer.println(line+"\tw_"+(wcounter++))
					} else {
						writer.println(line)
					}
				}
			}

			if (nbAttr == -1) {
				// nothing to encode, and the attribute count below would be meaningless
				tmp.delete()
				println "Error: no word line found in the CQP file: $cqpFile"
				return false;
			}

			cqpFile.delete()
			if (!tmp.renameTo(cqpFile)) {
				// without this check CwbEncode would fail later on a missing file
				println "Error: could not replace $cqpFile with the file containing the word ids: $tmp"
				return false;
			}

			System.out.println("Found "+(nbAttr-1)+" word properties, $nbAttr with the 'id'");
			// start from an empty list so that calling run() again does not accumulate attributes
			pAttributesList = [];
			for (int i = 1; i < nbAttr ; i++) {
				pAttributesList << "p$i";
			}
			pAttributesList << "id"
		} else {
			ReadRegistryFile reader = new ReadRegistryFile(srcRegistryFile);
			pAttributesList = reader.getPAttributes();
			sAttributesList = reader.getSAttributes();
			// the first declared attribute is taken to be 'word' and is dropped
			if (!pAttributesList.isEmpty()) {
				pAttributesList.remove(0) // remove word
			}

			if (!pAttributesList.contains("id")) {
				System.out.println("Error: The registry file does not declare the 'id' word property");
				return false;
			}
		}

		String[] pAttributes = pAttributesList; // cast to array
		String[] sAttributes = sAttributesList; // cast to array

		println "pAttrs : "+Arrays.toString(pAttributes)
		println "sAttrs : "+Arrays.toString(sAttributes)

		try {
			String regPath =outDir + "/registry/"+corpusname.toLowerCase()
			cwbEn.run(outDir + "/data/$corpusname",
					cqpFile.getAbsolutePath(),
					regPath, pAttributes, sAttributes);
			if (!new File(regPath).exists()) {
				println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
				return false;
			}
			cwbMa.run(corpusname, outDir + "/registry");
			return true;
		} catch (Exception ex) {
			System.out.println(ex);
			if (debug) ex.printStackTrace(); // keep the full trace available when debugging
			return false;
		}
	}

	/**
	 * The main method.
	 *
	 * NOTE(review): this entry point does not match the rest of the class: there is
	 * no zero-argument constructor and no run() overload taking a list of files, and
	 * java.io.File does not expand "~" to the home directory. It needs a Project to be
	 * usable - TODO confirm the intended usage and rewrite, or remove.
	 *
	 * @param args the arguments
	 */
	public static void main(String[] args)
	{
		File dir = new File("~/xml/perrault/txm/");
		List<File> files = dir.listFiles();
		new compiler().run(files);
	}
}