Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / wtc / compiler.groovy @ 479

History | View | Annotate | Download (5.8 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2012-01-05 14:27:34 +0100 (jeu., 05 janv. 2012) $
// $LastChangedRevision: 2096 $
// $LastChangedBy: mdecorde $
//
30
package org.txm.importer.wtc
31

    
32
import org.txm.Toolbox;
33
import org.txm.importer.cwb.*
34
import org.txm.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.utils.io.FileCopy;
38
import org.txm.utils.treetagger.TreeTagger;
39
import java.util.ArrayList;
40
import javax.xml.stream.*;
41
import java.net.URL;
42
import java.io.File;
43
import java.util.HashMap;
44
import java.util.List;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
 * Builds the CWB indexes of a corpus from a WTC source file.
 *
 * The source WTC file is copied into the "wtc" sub-directory of a binary
 * directory, then handed to the cwb-encode and cwb-makeall executables through
 * the CwbEncode and CwbMakeAll wrappers.
 *
 * The word (positional) and structure attributes are read from the source
 * registry file when one is given. Otherwise they are taken from the WTC
 * file itself, and a generated 'id' column is appended to every word line.
 */
class compiler
{
    /** When true, the debug flag is forwarded to the CwbEncode and CwbMakeAll wrappers. */
    boolean debug = false;

    /** Not read or written by this class. */
    private def dir;

    /**
     * Location of the directory containing the CWB executables. When null,
     * run() initialises it from the Toolbox CQI_SERVER_PATH_TO_CQPLIB parameter.
     */
    String cwbLoc;

    /** The source WTC file and the (possibly null) source registry file. */
    File srcWTCFile, srcRegistryFile;

    /** Word attribute names passed to cwb-encode by the last call to run(). */
    public def pAttributesList = [];

    /** Structure attribute declarations passed to cwb-encode by the last call to run(). */
    public def sAttributesList = [];

    /**
     * Instantiates a new compiler.
     *
     * @param wtcFile the WTC file to compile
     * @param registryFile the registry file declaring the attributes of the
     *        WTC file, or null to let run() discover them in the WTC file
     */
    public compiler(File wtcFile, File registryFile) {
        this.srcWTCFile = wtcFile;
        this.srcRegistryFile = registryFile
    }

    /**
     * Switches the debug mode on.
     *
     * @return true, the value assigned to the debug flag
     */
    public setDebug()
    {
        debug =true;
    }

    /**
     * Sets the cwb path.
     *
     * A message is printed on the error stream when the path does not exist,
     * but the path is stored anyway; run() checks it again before using it.
     *
     * @param path the directory containing the CWB executables
     */
    public void setCwbPath(String path)
    {
        if (!new File(path).exists())
            System.err.println("CWB Path : "+path+" does not exists")
        cwbLoc = path;
    }

    /**
     * Copies the source WTC file into the binary directory, encodes it with
     * cwb-encode and indexes it with cwb-makeall.
     *
     * The "wtc" and "data" sub-directories of the binary directory are
     * deleted and re-created; the "registry" sub-directory is created if
     * needed.
     *
     * @param binDir the existing directory receiving the "wtc", "data" and
     *        "registry" sub-directories
     * @param corpusname the corpus name; its lower-cased form names the
     *        registry file
     * @return true, if successful; false after printing the reason otherwise
     */
    public boolean run(File binDir, String corpusname)
    {
        if (cwbLoc == null)
            cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

        if (!new File(cwbLoc).exists()) {
            println ("CWB path error: "+cwbLoc)
            return false;
        }
        if (binDir == null || !binDir.exists()) {
            println ("binary directory does not exists: "+binDir)
            return false;
        }

        if (srcWTCFile == null || !srcWTCFile.exists()) {
            println "Error: cannot find the WTC file $srcWTCFile"
            return false;
        }

        // setCwbPath() stores the path as given, so make sure the executable
        // names are appended after a separator.
        String cwbBin = cwbLoc;
        if (!cwbBin.endsWith("/") && !cwbBin.endsWith(File.separator))
            cwbBin += File.separator;

        // 1- Prepare the binary directory and copy the source file into it
        File wtcFile = new File(binDir,"wtc/"+corpusname+".wtc");
        new File(binDir, "wtc").deleteDir();
        new File(binDir, "wtc").mkdir();
        new File(binDir, "data").deleteDir();
        new File(binDir, "data").mkdir();
        new File(binDir, "registry").mkdir();

        FileCopy.copy(srcWTCFile, wtcFile);

        //2- Import into CWB
        def outDir = binDir.getAbsolutePath()+"/";

        CwbEncode cwbEn = new CwbEncode();
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbEn.setDebug(debug);
        cwbMa.setDebug(debug);

        if (srcRegistryFile == null) {
            println "WARNING: No registry file found in source directory"
            println "We'll search for positional attributes and structural attributes in the WTC file"

            // s attributes
            BuildCwbEncodeArgsFromWTC argsgetter = new BuildCwbEncodeArgsFromWTC(); // XML stream
            argsgetter.process(wtcFile); //$NON-NLS-1$
            sAttributesList = argsgetter.getSAttributes();

            // p attributes
            File tmp = File.createTempFile("txm", ".wtc", wtcFile.getParentFile());
            println "Adding the 'id' property to the WTC file and getting word properties number."
            int nbAttr = copyWithWordIds(wtcFile, tmp);

            // The rewritten file must really replace the copy, otherwise
            // cwb-encode would read a file without the 'id' column (or no file).
            if (!wtcFile.delete() || !tmp.renameTo(wtcFile)) {
                println "Error: could not replace $wtcFile with the rewritten file $tmp"
                return false;
            }
            if (nbAttr < 1) {
                println "Error: no word line found in the WTC file $srcWTCFile"
                return false;
            }

            System.out.println("Found "+(nbAttr-1)+" word properties, $nbAttr with the 'id'");
            // Start from an empty list so that calling run() again does not
            // declare the same attributes twice.
            pAttributesList = [];
            for (int i = 1; i < nbAttr ; i++) {
                pAttributesList << "p$i";
            }
            pAttributesList << "id"
        } else {
            ReadRegistryFile reader = new ReadRegistryFile(srcRegistryFile);
            pAttributesList = reader.getPAttributes();
            sAttributesList = reader.getSAttributes();
            if (!pAttributesList.isEmpty())
                pAttributesList.remove(0) // remove word

            if (!pAttributesList.contains("id")) {
                System.out.println("Error: The registry file does not declare the 'id' word property");
                return false;
            }
        }

        String[] pAttributes = pAttributesList; // cast to array
        String[] sAttributes = sAttributesList; // cast to array

        println "pAttrs : "+Arrays.toString(pAttributes)
        println "sAttrs : "+Arrays.toString(sAttributes)

        try {
            String regPath =outDir + "/registry/"+corpusname.toLowerCase()
            cwbEn.run(cwbBin + "cwb-encode", outDir + "/data/$corpusname",
                    wtcFile.getAbsolutePath(),
                    regPath, pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(cwbBin + "cwb-makeall", corpusname, outDir + "/registry");
            return true;
        } catch (Exception ex) {
            System.out.println(ex);
            if (debug) ex.printStackTrace();
            return false;
        }
    }

    /**
     * Copies a WTC file, appending a tab and a generated "w_N" identifier
     * (N starting at 1) to every line that does not start with "<".
     * Lines starting with "<" are copied unchanged.
     *
     * @param src the WTC file to read, as UTF-8
     * @param dest the file to write, as UTF-8
     * @return the number of tab-separated columns of the first line that does
     *         not start with "<", or -1 when there is no such line
     */
    private int copyWithWordIds(File src, File dest)
    {
        int nbColumns = -1;
        int wcounter = 1;
        // withWriter flushes and closes the writer when the closure returns.
        dest.withWriter("UTF-8") { writer ->
            src.eachLine("UTF-8") { line ->
                if (!line.startsWith("<")) {
                    if (nbColumns == -1) nbColumns = line.split("\t").size();
                    writer.println(line+"\tw_"+(wcounter++))
                } else {
                    writer.println(line)
                }
            }
        }
        return nbColumns;
    }

    /**
     * The main method.
     *
     * @param args the WTC file, the binary directory, the corpus name and,
     *        optionally, the registry file
     */
    public static void main(String[] args)
    {
        if (args == null || args.length < 3) {
            System.err.println("Usage: compiler <wtc file> <binary directory> <corpus name> [registry file]")
            return;
        }
        File registryFile = (args.length > 3) ? new File(args[3]) : null;
        def comp = new compiler(new File(args[0]), registryFile);
        if (!comp.run(new File(args[1]), args[2]))
            System.err.println("Compilation of "+args[0]+" failed")
    }
}