Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / limsi / compiler.groovy @ 479

History | View | Annotate | Download (5.1 kB)

1

    
2

    
3
// Copyright © 2010-2013 ENS de Lyon.
4
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
5
// Lyon 2, University of Franche-Comté, University of Nice
6
// Sophia Antipolis, University of Paris 3.
7
//
8
// The TXM platform is free software: you can redistribute it
9
// and/or modify it under the terms of the GNU General Public
10
// License as published by the Free Software Foundation,
11
// either version 2 of the License, or (at your option) any
12
// later version.
13
//
14
// The TXM platform is distributed in the hope that it will be
15
// useful, but WITHOUT ANY WARRANTY; without even the implied
16
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17
// PURPOSE. See the GNU General Public License for more
18
// details.
19
//
20
// You should have received a copy of the GNU General
21
// Public License along with the TXM platform. If not, see
22
// http://www.gnu.org/licenses.
23
//
24
//
25
//
26
// $LastChangedDate: 2012-01-05 14:27:34 +0100 (jeu., 05 janv. 2012) $
27
// $LastChangedRevision: 2096 $
28
// $LastChangedBy: mdecorde $
29
//
30
package org.txm.importer.limsi
31

    
32
import org.txm.Toolbox;
33
import org.txm.importer.cwb.*
34
import org.txm.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.scripts.teitxm.*;
37
import org.txm.stat.utils.ConsoleProgressBar
38
import org.txm.utils.io.FileCopy;
39
import org.txm.utils.treetagger.TreeTagger;
40
import java.util.ArrayList;
41
import javax.xml.stream.*;
42
import java.net.URL;
43
import java.io.File;
44
import java.util.HashMap;
45
import java.util.List;
46

    
47
// TODO: Auto-generated Javadoc
48
/**
 * Builds the CWB indexes of a corpus for the LIMSI import module.
 *
 * The source files of a directory are concatenated, in name order, inside a
 * single <txmcorpus lang="fr"> element and streamed to cwb-encode; cwb-makeall
 * is then run on the resulting registry entry.
 */
class compiler
{
        /** Debug flag forwarded to the CwbEncode and CwbMakeAll wrappers by run(). */
        boolean debug = false

        /** Not read or written by any method of this class. */
        private def dir

        /**
         * Directory holding the cwb-encode and cwb-makeall executables.
         * When null, run() resolves it from the Toolbox parameters.
         */
        String cwbLoc

        /**
         * When true, run() declares the "frpos" and "frlemma" positional attributes
         * and reads its input from binDir/annotations if that directory exists.
         */
        boolean annotationSuccess = false

        /**
         * Public lists that no method of this class reads.
         * NOTE(review): presumably meant to let callers add positional/structural
         * attributes to the cwb-encode declaration - confirm against callers.
         */
        public def pAttributesList = []
        public def sAttributesList = []

        /**
         * Switches debug mode on.
         *
         * @return true, the value just assigned (Groovy implicit return)
         */
        public setDebug()
        {
                debug = true
        }

        /**
         * Sets the directory of the CWB executables.
         * A warning is printed on stderr when the path does not exist, but the
         * value is stored anyway.
         *
         * @param path the new cwb path
         */
        public void setCwbPath(String path)
        {
                if (!new File(path).exists())
                        System.err.println("CWB Path : "+path+" does not exists")
                cwbLoc = path
        }

        /**
         * Tells the compiler whether the annotation step succeeded.
         *
         * @param val the new annotation success
         */
        public void setAnnotationSuccess(boolean val)
        {
                this.annotationSuccess = val
        }

        /**
         * Encodes the source files with cwb-encode then indexes them with cwb-makeall.
         *
         * The "wtc" and "data" sub-directories of binDir are deleted and recreated,
         * and the "registry" sub-directory is created if needed.
         *
         * @param binDir the binary directory of the corpus; must exist
         * @param txmDir the directory of the files to encode; replaced by
         *        binDir/annotations when annotationSuccess is set and that directory exists
         * @param corpusname the corpus name; its lower-cased form names the registry file
         * @return true if the registry file was created and cwb-makeall was run,
         *         false on any detected error or exception
         */
        public boolean run(File binDir, File txmDir, String corpusname)
        {
                if (cwbLoc == null)
                        cwbLoc = org.txm.Toolbox.getParam(org.txm.Toolbox.CQI_SERVER_PATH_TO_CQPLIB)+File.separator;

                if (!new File(cwbLoc).exists()) {
                        println ("CWB path error: "+cwbLoc)
                        return false
                }
                if (binDir == null || !binDir.exists()) {
                        println ("binary directory does not exists: "+binDir)
                        return false
                }

                // The executable paths are built by concatenation, so the directory must end
                // with a separator; setCwbPath() does not guarantee that.
                String cwbDir = cwbLoc
                if (!cwbDir.endsWith("/") && !cwbDir.endsWith(File.separator))
                        cwbDir = cwbDir + File.separator

                new File(binDir, "wtc").deleteDir()
                new File(binDir, "wtc").mkdir()
                new File(binDir, "data").deleteDir()
                new File(binDir, "data").mkdir()
                new File(binDir, "registry").mkdir()

                //1- Select and list the source files
                File sourceDir = txmDir
                if (annotationSuccess) {
                        File annotationDir = new File(binDir, "annotations")
                        if (annotationDir.exists()) sourceDir = annotationDir;
                }
                // listFiles() returns null when the directory is missing or unreadable
                def files = (sourceDir == null) ? null : sourceDir.listFiles()
                if (files == null) {
                        println ("Could not list the source directory: "+sourceDir)
                        return false
                }
                files = files.sort(); // sort files by name

                //2- Import into CWB
                def outDir = binDir.getAbsolutePath()+"/";

                CwbEncode cwbEn = new CwbEncode()
                CwbMakeAll cwbMa = new CwbMakeAll()
                cwbEn.setDebug(debug)
                cwbMa.setDebug(debug)

                def pAttrs = [];
                if (annotationSuccess) {
                        pAttrs << "frpos"
                        pAttrs << "frlemma"
                }
                def sAttrs = ["text:0+id", "txmcorpus:0+lang"];

                String[] pAttributes = pAttrs // cwb-encode needs String[]
                String[] sAttributes = sAttrs // cwb-encode needs String[]
                println "pAttrs : "+pAttrs
                println "sAttrs : "+sAttrs

                Writer output = null
                try {
                        String regPath =outDir + "/registry/"+corpusname.toLowerCase()
                        // null input file: the data is streamed through the encoder's output stream
                        cwbEn.run(cwbDir + "cwb-encode", outDir + "/data/$corpusname",
                                        null,
                                        regPath, pAttributes, sAttributes);

                        def encoderStream = cwbEn.getOutputStream()
                        if (encoderStream == null) {println "CWB-ENCODE OUTPUT NULL"; return false;}
                        // The files are read as UTF-8, so they are written back as UTF-8 instead of
                        // the platform default charset.
                        // NOTE(review): assumes cwb-encode is run with a UTF-8 charset - confirm in CwbEncode.
                        output = new OutputStreamWriter(encoderStream, "UTF-8")

                        ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                        println "processing "+files.size()+" files."
                        output.write("<txmcorpus lang=\"fr\">\n")
                        for (File f : files) {
                                cpb.tick()
                                output.write(f.getText("UTF-8"))
                                output.write("\n")
                                output.flush();
                        }
                        output.write("</txmcorpus>\n")
                        output.close()
                        output = null // closed normally: nothing left for the finally block
                        cwbEn.endProcess();

                        if (!new File(regPath).exists()) {
                                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                                return false;
                        }
                        cwbMa.run(cwbDir + "cwb-makeall", corpusname, outDir + "/registry")
                        return true
                } catch (Exception ex) {
                        System.out.println("Error while compiling indexes: $ex");
                        ex.printStackTrace();
                        return false;
                } finally {
                        // Do not leave the stream to cwb-encode open when a failure interrupts the feed.
                        // NOTE(review): endProcess() is not called on failure, as before - confirm whether it should be.
                        if (output != null) {
                                try { output.close() } catch (IOException ignored) { /* a failure is already being reported */ }
                        }
                }
        }

        /**
         * Command line entry point.
         *
         * @param args the binary directory, the source directory and the corpus name,
         *        in the order expected by run()
         */
        public static void main(String[] args)
        {
                if (args == null || args.length != 3) {
                        System.err.println("Usage: compiler <binDir> <txmDir> <corpusname>")
                        return
                }
                boolean success = new compiler().run(new File(args[0]), new File(args[1]), args[2])
                System.out.println(success ? "Compilation done." : "Compilation failed.")
        }
}