Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / limsi / compiler.groovy @ 1000

History | View | Annotate | Download (4.7 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2012-01-05 14:27:34 +0100 (jeu., 05 janv. 2012) $
// $LastChangedRevision: 2096 $
// $LastChangedBy: mdecorde $
//
30
package org.txm.scripts.importer.limsi
31

    
32
import org.txm.Toolbox;
33
import org.txm.importer.cwb.*
34
import org.txm.scripts.importer.*;
35
import org.txm.scripts.*;
36
import org.txm.importer.scripts.xmltxm.*;
37
import org.txm.stat.utils.ConsoleProgressBar
38
import org.txm.utils.io.FileCopy;
39
import org.txm.utils.treetagger.TreeTagger;
40
import java.util.ArrayList;
41
import javax.xml.stream.*;
42
import java.net.URL;
43
import java.io.File;
44
import java.util.HashMap;
45
import java.util.List;
46

    
47
// TODO: Auto-generated Javadoc
48
/**
 * Feeds the files of a source directory to cwb-encode, wrapped in a single
 * <txmcorpus lang="fr"> element, then runs cwb-makeall on the resulting
 * registry entry.
 */
class compiler
{
    /** Debug flag forwarded to the CwbEncode and CwbMakeAll wrappers in run(). */
    boolean debug = false

    /** The dir. Not read or written by the code of this class. */
    private def dir

    /**
     * When true, run() reads its input from binDir/annotations (if that
     * directory exists) and declares the "frpos" and "frlemma" positional
     * attributes.
     */
    boolean annotationSuccess = false

    /** Public list, initially empty; not read or written by the code of this class. */
    public def pAttributesList = []
    /** Public list, initially empty; not read or written by the code of this class. */
    public def sAttributesList = []

    /**
     * Switches the debug flag on.
     *
     * @return the value assigned to the flag (true), as the method has no
     *         declared return type and ends on the assignment
     */
    public setDebug()
    {
        debug = true
    }

    /**
     * Sets the annotation success.
     *
     * @param val the new annotation success
     */
    public void setAnnotationSuccess(boolean val)
    {
        this.annotationSuccess = val
    }

    /**
     * Encodes the files of the source directory with cwb-encode and indexes
     * the result with cwb-makeall.
     *
     * The "cqp" and "data" sub-directories of binDir are deleted and
     * re-created, and "registry" is created if missing, before anything is
     * encoded.
     *
     * @param binDir existing directory that receives the "cqp", "data" and
     *        "registry" sub-directories
     * @param txmDir directory whose files are sent to cwb-encode, sorted;
     *        replaced by binDir/annotations when annotationSuccess is set
     *        and that directory exists
     * @param corpusname the corpus name; used as is for the data directory
     *        and for cwb-makeall, lower-cased for the registry file
     * @return true if successful, false if a precondition fails or an
     *         exception is raised while encoding or indexing
     */
    public boolean run(File binDir, File txmDir, String corpusname)
    {
        if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
            println ("Error: CWB executables not well set.")
            return false;
        }
        if (!binDir.exists()) {
            println ("binary directory does not exists: "+binDir)
            return false
        }

        //File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp")
        new File(binDir, "cqp").deleteDir()
        new File(binDir, "cqp").mkdir()
        new File(binDir, "data").deleteDir()
        new File(binDir, "data").mkdir()
        new File(binDir, "registry").mkdir()

        //1- Select and list the source files
        if (annotationSuccess) {
            File annotationDir = new File(binDir, "annotations")
            if (annotationDir.exists()) txmDir = annotationDir;
        }
        // File.listFiles() returns null when the path is not a readable directory
        def files = txmDir?.listFiles()
        if (files == null) {
            println ("Error: cannot list the files of the source directory: "+txmDir)
            return false
        }
        files.sort(); // sort files by name

        //2- Import into CWB
        // outDir already ends with "/"; the paths built from it below keep the
        // doubled separator of the original code
        def outDir = binDir.getAbsolutePath()+"/";

        CwbEncode cwbEn = new CwbEncode()
        CwbMakeAll cwbMa = new CwbMakeAll()
        cwbEn.setDebug(debug)
        cwbMa.setDebug(debug)

        def pAttrs = [];
        if (annotationSuccess) {
            pAttrs << "frpos"
            pAttrs << "frlemma"
        }
        def sAttrs = ["text:0+id", "txmcorpus:0+lang"];

        String[] pAttributes = pAttrs // cwb-encode needs String[]
        String[] sAttributes = sAttrs // cwb-encode needs String[]
        println "pAttrs : "+pAttrs
        println "sAttrs : "+sAttrs

        try {
            String regPath =outDir + "/registry/"+corpusname.toLowerCase()
            cwbEn.run(outDir + "/data/$corpusname",
                    null,
                    regPath, pAttributes, sAttributes);

            // Check the stream itself: a freshly constructed writer is never null
            def stream = cwbEn.getOutputStream()
            if (stream == null) {println "CWB-ENCODE OUTPUT NULL"; return false;}
            // The files are read as UTF-8 below, so write them out as UTF-8 too
            // instead of relying on the platform default charset.
            // NOTE(review): assumes cwb-encode expects UTF-8 input -- confirm
            def output = new OutputStreamWriter(stream, "UTF-8");
            try {
                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                println "processing "+files.size()+" files."
                output.write("<txmcorpus lang=\"fr\">\n")
                for (File f : files) {
                    cpb.tick()
                    output.write(f.getText("UTF-8"))
                    output.flush();
                    output.write("\n")
                }
                output.write("</txmcorpus>\n")
            } finally {
                // always release the stream, even when a file cannot be read
                output.close()
            }
            cwbEn.endProcess();

            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(corpusname, outDir + "/registry")
            return true
        } catch (Exception ex) {System.out.println("Error while compiling indexes: $ex"); ex.printStackTrace(); return false;}
    }

    /**
     * The main method. Runs the compiler on the directories and corpus name
     * given on the command line.
     *
     * @param args the arguments: binDir, txmDir and corpusname, in that order
     */
    public static void main(String[] args)
    {
        // run() needs all three values; print usage instead of failing on a bad call
        if (args == null || args.length < 3) {
            println "Usage: compiler <binDir> <txmDir> <corpusname>"
            return
        }
        boolean success = new compiler().run(new File(args[0]), new File(args[1]), args[2])
        println "Compilation " + (success ? "succeeded" : "failed")
    }
}