Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / limsi / limsiLoader.groovy @ 499

History | View | Annotate | Download (7.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.importer.limsi;
45

    
46
import org.eclipse.swt.widgets.*;
47
import org.txm.importer.transcriber.compiler.*;
48
import org.txm.importer.transcriber.pager.*;
49
import org.txm.importer.transcriber.*;
50

    
51
import java.io.File;
52

    
53
import org.txm.scripts.teitxm.*;
54
import org.txm.stat.utils.ConsoleProgressBar
55
import org.txm.*;
56
import org.txm.objects.*;
57
import org.txm.utils.i18n.*;
58
import org.txm.utils.*;
59
import org.txm.importer.*;
60
import org.txm.utils.io.FileCopy;
61
import org.w3c.dom.Element
62
import org.txm.utils.xml.DomUtils;
63
import org.txm.rcp.commands.*
64

    
65

    
66
// PARAMETERS
// Tunable flags of this loader. None of them is read again in the part of
// the script shown here.

// If true, the transcription of the speakers (en1 and enq2) defined in the
// metadata file is meant to be ignored.
boolean removeInterviewer = false;
boolean includeComments = false;
boolean ignoreTranscriberMetadata = false;
int csvHeaderNumber = 1;
int maxlines = 200;

String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
        // paramsBinding and monitor are not declared in this script; when
        // resolving either of them throws, the development fallback below runs.
        params = paramsBinding;
        MONITOR = monitor;
} catch (Exception ignored) {
        // An exception here means the script is being run for debugging.
        println "DEV MODE";
        debug = true;

        File devParamFile = new File(userDir, "xml/limsi/files/import.xml");
        params = new BaseParameters(devParamFile);
        println("loading " + params.paramFile);
        params.load();

        // Give the Toolbox a minimal configuration when nothing initialized it.
        if (!org.txm.Toolbox.isInitialized()) {
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
                Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
        }
}

// Nothing can be imported without parameters.
if (params == null) {
        println "no parameters. Aborting";
        return;
}
96

    
97
// Corpus description read from the import parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// Attributes carried by the corpus element.
String lang = corpusElem.getAttribute("lang");
String model = lang; // the annotation step appends ".par" to this value
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl");
def xslParams = params.getXsltParams(corpusElem);

// Source directory and binary corpus directory (under USER_TXM_HOME/corpora).
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/" + basename);
// An existing binDir is kept as it is: it is not deleted first.
binDir.mkdirs();
if (!binDir.exists()) {
        println("Could not create binDir " + binDir);
        return;
}

// Directory holding the files produced for this corpus; existing content is kept.
File txmDir = new File(binDir, "txm/" + corpusname);
txmDir.mkdirs();
120

    
121
// Import pipeline: select the source transcriptions, run the importer,
// optionally annotate, compile the indexes, then register every resulting
// text in the import parameters. Any failure aborts the script so that an
// incomplete corpus is neither saved nor loaded by the statements that follow.
try {
        // Keep only readable, visible XML files; import.xml is skipped by name.
        String ext = ".xml";
        ArrayList<File> limsiFiles = srcDir.listFiles();
        if (limsiFiles == null) { // listFiles() yields null when srcDir cannot be listed
                println ("No files in "+srcDir.getAbsolutePath())
                return false;
        }
        limsiFiles.removeAll { File f ->
                f.getName().equals("import.xml") || !f.getName().endsWith(ext) || !f.canRead() || f.isHidden()
        }

        if (limsiFiles.isEmpty()) {
                println ("No transcriptions in "+srcDir.getAbsolutePath())
                return false;
        }
        println "Number of transcriptions: "+limsiFiles.size();

        if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
        println "-- IMPORTER"
        def imp = new importer(limsiFiles, binDir, txmDir) // put result in the txm folder of binDir
        if (!imp.run()) {
                println "Failed to prepare files - Aborting";
                return;
        }
        if (MONITOR != null) MONITOR.worked(20)

        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
        if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
        println "-- ANNOTATE CQP- Running NLP tools"
        // Annotation runs only when requested; it counts as successful only if
        // the annotator itself reports success.
        boolean annotationSuccess = annotate && new AnnotateWTC().run(binDir, txmDir, model+".par");

        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
        if (MONITOR != null) MONITOR.worked(25, "COMPILING")
        println "--COMPILING - Building Search Engine indexes"

        def comp = new compiler()
        if (debug) comp.setDebug();
        comp.setAnnotationSuccess(annotationSuccess)
        if (!comp.run(binDir, txmDir, corpusname)) {
                println "Failed to compile files";
                return;
        }

        // create HTML directory but don't build an edition
        File htmlDir = new File(binDir,"HTML/$corpusname");
        htmlDir.mkdirs();

        // Texts are registered from the annotated files when annotation
        // succeeded, otherwise from the files in txmDir.
        File registrationDir = annotationSuccess ? new File(binDir, "annotations") : txmDir;
        def filelist = registrationDir.listFiles();
        if (filelist == null) { // directory missing or unreadable
                println "Could not list files of "+registrationDir.getAbsolutePath()+" - Aborting";
                return;
        }
        filelist.sort();

        println "Registering texts: "+filelist.size()
        ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
        for (File txmFile : filelist) {
                cpb.tick()

                // The text name is the file name without its last extension.
                String txtname = txmFile.getName();
                int i = txtname.lastIndexOf(".");
                if (i > 0) txtname = txtname.substring(0, i);

                params.addText(corpusElem, txtname, txmFile);
        }
}
catch (Exception e) {
        org.txm.utils.logger.Log.printStackTrace(e);
        // Stop here, like the other failure paths of this block.
        println "Import failed - Aborting";
        return;
}
202

    
203
// Finalization: save the import parameters into the binary corpus directory,
// load the corpus, restart the workspace and the search engine, then refresh
// the views from the UI thread.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// readyToLoad is not declared in the visible script; presumably a binding
// variable read by the caller (to be confirmed).
readyToLoad = true;

// load corpus
println "Loading corpus..."
try {
        LoadBinaryCorpus.loadBase(binDir)
} catch (Exception e777) {
        // LoadBinaryCorpus does not exist if TXM version is < 0.7.7
        try {
                AddBases.loadBase(binDir, MONITOR)
        } catch (Exception e) {
                // Both loaders failed: report the two causes instead of hiding them.
                org.txm.utils.logger.Log.printStackTrace(e777);
                org.txm.utils.logger.Log.printStackTrace(e);
        }
}

Toolbox.restartWorkspace();
Toolbox.restartSearchEngine();

// syncExec runs the view reload on the SWT display thread and waits for it.
Display.getDefault().syncExec(new Runnable() {
        @Override
        public void run() {
                println "Reloading corpora view..."
                RestartTXM.reloadViews();
                println "import done."
        }
});