Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / limsi / limsiLoader.groovy @ 148

History | View | Annotate | Download (7.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.importer.limsi;
45

    
46
import org.eclipse.swt.widgets.*;
47
import org.txm.importer.transcriber.compiler.*;
48
import org.txm.importer.transcriber.pager.*;
49
import org.txm.importer.transcriber.*;
50

    
51
import java.io.File;
52

    
53
import org.txm.scripts.teitxm.*;
54
import org.txm.stat.utils.ConsoleProgressBar
55
import org.txm.*;
56
import org.txm.objects.*;
57
import org.txm.utils.i18n.*;
58
import org.txm.utils.*;
59
import org.txm.importer.*;
60
import org.txm.utils.FileCopy;
61
import org.w3c.dom.Element
62
import org.txm.rcpapplication.commands.*
63

    
64

    
65
// PARAMETERS
// The five switches below are declared here but are not referenced by the rest
// of this script as shown.
// removeInterviewer: if true, the transcription of the speakers (en1 and enq2)
// defined in the metadata file will be ignored.
boolean removeInterviewer = false;
boolean includeComments = false;
boolean ignoreTranscriberMetadata = false;
int csvHeaderNumber = 1;
int maxlines = 200;

String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
	// 'paramsBinding' and 'monitor' are not declared in this script; they are
	// expected to come from the script binding. If either lookup throws, the
	// script switches to the development setup below.
	params = paramsBinding;
	MONITOR = monitor;
} catch (Exception ignored) {
	// An exception here means we are debugging outside of the application.
	println "DEV MODE";
	debug = true;

	File devParamFile = new File(userDir, "xml/limsi/files/import.xml");
	params = new BaseParameters(devParamFile);
	println "loading "+params.paramFile
	params.load();

	boolean toolboxReady = org.txm.Toolbox.isInitialized();
	if (!toolboxReady) {
		// Development defaults for a Toolbox that was not initialized yet.
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
	}
}

// Nothing can be imported without parameters.
if (params == null) {
	println "no parameters. Aborting";
	return;
}
95

    
96
// Corpus settings read from the import parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
if (corpusElem == null) {
	// Every setting below is read from this element; without it the script
	// would fail with a NullPointerException outside of any try block.
	println "No corpus element '"+corpusname+"' in the import parameters. Aborting"
	return;
}
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang // the annotation model is named after the corpus language
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));
// xsl and xslParams are read but not used further down in this script; the
// safe navigation keeps a missing XSLT element from aborting the import.
String xsl = params.getXsltElement(corpusElem)?.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Source directory and binary corpus directory (USER_TXM_HOME/corpora/<basename>).
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);
//binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}

// Directory receiving the files produced by the importer step.
File txmDir = new File(binDir,"txm/$corpusname");
//txmDir.deleteDir();
txmDir.mkdirs();
if (!txmDir.exists()) {
	// same guard as for binDir: the following steps write into txmDir
	println "Could not create txmDir "+txmDir
	return;
}
119

    
120
// Import pipeline: select the source XML files, run the importer, optionally
// run the annotation step, compile the search engine indexes and register the
// resulting files as texts in the import parameters.
// Any failure aborts the script before the finalization step, so the corpus is
// not saved, flagged as ready or loaded after a failed build.
try {
	// select only xml files
	String ext = ".xml";
	ArrayList<File> limsiFiles = srcDir.listFiles(); // listFiles() returns null if srcDir cannot be listed
	if (limsiFiles == null) {
		println ("No files in "+srcDir.getAbsolutePath())
		return false;
	}

	// remove the parameter file, non XML files, unreadable files and hidden files
	limsiFiles.removeAll { File f ->
		f.getName().equals("import.xml") || !f.getName().endsWith(ext) || !f.canRead() || f.isHidden()
	}

	if (limsiFiles.isEmpty()) {
		println ("No transcriptions in "+srcDir.getAbsolutePath())
		return false;
	}
	println "Number of transcriptions: "+limsiFiles.size();

	if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
	if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
	println "-- IMPORTER"
	def imp = new importer(limsiFiles, binDir, txmDir) //put result in the txm folder of binDir
	if (!imp.run()) {
		println "Failed to prepare files - Aborting";
		return;
	}
	if (MONITOR != null) MONITOR.worked(20)

	if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
	if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
	println "-- ANNOTATE CQP- Running NLP tools"
	// The annotation step only runs when the corpus asks for it; the model
	// file name is the model name followed by ".par".
	boolean annotationSuccess = annotate && new AnnotateWTC().run(binDir, txmDir, model+".par");

	if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
	if (MONITOR != null) MONITOR.worked(25, "COMPILING")
	println "--COMPILING - Building Search Engine indexes"

	def comp = new compiler()
	if (debug) comp.setDebug();
	comp.setAnnotationSuccess(annotationSuccess)
	if (!comp.run(binDir, txmDir, corpusname)) {
		println "Failed to compile files";
		return;
	}

	// create HTML directory but don't build an edition
	File htmlDir = new File(binDir,"HTML/$corpusname");
	htmlDir.mkdirs();

	// Texts are registered from the "annotations" directory when the
	// annotation step succeeded, from txmDir otherwise.
	File textsDir = annotationSuccess ? new File(binDir, "annotations") : txmDir;
	def filelist = textsDir.listFiles();
	if (filelist == null) {
		// listFiles() returns null for a missing or unreadable directory
		println "Could not list files of "+textsDir+" - Aborting"
		return false;
	}
	filelist.sort();

	println "Registering texts: "+filelist.size()
	ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
	for (File txmFile : filelist) {
		cpb.tick()

		// the text name is the file name without its last extension
		String txtname = txmFile.getName();
		int i = txtname.lastIndexOf(".");
		if (i > 0) txtname = txtname.substring(0, i);

		params.addText(corpusElem, txtname, txmFile);
	}
}
catch (Exception e) {
	org.txm.utils.logger.Log.printStackTrace(e);
	// Stop here, like the explicit failure paths above, instead of falling
	// through to the finalization and loading steps.
	println "Import failed - Aborting"
	return false;
}
201

    
202
// Finalization: save the import parameters next to the binary corpus, flag the
// import as ready, load the corpus and refresh the application views.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// 'readyToLoad' is not declared in this script, so the assignment goes to the
// script binding (presumably read by the caller - TODO confirm).
readyToLoad = true;

// load corpus
println "Loading corpus..."
try {
	LoadBinaryCorpus.loadBase(binDir)
} catch (Exception e777) {
	// LoadBinaryCorpus does not exist if TXM version is < 0.7.7: fall back to AddBases
	try {
		AddBases.loadBase(binDir, MONITOR)
	} catch (Exception e) {
		// Both loaders failed: report it instead of silently continuing.
		println "Failed to load corpus from "+binDir
		org.txm.utils.logger.Log.printStackTrace(e777);
		org.txm.utils.logger.Log.printStackTrace(e);
	}
}

Toolbox.restartWorkspace();
Toolbox.restartSearchEngine();

// The views are reloaded through the SWT display, synchronously.
Display.getDefault().syncExec(new Runnable() {
	@Override
	public void run() {
		println "Reloading corpora view..."
		RestartTXM.reloadViews();
		println "import done."
	}
});