Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / transcriber / transcriberLoader.groovy @ 479

History | View | Annotate | Download (11.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.importer.transcriber;
45

    
46
import java.io.File;
47
import org.txm.scripts.teitxm.*;
48
import org.txm.*;
49
import org.txm.objects.*;
50
import org.txm.utils.i18n.*;
51
import org.txm.utils.*;
52
import org.txm.importer.*;
53
import org.txm.metadatas.*;
54
import org.txm.utils.io.FileCopy;
55
import org.w3c.dom.Element
56
import org.txm.utils.xml.DomUtils;
57

    
58
//PARAMETERS
59
// If true, the transcription of the speakers (en1 and enq2) defined in the
// metadatas file will be ignored.
boolean removeInterviewer = false
// Only referenced by commented-out code in the visible part of this script.
boolean includeComments = false
// Forwarded to the compiler step through setIgnoreTranscriberMetadata().
boolean ignoreTranscriberMetadata = false
// Default CSV header count; import.properties may provide another value.
int csvHeaderNumber = 1
// NOTE(review): not referenced anywhere in the visible part of this script.
int maxlines = 200
64

    
65
// Home directory of the current user; the developer-mode fixtures below are
// resolved against it.
String userDir = System.getProperty("user.home");

// Progress monitor supplied through the script binding; stays null when absent.
def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
    // Both names come from the script binding; a missing one raises an
    // exception, which is taken as "running outside of the import job".
    params = paramsBinding;
    MONITOR = monitor
} catch (Exception e) {
    // exception means we debug
    println "DEV MODE";
    debug = true
    params = new BaseParameters(new File(userDir, "xml/anapovoas/import.xml"))
    params.load()
    if (!org.txm.Toolbox.isInitialized()) {
        // Minimal Toolbox configuration for a stand-alone developer run.
        Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
        Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
        Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, "\t");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
    }
}
if (params == null) { println "no parameters. Aborting"; return; }
87

    
88
// Locate the corpus entry inside the import parameters.
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// Settings stored as attributes of the corpus element.
String lang = corpusElem.getAttribute("lang")
// The language code is reused as the model name (".par" is appended at annotation time).
String model = lang
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// Optional XSL stylesheet and its parameters.
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// Settings of the "default" edition.
int wordsPerPage = params.getWordsPerPage("default")
String page_element = params.getPageElement("default")
boolean build_edition = params.getDoEdition("default")
101

    
102
// Source directory holding the files to import.
File srcDir = new File(rootDir)

// Binary corpus directory: wiped and recreated on every run.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/" + basename)
binDir.deleteDir()
binDir.mkdirs()
if (!binDir.exists()) {
    println "Could not create binDir " + binDir
    return
}

// Directory passed to the importer, annotation and compiler steps; also recreated.
File txmDir = new File(binDir, "txm/" + corpusname)
txmDir.deleteDir()
txmDir.mkdirs()
114

    
115
// Get metadata values from the optional "metadata.csv" file of the source
// directory. The file is first copied into binDir and the copy is parsed.
// `metadatas` stays null when there is no such file.
Metadatas metadatas; // text metadata
File allmetadatasfile = new File(srcDir, "metadata.csv");
println "Trying to read metadatas values from: "+allmetadatasfile
if (allmetadatasfile.exists()) {
    File copy = new File(binDir, "metadata.csv")
    if (!FileCopy.copy(allmetadatasfile, copy)) {
        println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
        return;
    }
    // NOTE(review): the last argument is a hard-coded 1 while the script also
    // declares csvHeaderNumber (default 1, overridable from import.properties,
    // which is read later) - presumably they are meant to agree; confirm.
    metadatas = new Metadatas(copy,
            Toolbox.getParam(Toolbox.METADATA_ENCODING),
            Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR),
            Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR),
            1)
} else {
    println "no metadata file: "+allmetadatasfile
}
129

    
130
// Optional per-corpus overrides read from "import.properties" in the source
// directory. Recognized keys: removeInterviewer, ignoreTranscriberMetadata,
// metadataList (values separated by '|') and csvHeaderNumber (integer).
File propertyFile = new File(srcDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;
if (propertyFile.exists() && propertyFile.canRead()) {
    // withInputStream closes the stream even when load() throws.
    propertyFile.withInputStream { input -> props.load(input) }

    String removeInterviewerProp = props.getProperty("removeInterviewer")
    if (removeInterviewerProp != null)
        removeInterviewer = Boolean.parseBoolean(removeInterviewerProp);

    String ignoreMetadataProp = props.getProperty("ignoreTranscriberMetadata")
    if (ignoreMetadataProp != null)
        ignoreTranscriberMetadata = Boolean.parseBoolean(ignoreMetadataProp);

    String metadataListProp = props.getProperty("metadataList")
    if (metadataListProp != null) {
        // split() takes a regular expression: an unescaped "|" is an empty
        // alternation and would cut the value between every character.
        metadatasToKeep = metadataListProp.split("\\|");
    }

    String csvHeaderProp = props.getProperty("csvHeaderNumber")
    if (csvHeaderProp != null) {
        // The value is a single integer; keep the default when it is malformed.
        try {
            csvHeaderNumber = Integer.parseInt(csvHeaderProp.trim());
        } catch (NumberFormatException e) {
            println " invalid csvHeaderNumber '"+csvHeaderProp+"', keeping "+csvHeaderNumber
        }
    }
    //if (props.getProperty("includeComments") != null)
    //        includeComments = props.get("includeComments").toString();

    println "import properties: "
    println " removeInterviewer: "+removeInterviewer
    println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata
    println " metadataToKeep: "+metadatasToKeep
    println " ignored csvHeaderSize: "+csvHeaderNumber
    //println " includeComments: "+includeComments
}
156

    
157
// Optional XSL step: when a stylesheet is configured, the sources are
// processed into binDir/src and, on success, that directory becomes the
// source directory for the rest of the script.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(1, "XSL")
}
if (xsl != null && !xsl.trim().isEmpty()) {
    File xslOutDir = new File(binDir, "src")
    if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutDir)) {
        srcDir = xslOutDir
    }
    println ""
}
165

    
166
// Main import pipeline: select the *.trs files, run the importer, validate
// the produced XML, optionally remove interviewer speakers, annotate,
// compile the indexes and, when requested, build the HTML editions.
// NOTE(review): any exception is only logged by the catch clause at the end,
// so the statements following this try/catch still run after a failure.
try {
    // select only trs files
    String ext = "trs";
    ArrayList<File> trsfiles = srcDir.listFiles(); //find all trs files
    if (trsfiles == null) {
        println ("No files in "+srcDir.getAbsolutePath())
        return false;
    }
    // Drop in place every file that is not a readable, visible *.trs file.
    for (int i = 0 ; i < trsfiles.size() ; i++) {
        File f = trsfiles.get(i);
        if (!f.getName().endsWith(ext) || !f.canRead() || f.isHidden()) {
            trsfiles.remove(i)
            i--; // stay on the same index: the next element has shifted into it
        }
    }

    if (trsfiles.size() == 0) {
        println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
        return false;
    }

    if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
    if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
    println "-- IMPORTER"
    def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir
    if (!imp.run()) {
        println "Failed to prepare files - Aborting";
        return;
    }
    if (MONITOR != null) MONITOR.worked(20)

    // Files that fail validation are deleted and therefore skipped by the next steps.
    println "-- Xml Validation"
    if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
    for (File infile : txmDir.listFiles()) {
        if (!ValidateXml.test(infile)) {
            println "$infile : Validation failed";
            infile.delete();
        }
    }

    if (MONITOR != null) MONITOR.worked(5)
    if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
    println "-- Remove interviewer: "+removeInterviewer
    if (removeInterviewer) {
        if (metadatas == null) {
            println "Can't remove interviewer without a metadata.csv file defining who are the interviewers."
        } else {
            println "Removing some speakers in "+txmDir.listFiles().length+" file(s)"
            for (File infile : txmDir.listFiles()) {
                // The metadata of a file is looked up by its name up to ".xml".
                String filename = infile.getName();
                int idx = filename.indexOf(".xml");
                if (idx > 0)
                    filename = filename.substring(0, idx);

                ArrayList<Pair<String, String>> metas = metadatas.get(filename)
                //println "filename=$filename metas= $metas"
                for (Pair p : metas) {
                    // Entries whose first element starts with "enq" are passed to RemoveSpeaker.
                    if (p.getFirst().startsWith("enq")) {
                        new RemoveSpeaker(infile, infile, p.getFirst())
                    }
                }
            }
        }
    }

    if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
    if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
    println "-- ANNOTATE - Running NLP tools"
    // True only when annotation was requested and Annotate.run() returned true.
    boolean annotationSuccess = false;
    if (annotate && new Annotate().run(binDir, txmDir, model+".par")) {
        annotationSuccess = true;
    }

    if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
    if (MONITOR != null) MONITOR.worked(25, "COMPILING")
    println "--COMPILING - Building Search Engine indexes"
    // From here on the list holds the files of txmDir, not the *.trs sources.
    trsfiles = txmDir.listFiles();

    def comp = new compiler()
    if (debug) comp.setDebug();
    comp.removeInterviewers(removeInterviewer);
    comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata);
    comp.setAnnotationSucces(annotationSuccess)
    if (!comp.run(trsfiles, corpusname, "default", binDir)) {
        println "Failed to compile files";
        return;
    }

    if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

    // The HTML directory is recreated even when no edition is built.
    File htmlDir = new File(binDir,"HTML/$corpusname");
    htmlDir.deleteDir()
    htmlDir.mkdirs();
    if (build_edition) {

        if (MONITOR != null) MONITOR.worked(20, "EDITION")
        println "-- EDITION - Building editions"

        List<File> filelist = txmDir.listFiles();
        Collections.sort(filelist);

        println "Paginating texts: "
        for (File txmFile : filelist) {
            print "."
            // Text name = file name without its last extension.
            String txtname = txmFile.getName();
            int dot = txtname.lastIndexOf(".");
            if (dot > 0) txtname = txtname.substring(0, dot);

            List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
            List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

            Element text = params.addText(corpusElem, txtname, txmFile);

            def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas);
            Element edition = params.addEdition(text, "default", htmlDir.getAbsolutePath(), "html");

            // Register one page per generated page file; page numbers are 1-based.
            for (int page = 0 ; page < ed.getPageFiles().size() ; page++) {
                String wordid = ed.getIdx().get(page);
                params.addPage(edition, ""+(page + 1), wordid);
            }

            if (ed.getPageFiles().size() > 0) {
                Element editionBD = params.addEdition(text, "onepage", htmlDir.getAbsolutePath(), "html");
                // NOTE(review): getIndexes() is used here while the loop above
                // uses getIdx() - confirm that the pager class provides both.
                params.addPage(editionBD, "1", ed.getIndexes().get(0));
            }
        }

        //copy transcriber.css
        File cssfile = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "css/transcriber.css")
        if (cssfile.exists() && htmlDir.exists()) {
            FileCopy.copy(cssfile, new File(htmlDir, "transcriber.css"));
            FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"));
            FileCopy.copy(cssfile, new File(htmlDir, "default/transcriber.css"));
        }
    }
} catch (Exception e) {
    org.txm.utils.logger.Log.printStackTrace(e);
}
305

    
306
// Finalization: save the import parameters next to the binary corpus and
// raise the readyToLoad flag.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml")
DomUtils.save(params.root.getOwnerDocument(), paramFile)
// readyToLoad is not declared in the visible part of this script - presumably
// it is a script binding variable read by the caller; confirm.
readyToLoad = true