Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / transcriberLoader.groovy @ 2246

History | View | Annotate | Download (11.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.importer.transcriber;
45

    
46
import java.io.File;
47
import org.txm.importer.*
48
import org.txm.importer.scripts.xmltxm.*;
49
import org.txm.*;
50
import org.txm.core.engines.*;
51
import org.txm.objects.*;
52
import org.txm.utils.i18n.*;
53
import org.txm.utils.*;
54
import org.txm.scripts.importer.*;
55
import org.txm.metadatas.*;
56
import org.txm.utils.io.FileCopy;
57
import org.w3c.dom.Element
58
import org.txm.utils.xml.DomUtils;
59

    
60
// ---------------------------------------------------------------------------
// PARAMETERS (default values)
// ---------------------------------------------------------------------------
// If true, the transcription of the interviewer speakers defined in the
// metadata file is ignored.
boolean removeInterviewer = false
boolean includeComments = false
boolean ignoreTranscriberMetadata = false
int csvHeaderNumber = 1
int maxlines = 200

String userDir = System.getProperty("user.home")

// Both values are read from undeclared script variables (projectBinding and
// monitor) -- presumably supplied by the caller through the script binding.
def MONITOR
Project project

try {
    project = projectBinding
    MONITOR = monitor
} catch (Exception ignored) {
    // Deliberately ignored: whatever could not be read stays null and the
    // project null check just below decides whether the script can go on.
}

if (project == null) {
    println "no project set. Aborting"
    return
}
75

    
76
// ---------------------------------------------------------------------------
// Import settings read from the project
// ---------------------------------------------------------------------------
String corpusname = project.getName()
String basename = corpusname
String rootDir = project.getSrcdir()
String lang = project.getLang()
String model = lang
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()

// Settings of the "default" edition definition.
def defaultEditionDefinition = project.getEditionDefinition("default")
int wordsPerPage = defaultEditionDefinition.getWordsPerPage()
String page_element = defaultEditionDefinition.getPageElement()
boolean build_edition = defaultEditionDefinition.getBuildEdition()

// Source directory and binary (project) directory.
File srcDir = new File(rootDir)
File binDir = project.getProjectDirectory()
binDir.mkdirs()
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return
}

// Start from an empty "txm/<corpus name>" directory inside the binary directory.
File txmDir = new File(binDir, "txm/" + corpusname)
txmDir.deleteDir()
txmDir.mkdirs()
100

    
101
// Read the text metadata values from the CSV file located by
// Metadatas.findMetadataFile(); 'metadatas' stays null when that file does
// not exist.
Metadatas metadatas // text metadata
File allMetadataFile = Metadatas.findMetadataFile(srcDir)
println "Trying to read metadata values from: "+allMetadataFile
if (!allMetadataFile.exists()) {
    println "no metadata file: "+allMetadataFile
} else {
    // The metadata are parsed from a copy placed in the binary directory.
    File metadataCopy = new File(binDir, allMetadataFile.getName())
    if (!FileCopy.copy(allMetadataFile, metadataCopy)) {
        println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile()
        return
    }
    metadatas = new Metadatas(
            metadataCopy,
            Toolbox.getMetadataEncoding(),
            Toolbox.getMetadataColumnSeparator(),
            Toolbox.getMetadataTextSeparator(),
            1)
}
117

    
118
// ---------------------------------------------------------------------------
// Optional overrides read from the "import.properties" file of the source
// directory. Recognised keys:
//   removeInterviewer          boolean
//   ignoreTranscriberMetadata  boolean
//   metadataList               values separated by '|'
//   csvHeaderNumber            integer
// A key that is absent leaves the corresponding default untouched.
// ---------------------------------------------------------------------------
File propertyFile = new File(srcDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;
if (propertyFile.exists() && propertyFile.canRead()) {
    // withInputStream closes the stream even when load() throws.
    propertyFile.withInputStream { input -> props.load(input) }

    String propValue = props.getProperty("removeInterviewer")
    if (propValue != null) {
        removeInterviewer = Boolean.parseBoolean(propValue)
    }

    propValue = props.getProperty("ignoreTranscriberMetadata")
    if (propValue != null) {
        ignoreTranscriberMetadata = Boolean.parseBoolean(propValue)
    }

    propValue = props.getProperty("metadataList")
    if (propValue != null) {
        // String.split() takes a regular expression and a bare "|" is the
        // alternation operator (it would split between every character), so
        // the pipe must be escaped to split on the literal separator.
        metadatasToKeep = propValue.split("\\|")
    }

    propValue = props.getProperty("csvHeaderNumber")
    if (propValue != null) {
        // The value is a single integer: parse it instead of assigning the
        // String[] result of split() to an int variable.
        try {
            csvHeaderNumber = Integer.parseInt(propValue.trim())
        } catch (NumberFormatException e) {
            println "Warning: csvHeaderNumber is not an integer ('"+propValue+"'), keeping "+csvHeaderNumber
        }
    }
    //if (props.getProperty("includeComments") != null)
    //        includeComments = props.get("includeComments").toString();

    println "import properties: "
    println " removeInterviewer: "+removeInterviewer
    println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata
    println " metadataToKeep: "+metadatasToKeep
    println " ignored csvHeaderSize: "+csvHeaderNumber
    //println " includeComments: "+includeComments
}
144

    
145

    
146

    
147
// ---------------------------------------------------------------------------
// Front XSL step: when a stylesheet is configured, it is applied to the
// sources with "<binDir>/src" as output directory; if processImportSources()
// reports success, that directory becomes the new source directory.
// ---------------------------------------------------------------------------
if (MONITOR != null) {
    if (MONITOR.isCanceled()) {
        return MONITOR.done()
    }
    MONITOR.worked(1, "XSL")
}

boolean frontXslConfigured = xsl != null && xsl.trim().length() > 0
if (frontXslConfigured) {
    File xslOutputDir = new File(binDir, "src")
    if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir)) {
        srcDir = xslOutputDir
    }
    println ""
}
155

    
156
// ---------------------------------------------------------------------------
// Main import pipeline. Steps, in order:
//   1. select the *.trs files of the source directory
//   2. IMPORTER: run the importer on them (output goes to txmDir)
//   3. validate the produced XML files, deleting the ones that fail
//   4. optionally remove the interviewer speakers
//   5. optionally annotate with the "TreeTagger" engine
//   6. COMPILING: run the compiler on the files of txmDir
//   7. optionally build the "default" edition, copy the CSS and media files
// Any exception is caught at the end of the block and its stack trace is
// handed to the TXM logger.
// ---------------------------------------------------------------------------
try {
    // ---- 1. keep only the readable, non hidden files whose name ends with "trs"
    String ext = "trs"
    ArrayList<File> trsfiles = srcDir.listFiles()
    if (trsfiles == null) {
        println ("No files in "+srcDir.getAbsolutePath())
        return false
    }
    Iterator<File> candidates = trsfiles.iterator()
    while (candidates.hasNext()) {
        File candidate = candidates.next()
        boolean keep = candidate.getName().endsWith(ext) && candidate.canRead() && !candidate.isHidden()
        if (!keep) {
            candidates.remove()
        }
    }

    if (trsfiles.isEmpty()) {
        println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
        return false
    }

    // ---- 2. IMPORTER
    if (MONITOR != null) {
        MONITOR.worked(1, "IMPORTER")
        if (MONITOR.isCanceled()) {
            return MONITOR.done()
        }
    }
    println "-- IMPORTER"
    def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) // results go to the txm folder of binDir
    if (!imp.run()) {
        println "Failed to prepare files - Aborting"
        return
    }
    if (MONITOR != null) {
        MONITOR.worked(20)
    }

    // ---- 3. XML validation: files that do not validate are deleted
    println "-- Xml Validation"
    if (MONITOR != null && MONITOR.isCanceled()) {
        return MONITOR.done()
    }
    for (File producedFile : txmDir.listFiles()) {
        if (ValidateXml.test(producedFile)) {
            continue
        }
        println "$producedFile : Validation failed"
        producedFile.delete()
    }

    // ---- 4. interviewer removal
    if (MONITOR != null) {
        MONITOR.worked(5)
        if (MONITOR.isCanceled()) {
            return MONITOR.done()
        }
    }
    println "-- Remove interviewer: "+removeInterviewer
    if (removeInterviewer && metadatas == null) {
        println "Can't remove interviewer without a metadata.csv file defining who are the interviewers."
    } else if (removeInterviewer) {
        println "Removing some speakers in "+txmDir.listFiles().length+" file(s)"
        for (File speakerFile : txmDir.listFiles()) {
            // the metadata are looked up with the file name cut at ".xml"
            String textId = speakerFile.getName()
            int xmlPos = textId.indexOf(".xml")
            if (xmlPos > 0) {
                textId = textId.substring(0, xmlPos)
            }

            ArrayList<Pair<String, String>> metas = metadatas.get(textId)
            for (Pair meta : metas) {
                // metadata whose name starts with "enq" designate the speakers to remove
                if (meta.getFirst().startsWith("enq")) {
                    new RemoveSpeaker(speakerFile, speakerFile, meta.getFirst())
                }
            }
        }
    }

    // ---- 5. annotation
    if (MONITOR != null) {
        if (MONITOR.isCanceled()) {
            return MONITOR.done()
        }
        MONITOR.worked(20, "ANNOTATE")
    }

    boolean annotationSuccess = false
    if (annotate) {
        println "-- ANNOTATE - Running NLP tools"
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
        def annotationOptions = ["lang":model]
        if (engine.processDirectory(txmDir, binDir, annotationOptions)) {
            annotationSuccess = true
        }
    }

    // ---- 6. compilation
    if (MONITOR != null) {
        if (MONITOR.isCanceled()) {
            return MONITOR.done()
        }
        MONITOR.worked(25, "COMPILING")
    }
    println "--COMPILING - Building Search Engine indexes"
    trsfiles = txmDir.listFiles() // from now on: the files found in txmDir

    def comp = new compiler()
    if (debug) {
        comp.setDebug()
    }
    comp.removeInterviewers(removeInterviewer)
    comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata)
    comp.setAnnotationSucces(annotationSuccess)
    boolean compiled = comp.run(project, trsfiles, corpusname, "default", binDir)
    if (!compiled) {
        println "Failed to compile files"
        return
    }

    if (MONITOR != null && MONITOR.isCanceled()) {
        return MONITOR.done()
    }

    // ---- 7. edition: the HTML directory is always emptied and recreated
    File htmlDir = new File(binDir, "HTML/" + corpusname)
    htmlDir.deleteDir()
    htmlDir.mkdirs()
    if (build_edition) {

        if (MONITOR != null) {
            MONITOR.worked(20, "EDITION")
        }
        println "-- EDITION - Building editions"

        List<File> filelist = txmDir.listFiles()
        Collections.sort(filelist)

        println "Paginating "+filelist.size()+" texts"
        ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size())
        for (File txmFile : filelist) {
            cpb.tick()

            // text name = file name without its last extension
            String txtname = txmFile.getName()
            int dotPos = txtname.lastIndexOf(".")
            if (dotPos > 0) {
                txtname = txtname.substring(0, dotPos)
            }

            List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang)
            List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang)

            Text t = new Text(project)
            t.setName(txtname)
            t.setSourceFile(txmFile)
            t.setTXMFile(txmFile)

            def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas)
            Edition edition = new Edition(t)
            edition.setName("default")
            edition.setIndex(htmlDir.getAbsolutePath())

            // one page per page file, numbered from 1; "w_0" is used when
            // the pager has no word id for that page
            for (int pageIndex = 0 ; pageIndex < ed.getPageFiles().size() ; pageIndex++) {
                File pageFile = ed.getPageFiles().get(pageIndex) // value itself is not used
                String wordid = "w_0"
                if (pageIndex < ed.getIdx().size()) {
                    wordid = ed.getIdx().get(pageIndex)
                }
                edition.addPage("" + (pageIndex + 1), wordid)
            }
        }
        cpb.done()

        // copy transcriber.css to the HTML directory and two of its sub-paths
        File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css")
        if (cssfile.exists() && htmlDir.exists()) {
            for (String cssTarget : ["transcriber.css", "onepage/transcriber.css", "default/transcriber.css"]) {
                FileCopy.copy(cssfile, new File(htmlDir, cssTarget))
            }
        }

        // copy media files: for each text, the first existing file among
        // <name>.mp3, .wav, .mp4, .avi of the project source directory
        println "Copying media files if any (mp3, wav, mp4 or avi) "+filelist.size()+" texts"
        cpb = new ConsoleProgressBar(filelist.size())
        for (File txmFile : filelist) {
            cpb.tick()

            String txtname = txmFile.getName()
            int dotPos = txtname.lastIndexOf(".")
            if (dotPos > 0) {
                txtname = txtname.substring(0, dotPos)
            }

            File mediaFile = null
            for (String mediaExt : [".mp3", ".wav", ".mp4", ".avi"]) {
                mediaFile = new File(project.getSrcdir(), txtname + mediaExt)
                if (mediaFile.exists()) {
                    break
                }
            }

            if (mediaFile.exists()) {
                File copy = new File(binDir, "media/"+mediaFile.getName())
                copy.getParentFile().mkdirs()
                FileCopy.copy(mediaFile, copy)
            }
        }
        cpb.done()
    }
} catch (Exception e) {
    org.txm.utils.logger.Log.printStackTrace(e)
}
322

    
323
// Final step: unless the monitor was cancelled, save the project; the result
// of save() is stored in the undeclared script variable 'readyToLoad'.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) {
        return MONITOR.done()
    }
    MONITOR.worked(20, "FINALIZING")
}
readyToLoad = project.save()