Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / transcriberLoader.groovy @ 2369

History | View | Annotate | Download (12.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.importer.transcriber;
45

    
46
import java.io.File;
47
import org.txm.importer.*
48
import org.txm.importer.scripts.xmltxm.*;
49
import org.txm.*;
50
import org.txm.core.engines.*;
51
import org.txm.objects.*;
52
import org.txm.utils.i18n.*;
53
import org.txm.utils.*;
54
import org.txm.scripts.importer.*;
55
import org.txm.metadatas.*;
56
import org.txm.utils.io.FileCopy;
57
import org.w3c.dom.Element
58
import org.txm.utils.xml.DomUtils;
59

    
60
// PARAMETERS (default values; the "import.properties" handling further down may override some of them)
// removeInterviewer: when true, the speech of the speakers declared as interviewers in the metadata file is ignored
boolean removeInterviewer = false
boolean includeComments = false
boolean ignoreTranscriberMetadata = false
int csvHeaderNumber = 1
int maxlines = 200

String userDir = System.getProperty("user.home")

def MONITOR
Project project

// 'projectBinding' and 'monitor' are read from the script binding.
// If one of them is missing the lookup throws; the exception is deliberately
// ignored and the corresponding variable keeps its null value.
try {
        project = projectBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // nothing to do: the null check below handles the missing project
}
if (project == null) {
        println "no project set. Aborting"
        return
}
75

    
76
// Import settings, all read from the project
String corpusname = project.getName()
String basename = corpusname
String rootDir = project.getSrcdir()
String lang = project.getLang()
String model = lang // the annotation model name is the project language
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()
// settings of the "default" edition
int wordsPerPage = project.getEditionDefinition("default").getWordsPerPage()
String page_element = project.getEditionDefinition("default").getPageElement()
boolean build_edition = project.getEditionDefinition("default").getBuildEdition()
boolean update = project.getDoUpdate()

// Source directory and binary (project) directory
File srcDir = new File(rootDir)
File binDir = project.getProjectDirectory()
binDir.mkdirs()
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return
}

// Directory receiving the XML-TXM files; it is emptied first unless this is an update
File txmDir = new File(binDir, "txm/" + corpusname)
if (!update) {
        txmDir.deleteDir()
}
txmDir.mkdirs()
101

    
102
// Read the text metadata from the metadata file found in the source directory
Metadatas metadatas // text metadata; stays null when there is no metadata file
File allMetadataFile = Metadatas.findMetadataFile(srcDir)
println "Trying to read metadata values from: "+allMetadataFile
if (!allMetadataFile.exists()) {
        println "no metadata file: "+allMetadataFile
} else {
        // the metadata are read from a copy stored in the binary directory
        File copy = new File(binDir, allMetadataFile.getName())
        if (!FileCopy.copy(allMetadataFile, copy)) {
                println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile()
                return
        }
        metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(),
                        Toolbox.getMetadataColumnSeparator(),
                        Toolbox.getMetadataTextSeparator(), 1)
}

// Maps an XML-TXM file name ("<text id>.xml") to the value of its "textorder" metadata;
// the keys are file names because the sort of the XML-TXM files looks them up by file name
final HashMap<String, String> textordersInfo = new HashMap<String, String>()
if (metadatas != null) {
        for (String textId : metadatas.keySet()) {
                for (org.txm.metadatas.Entry entry : metadatas.get(textId)) {
                        if (!"textorder".equals(entry.getId())) continue
                        textordersInfo.put("" + textId + ".xml", entry.value)
                }
        }
}
132
// Optional "import.properties" file of the source directory: when it exists and is
// readable, its entries override the default values of the import parameters.
File propertyFile = new File(srcDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;
if (propertyFile.exists() && propertyFile.canRead()) {
        // withInputStream closes the stream even when load() throws
        propertyFile.withInputStream { input -> props.load(input) }

        String value = props.getProperty("removeInterviewer")
        if (value != null) {
                removeInterviewer = Boolean.parseBoolean(value)
        }

        value = props.getProperty("ignoreTranscriberMetadata")
        if (value != null) {
                ignoreTranscriberMetadata = Boolean.parseBoolean(value)
        }

        value = props.getProperty("metadataList")
        if (value != null) {
                // split() takes a regular expression: an unescaped "|" is an empty
                // alternation and would cut the value between every character
                metadatasToKeep = value.split("\\|")
        }

        value = props.getProperty("csvHeaderNumber")
        if (value != null) {
                // the value is a number of header lines, not a list: parse it as an int
                // and keep the current value when it is not a valid number
                try {
                        csvHeaderNumber = Integer.parseInt(value.trim())
                } catch (NumberFormatException e) {
                        println "Warning: invalid csvHeaderNumber value '"+value+"' in "+propertyFile+", keeping "+csvHeaderNumber
                }
        }
        // note: includeComments is not read from the properties file

        println "import properties: "
        println " removeInterviewer: "+removeInterviewer
        println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata
        println " metadataToKeep: "+metadatasToKeep
        println " ignored csvHeaderSize: "+csvHeaderNumber
}
158

    
159

    
160

    
161

    
162

    
163
try {
164
        if (!update) {
165
                // XSL step: when a front XSL stylesheet is set, apply it to the sources
                if (MONITOR != null) {
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                        MONITOR.worked(1, "XSL")
                }
                boolean hasFrontXsl = xsl != null && xsl.trim().length() > 0
                if (hasFrontXsl) {
                        File xslOutputDir = new File(binDir, "src")
                        // when the processing succeeds, the following steps read the files of the output directory
                        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir)) {
                                srcDir = xslOutputDir
                        }
                        println ""
                }
173

    
174
                // Select the transcription files: readable, non-hidden files whose name ends with "trs"
                String ext = "trs";
                ArrayList<File> trsfiles = srcDir.listFiles(); // all the files of the source directory
                if (trsfiles == null) {
                        println ("No files in "+srcDir.getAbsolutePath())
                        return false;
                }
                Iterator<File> candidates = trsfiles.iterator()
                while (candidates.hasNext()) {
                        File candidate = candidates.next()
                        boolean usable = candidate.getName().endsWith(ext) && candidate.canRead() && !candidate.isHidden()
                        if (!usable) {
                                candidates.remove()
                        }
                }

                if (trsfiles.isEmpty()) {
                        println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
                        return false;
                }
193

    
194
                // IMPORTER step: run the importer on the selected transcription files
                if (MONITOR != null) {
                        MONITOR.worked(1, "IMPORTER")
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                }
                println "-- IMPORTER"
                def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir
                boolean prepared = imp.run()
                if (!prepared) {
                        println "Failed to prepare files - Aborting";
                        return;
                }
                if (MONITOR != null) MONITOR.worked(20)

                // Validation step: every file of txmDir that fails the XML test is deleted
                println "-- Xml Validation"
                if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
                for (File candidate : txmDir.listFiles()) {
                        if (ValidateXml.test(candidate)) continue
                        println "$candidate : Validation failed";
                        candidate.delete();
                }
212

    
213
                if (MONITOR != null) {
                        MONITOR.worked(5)
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                }
                // Interviewer removal step: only done when requested and when metadata are available
                println "-- Remove interviewer: "+removeInterviewer
                if (removeInterviewer && metadatas == null) {
                        println "Can't remove interviewer without a metadata.csv file defining who are the interviewers."
                } else if (removeInterviewer) {
                        println "Removing some speakers in "+txmDir.listFiles().length+" file(s)"
                        for (File txmFile : txmDir.listFiles()) {
                                // the metadata are looked up with the file name cut at ".xml"
                                String textId = txmFile.getName()
                                int extIdx = textId.indexOf(".xml")
                                if (extIdx > 0) {
                                        textId = textId.substring(0, extIdx)
                                }

                                ArrayList<Pair<String, String>> metas = metadatas.get(textId)
                                for (Pair p : metas) {
                                        // one RemoveSpeaker (rewriting the file in place) per metadata whose name starts with "enq"
                                        def metaName = p.getFirst()
                                        if (metaName.startsWith("enq")) {
                                                new RemoveSpeaker(txmFile, txmFile, metaName)
                                        }
                                }
                        }
                }
237

    
238
                if (MONITOR != null) {
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                        MONITOR.worked(20, "ANNOTATE")
                }

                // ANNOTATE step: only when annotation is enabled for the project
                boolean annotationSuccess = false
                if (annotate) {
                        println "-- ANNOTATE - Running NLP tools"
                        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
                        // the engine processes the files of txmDir with the project language as model
                        boolean processed = engine.processDirectory(txmDir, binDir, ["lang": model])
                        if (processed) {
                                annotationSuccess = true
                        }
                }
249
        } // end of importer and annotate steps
250
        
251
        // Collect the files of txmDir and sort them: by "textorder" metadata when that
        // property exists, by natural File order otherwise
        xmltxmFiles = new ArrayList<File>(Arrays.asList(txmDir.listFiles()));
        boolean hasTextOrder = metadatas != null && metadatas.getPropertyNames().contains("textorder")
        if (!hasTextOrder) {
                Collections.sort(xmltxmFiles);
        } else {
                Collections.sort(xmltxmFiles, new Comparator<File>() {
                        public int compare(File f1, File f2) {
                                String o1 = textordersInfo.get(f1.getName());
                                String o2 = textordersInfo.get(f2.getName());
                                // files without a text order are placed after the ones that have one
                                if (o1 == null) {
                                        return (o2 == null) ? f1.compareTo(f2) : 1;
                                }
                                if (o2 == null) {
                                        return -1;
                                }
                                // text orders are compared as strings; equal orders fall back to the File order
                                int c = o1.compareTo(o2);
                                return (c != 0) ? c : f1.compareTo(f2);
                        }
                });
        }
273

    
274
        if (MONITOR != null) {
                if (MONITOR.isCanceled()) { return MONITOR.done(); }
                MONITOR.worked(25, "COMPILING")
        }
        // COMPILING step: run the compiler on the sorted XML-TXM files
        println "--COMPILING - Building Search Engine indexes"

        def comp = new compiler()
        if (debug) {
                comp.setDebug()
        }
        comp.removeInterviewers(removeInterviewer)
        comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata)
        boolean compiled = comp.run(project, xmltxmFiles, corpusname, "default", binDir)
        if (!compiled) {
                println "Failed to compile files";
                return;
        }

        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

        // The HTML directory of the corpus is always recreated empty
        File htmlDir = new File(binDir, "HTML/" + corpusname)
        htmlDir.deleteDir()
        htmlDir.mkdirs()
292
        if (build_edition) {
293

    
294
                // EDITION step: build the pages of the "default" edition of every text
                if (MONITOR != null) MONITOR.worked(20, "EDITION")
                println "-- EDITION - Building editions"

                def second = 0 // not used in this step

                println "Paginating "+xmltxmFiles.size()+" texts"
                ConsoleProgressBar cpb = new ConsoleProgressBar(xmltxmFiles.size());
                for (File txmFile : xmltxmFiles) {
                        cpb.tick()
                        // the text name is the file name without its last extension
                        String txtname = txmFile.getName()
                        int dot = txtname.lastIndexOf(".")
                        if (dot > 0) {
                                txtname = txtname.substring(0, dot)
                        }

                        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang)
                        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang)

                        // reuse the Text of the project when it exists, create it otherwise
                        Text t = project.getText(txtname)
                        if (t == null) {
                                t = new Text(project)
                                t.setName(txtname)
                        }
                        t.setSourceFile(txmFile)
                        t.setTXMFile(txmFile)

                        // an existing "default" edition is deleted before the pages are rebuilt
                        Edition edition = t.getEdition("default")
                        if (edition != null) {
                                edition.delete()
                                edition = null
                        }
                        def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas);
                        edition = t.getEdition("default") // result replaced right below
                        edition = new Edition(t)
                        edition.setName("default")

                        edition.setIndex(htmlDir.getAbsolutePath())
                        // declare one page per page file: page names start at "1" and the
                        // word id is "w_0" when the pager provides no index for the page
                        for (int page = 0 ; page < ed.getPageFiles().size() ; page++) {
                                File f = ed.getPageFiles().get(page) // not used
                                String wordid = (page < ed.getIdx().size()) ? ed.getIdx().get(page) : "w_0"
                                edition.addPage("" + (page + 1), wordid)
                        }
                }
                cpb.done()
337

    
338
                // Copy the stylesheets of the "css" directory of the TXM home into the HTML directory.
                // Nothing is copied when transcriber.css or the HTML directory is missing.
                File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css")
                File cssTXMFile = new File(Toolbox.getTxmHomePath(), "css/txm.css")
                if (cssfile.exists() && htmlDir.exists()) {
                        FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"));
                        // txm.css was previously filled with the content of transcriber.css while
                        // cssTXMFile was never used; the real txm.css is now copied when it exists,
                        // transcriber.css remains the fallback so the target file is still created
                        File txmCssSource = cssTXMFile.exists() ? cssTXMFile : cssfile
                        FileCopy.copy(txmCssSource, new File(htmlDir, "default/txm.css"));
                        FileCopy.copy(cssfile, new File(htmlDir, "default/transcriber.css"));
                }
346

    
347
                // Copy the media file of each text, if any, into the "media" directory of binDir
                println "Copying media files if any (mp3, wav, mp4 or avi) "+xmltxmFiles.size()+" texts"
                cpb = new ConsoleProgressBar(xmltxmFiles.size());
                for (File txmFile : xmltxmFiles) {
                        cpb.tick()
                        // the text name is the file name without its last extension
                        String txtname = txmFile.getName()
                        int dot = txtname.lastIndexOf(".")
                        if (dot > 0) {
                                txtname = txtname.substring(0, dot)
                        }

                        // the candidates are tried in this order and the first existing one is kept
                        File mediaFile = null
                        for (String mediaExt : [".mp3", ".wav", ".mp4", ".avi"]) {
                                mediaFile = new File(project.getSrcdir(), txtname + mediaExt)
                                if (mediaFile.exists()) break
                        }

                        if (mediaFile.exists()) {
                                File copy = new File(binDir, "media/"+mediaFile.getName())
                                copy.getParentFile().mkdirs()
                                FileCopy.copy(mediaFile, copy);
                        }
                }
                cpb.done()
367
        }
368
}
369
catch (Exception e){org.txm.utils.logger.Log.printStackTrace(e);}
370

    
371
// FINALIZING step: save the project; the result of the save is exposed as 'readyToLoad'
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "FINALIZING")
}
readyToLoad = project.save();