Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / transcriberLoader.groovy @ 2554

History | View | Annotate | Download (12.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.scripts.importer.transcriber;
45

    
46
import java.io.File;
47
import org.txm.importer.*
48
import org.txm.importer.scripts.xmltxm.*;
49
import org.txm.*;
50
import org.txm.core.engines.*;
51
import org.txm.objects.*;
52
import org.txm.utils.i18n.*;
53
import org.txm.utils.*;
54
import org.txm.scripts.importer.*;
55
import org.txm.metadatas.*;
56
import org.txm.utils.io.FileCopy;
57
import org.w3c.dom.Element
58
import org.txm.utils.xml.DomUtils;
59

    
60
// PARAMETERS
// Default values; removeInterviewer, ignoreTranscriberMetadata and csvHeaderNumber
// may be overridden later by the import.properties file of the source directory.

// if true the transcription of speakers (en1 and enq2) defined in the metadata file will be ignored
boolean removeInterviewer = false
boolean includeComments = false
boolean ignoreTranscriberMetadata = false
int csvHeaderNumber = 1
int maxlines = 200

String userDir = System.getProperty("user.home")

def MONITOR
Project project

// 'projectBinding' and 'monitor' are not declared in this script: they are looked up
// as script variables. Any failure to resolve or assign them is deliberately ignored,
// leaving 'project' (and possibly 'MONITOR') null; the null project is handled just below.
try {
        project = projectBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // nothing to do: handled by the null check on 'project'
}

if (project == null) {
        println "no project set. Aborting"
        return
}
75

    
76
// Import settings read from the project.
String corpusname = project.getName()
String basename = corpusname // same value as the corpus name
String rootDir = project.getSrcdir()
String lang = project.getLang()
String model = lang // the annotation model is selected with the project language
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()

// Settings of the "default" edition.
def defaultEditionDefinition = project.getEditionDefinition("default")
int wordsPerPage = defaultEditionDefinition.getWordsPerPage()
String page_element = defaultEditionDefinition.getPageElement()
boolean build_edition = defaultEditionDefinition.getBuildEdition()

boolean update = project.getDoUpdate()
89

    
90
// Source directory and project (binary) directory.
File srcDir = new File(rootDir)
File binDir = project.getProjectDirectory()
binDir.mkdirs()
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return
}

// Directory receiving the XML-TXM files; it is wiped unless the import is an update.
File txmDir = new File(binDir, "txm/$corpusname")
if (!update) {
        txmDir.deleteDir()
}
txmDir.mkdirs()
101

    
102
// Read the text metadata from the CSV file located by Metadatas.findMetadataFile().
// 'metadatas' stays null when that file does not exist.
Metadatas metadatas // text metadata
File allMetadataFile = Metadatas.findMetadataFile(srcDir)
println "Trying to read metadata values from: "+allMetadataFile
if (!allMetadataFile.exists()) {
        println "no metadata file: "+allMetadataFile
} else {
        // The metadata are parsed from a copy stored in the project directory.
        File metadataCopy = new File(binDir, allMetadataFile.getName())
        boolean copied = FileCopy.copy(allMetadataFile, metadataCopy)
        if (!copied) {
                println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile()
                return
        }
        metadatas = new Metadatas(metadataCopy,
                        Toolbox.getMetadataEncoding(),
                        Toolbox.getMetadataColumnSeparator(),
                        Toolbox.getMetadataTextSeparator(),
                        1)
}
119

    
120
// Maps an XML-TXM file name ("<text id>.xml") to the "textorder" metadata value of
// that text; it is consulted when the XML-TXM files are sorted.
final HashMap<String, String> textordersInfo = new HashMap<String, String>()
if (metadatas != null) {
        for (String textId : metadatas.keySet()) {
                for (org.txm.metadatas.Entry entry : metadatas.get(textId)) {
                        if (!"textorder".equals(entry.getId())) {
                                continue
                        }
                        // the sort test will use the xml-txm file names
                        textordersInfo.put("" + textId + ".xml", entry.value)
                }
        }
}
132
// Optional overrides read from the "import.properties" file of the source directory.
// Recognized keys: removeInterviewer, ignoreTranscriberMetadata, metadataList
// ('|' separated values) and csvHeaderNumber (integer).
File propertyFile = new File(srcDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;
if (propertyFile.exists() && propertyFile.canRead()) {
        // withInputStream closes the stream even when load() throws
        propertyFile.withInputStream { input -> props.load(input) }

        String rawValue = props.getProperty("removeInterviewer")
        if (rawValue != null) {
                removeInterviewer = Boolean.parseBoolean(rawValue)
        }

        rawValue = props.getProperty("ignoreTranscriberMetadata")
        if (rawValue != null) {
                ignoreTranscriberMetadata = Boolean.parseBoolean(rawValue)
        }

        rawValue = props.getProperty("metadataList")
        if (rawValue != null) {
                // String.split() takes a regular expression: an unescaped "|" is an empty
                // alternation and would split the value between every character
                metadatasToKeep = rawValue.split("\\|")
        }

        rawValue = props.getProperty("csvHeaderNumber")
        if (rawValue != null) {
                // csvHeaderNumber is an int: parse the value instead of assigning a String[]
                try {
                        csvHeaderNumber = Integer.parseInt(rawValue.trim())
                } catch (NumberFormatException nfe) {
                        println "Warning: csvHeaderNumber is not an integer: '"+rawValue+"'. Keeping "+csvHeaderNumber
                }
        }
        //if (props.getProperty("includeComments") != null)
        //        includeComments = props.get("includeComments").toString();

        println "import properties: "
        println " removeInterviewer: "+removeInterviewer
        println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata
        println " metadataToKeep: "+metadatasToKeep
        println " ignored csvHeaderSize: "+csvHeaderNumber
        //println " includeComments: "+includeComments
}
158

    
159

    
160

    
161

    
162

    
163
try {
164
        if (!update) {
165
                // Front XSL step: when an XSL file is configured, the sources are processed
                // into <binDir>/src, which becomes the source directory if processing succeeds.
                if (MONITOR != null) {
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                        MONITOR.worked(1, "XSL")
                }
                if (xsl != null && xsl.trim().length() > 0) {
                        File xslOutputDir = new File(binDir, "src")
                        xslOutputDir.deleteDir() // delete previously output files
                        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir)) {
                                srcDir = xslOutputDir
                        }
                        println ""
                }
174

    
175
                // Keep only the readable, non-hidden files whose name ends with "trs".
                String ext = "trs"
                ArrayList<File> trsfiles = srcDir.listFiles() // all files of the source directory
                if (trsfiles == null) {
                        println ("No files in "+srcDir.getAbsolutePath())
                        return false
                }
                Iterator<File> candidates = trsfiles.iterator()
                while (candidates.hasNext()) {
                        File candidate = candidates.next()
                        boolean usable = candidate.getName().endsWith(ext) && candidate.canRead() && !candidate.isHidden()
                        if (!usable) {
                                candidates.remove()
                        }
                }

                if (trsfiles.isEmpty()) {
                        println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
                        return false
                }
194

    
195
                // IMPORTER step: runs the importer on the selected trs files.
                if (MONITOR != null) {
                        MONITOR.worked(1, "IMPORTER")
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                }
                println "-- IMPORTER"
                def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir
                boolean prepared = imp.run()
                if (!prepared) {
                        println "Failed to prepare files - Aborting"
                        return
                }
                if (MONITOR != null) {
                        MONITOR.worked(20)
                }

                // Validation step: files of txmDir failing ValidateXml.test() are deleted.
                println "-- Xml Validation"
                if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
                for (File infile : txmDir.listFiles()) {
                        if (ValidateXml.test(infile)) {
                                continue
                        }
                        println "$infile : Validation failed"
                        infile.delete()
                }
213

    
214
                // Optional step: remove speakers from the XML-TXM files. For each file, every
                // metadata pair whose first member starts with "enq" triggers a RemoveSpeaker
                // rewriting the file in place.
                if (MONITOR != null) {
                        MONITOR.worked(5)
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                }
                println "-- Remove interviewer: "+removeInterviewer
                if (removeInterviewer && metadatas == null) {
                        println "Can't remove interviewer without a metadata.csv file defining who are the interviewers."
                } else if (removeInterviewer) {
                        File[] producedFiles = txmDir.listFiles()
                        println "Removing some speakers in "+producedFiles.length+" file(s)"
                        for (File infile : producedFiles) {
                                // the metadata are looked up with the file name cut at ".xml"
                                String filename = infile.getName()
                                int xmlIdx = filename.indexOf(".xml")
                                if (xmlIdx > 0) {
                                        filename = filename.substring(0, xmlIdx)
                                }

                                ArrayList<Pair<String, String>> metas = metadatas.get(filename)
                                for (Pair p : metas) {
                                        if (!p.getFirst().startsWith("enq")) {
                                                continue
                                        }
                                        new RemoveSpeaker(infile, infile, p.getFirst())
                                }
                        }
                }
238

    
239
                // ANNOTATE step: when enabled, the "TreeTagger" annotation engine processes
                // the XML-TXM directory with the project language as "lang" parameter.
                if (MONITOR != null) {
                        if (MONITOR.isCanceled()) { return MONITOR.done(); }
                        MONITOR.worked(20, "ANNOTATE")
                }

                boolean annotationSuccess = false
                if (annotate) {
                        println "-- ANNOTATE - Running NLP tools"
                        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
                        annotationSuccess = engine.processDirectory(txmDir, binDir, ["lang":model]) ? true : false
                }
250
        } // end of importer and annotate steps
251
        
252
        // List the XML-TXM files and sort them. 'xmltxmFiles' is not declared in this
        // script, so it is assigned as a script variable.
        // When the metadata define a "textorder" property, files are ordered by the
        // (String) comparison of their textorder value, files without a value coming
        // last; ties and the no-textorder case fall back to File ordering.
        xmltxmFiles = new ArrayList<File>(Arrays.asList(txmDir.listFiles()))
        if (metadatas == null || !metadatas.getPropertyNames().contains("textorder")) {
                Collections.sort(xmltxmFiles)
        } else {
                Comparator<File> byTextOrder = { File f1, File f2 ->
                        String o1 = textordersInfo.get(f1.getName())
                        String o2 = textordersInfo.get(f2.getName())
                        if (o1 == null && o2 == null) return f1.compareTo(f2)
                        if (o1 == null) return 1
                        if (o2 == null) return -1
                        int c = o1.compareTo(o2)
                        return (c != 0) ? c : f1.compareTo(f2)
                } as Comparator<File>
                Collections.sort(xmltxmFiles, byTextOrder)
        }
274

    
275
        // COMPILING step: runs the compiler on the sorted XML-TXM files.
        if (MONITOR != null) {
                if (MONITOR.isCanceled()) { return MONITOR.done(); }
                MONITOR.worked(25, "COMPILING")
        }
        println "--COMPILING - Building Search Engine indexes"

        def corpusCompiler = new compiler()
        // 'debug' is not declared in this script; it is read as a script variable
        if (debug) {
                corpusCompiler.setDebug()
        }
        corpusCompiler.removeInterviewers(removeInterviewer)
        corpusCompiler.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata)
        boolean compiled = corpusCompiler.run(project, xmltxmFiles, corpusname, "default", binDir)
        if (!compiled) {
                println "Failed to compile files"
                return
        }

        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

        // The HTML directory of the corpus is always recreated empty.
        File htmlDir = new File(binDir, "HTML/$corpusname")
        htmlDir.deleteDir()
        htmlDir.mkdirs()
293
        if (build_edition) {
294

    
295
                // EDITION step: paginate each XML-TXM file and register a "default"
                // edition, with its pages, on the corresponding Text of the project.
                if (MONITOR != null) {
                        MONITOR.worked(20, "EDITION")
                }
                println "-- EDITION - Building editions"

                def second = 0

                println "Paginating "+xmltxmFiles.size()+" texts"
                ConsoleProgressBar cpb = new ConsoleProgressBar(xmltxmFiles.size())
                for (File txmFile : xmltxmFiles) {
                        cpb.tick()

                        // text name = file name without its last extension
                        String txtname = txmFile.getName()
                        int dot = txtname.lastIndexOf(".")
                        if (dot > 0) {
                                txtname = txtname.substring(0, dot)
                        }

                        List<String> noSpaceBefore = LangFormater.getNoSpaceBefore(lang)
                        List<String> noSpaceAfter = LangFormater.getNoSpaceAfter(lang)

                        // reuse the Text of the project or create it
                        Text t = project.getText(txtname)
                        if (t == null) {
                                t = new Text(project)
                                t.setName(txtname)
                        }
                        t.setSourceFile(txmFile)
                        t.setTXMFile(txmFile)

                        // drop the previous "default" edition, if any
                        Edition edition = t.getEdition("default")
                        if (edition != null) {
                                edition.delete()
                                edition = null
                        }

                        def ed = new pager(txmFile, htmlDir, txtname, noSpaceBefore, noSpaceAfter, wordsPerPage, basename, page_element, metadatas)
                        edition = t.getEdition("default") // value replaced right below
                        edition = new Edition(t)
                        edition.setName("default")
                        edition.setIndex(htmlDir.getAbsolutePath())

                        // one page per page file, numbered from 1; the word id comes from
                        // ed.getIdx() when available and is "w_0" otherwise
                        int pageCount = ed.getPageFiles().size()
                        for (int p = 0 ; p < pageCount ; p++) {
                                String wordid = "w_0"
                                if (p < ed.getIdx().size()) {
                                        wordid = ed.getIdx().get(p)
                                }
                                edition.addPage("" + (p + 1), wordid)
                        }
                }
                cpb.done()
338

    
339
                // Copy the stylesheets into the HTML directory of the corpus.
                File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css")
                File cssTXMFile = new File(Toolbox.getTxmHomePath(), "css/txm.css")
                if (cssfile.exists() && htmlDir.exists()) {
                        FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"));
                        // default/txm.css used to receive the content of transcriber.css while
                        // cssTXMFile was never used: copy the real txm.css when it exists and
                        // keep the previous behavior as a fallback otherwise.
                        // NOTE(review): confirm that txm.css is the intended source here.
                        File txmCssSource = cssTXMFile.exists() ? cssTXMFile : cssfile
                        FileCopy.copy(txmCssSource, new File(htmlDir, "default/txm.css"));
                        FileCopy.copy(cssfile, new File(htmlDir, "default/transcriber.css"));
                }
347

    
348
                // Copy the media file of each text, if any, into <binDir>/media. The media file
                // is searched in the project source directory as <text name> + .mp3, .wav, .mp4
                // or .avi, in that order; the first existing one is used.
                println "Copying media files if any (mp3, wav, mp4 or avi) "+xmltxmFiles.size()+" texts"
                cpb = new ConsoleProgressBar(xmltxmFiles.size())
                for (File txmFile : xmltxmFiles) {
                        cpb.tick()

                        // text name = file name without its last extension
                        String txtname = txmFile.getName()
                        int dot = txtname.lastIndexOf(".")
                        if (dot > 0) {
                                txtname = txtname.substring(0, dot)
                        }

                        File mediaFile = null
                        for (String mediaExt : [".mp3", ".wav", ".mp4", ".avi"]) {
                                mediaFile = new File(project.getSrcdir(), txtname + mediaExt)
                                if (mediaFile.exists()) {
                                        break
                                }
                        }

                        if (mediaFile.exists()) {
                                File copy = new File(binDir, "media/"+mediaFile.getName())
                                copy.getParentFile().mkdirs()
                                FileCopy.copy(mediaFile, copy)
                        }
                }
                cpb.done()
368
        }
369
}
370
catch (Exception e){org.txm.utils.logger.Log.printStackTrace(e);}
371

    
372
// FINALIZING step: save the project. 'readyToLoad' is not declared in this script,
// so the result of save() is assigned as a script variable.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "FINALIZING")
}
readyToLoad = project.save()