root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / transcriberLoader.groovy @ 2369
History | View | Annotate | Download (12.7 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
|
22 |
//
|
23 |
// This file is part of the TXM platform.
|
24 |
//
|
25 |
// The TXM platform is free software: you can redistribute it and/or modify
|
26 |
// it under the terms of the GNU General Public License as published by
|
27 |
// the Free Software Foundation, either version 3 of the License, or
|
28 |
// (at your option) any later version.
|
29 |
//
|
30 |
// The TXM platform is distributed in the hope that it will be useful,
|
31 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 |
// GNU General Public License for more details.
|
34 |
//
|
35 |
// You should have received a copy of the GNU General Public License
|
36 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 |
//
|
38 |
//
|
39 |
//
|
40 |
// $LastChangedDate:$
|
41 |
// $LastChangedRevision:$
|
42 |
// $LastChangedBy:$
|
43 |
//
|
44 |
package org.txm.scripts.importer.transcriber;
|
45 |
|
46 |
import java.io.File; |
47 |
import org.txm.importer.* |
48 |
import org.txm.importer.scripts.xmltxm.*; |
49 |
import org.txm.*; |
50 |
import org.txm.core.engines.*; |
51 |
import org.txm.objects.*; |
52 |
import org.txm.utils.i18n.*; |
53 |
import org.txm.utils.*; |
54 |
import org.txm.scripts.importer.*; |
55 |
import org.txm.metadatas.*; |
56 |
import org.txm.utils.io.FileCopy; |
57 |
import org.w3c.dom.Element |
58 |
import org.txm.utils.xml.DomUtils; |
59 |
|
60 |
//PARAMETERS
|
61 |
// Import options and their defaults.
boolean removeInterviewer = false // if true, the transcription of the interviewers declared in the metadata file is ignored
boolean includeComments = false
boolean ignoreTranscriberMetadata = false
int csvHeaderNumber = 1
int maxlines = 200

String userDir = System.getProperty("user.home")

// The project and the progress monitor are read from the variables
// `projectBinding` and `monitor`, which this script does not declare itself.
def MONITOR
Project project

try {
    project = projectBinding
    MONITOR = monitor
} catch (Exception ignored) {
    // A lookup that fails leaves the remaining variable(s) null;
    // a null project is rejected right below, a null MONITOR is tolerated everywhere.
}

if (project == null) {
    println "no project set. Aborting"
    return
}
75 |
|
76 |
// --- Settings read from the project ---
String corpusname = project.getName()
String basename = corpusname
String rootDir = project.getSrcdir()
String lang = project.getLang()
String model = lang
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()
boolean update = project.getDoUpdate()

// --- Settings of the "default" edition ---
def defaultEdition = project.getEditionDefinition("default")
int wordsPerPage = defaultEdition.getWordsPerPage()
String page_element = defaultEdition.getPageElement()
boolean build_edition = defaultEdition.getBuildEdition()

// --- Working directories ---
File srcDir = new File(rootDir)
File binDir = project.getProjectDirectory()
binDir.mkdirs()
if (!binDir.exists()) {
    println "Could not create binDir ${binDir}"
    return
}

// The XML-TXM directory is wiped unless the import runs in update mode.
File txmDir = new File(binDir, "txm/" + corpusname)
if (!update) {
    txmDir.deleteDir()
}
txmDir.mkdirs()
101 |
|
102 |
//get metadata values from CSV
|
103 |
// Text metadata table; stays null when the source directory has no metadata file.
Metadatas metadatas
File allMetadataFile = Metadatas.findMetadataFile(srcDir)
println "Trying to read metadata values from: ${allMetadataFile}"
if (!allMetadataFile.exists()) {
    println "no metadata file: ${allMetadataFile}"
} else {
    // The table is read from a copy placed in the binary directory, not from the source file.
    File metadataCopy = new File(binDir, allMetadataFile.getName())
    if (!FileCopy.copy(allMetadataFile, metadataCopy)) {
        println "Error: could not create a copy of metadata file ${allMetadataFile.getAbsoluteFile()}"
        return
    }
    metadatas = new Metadatas(
            metadataCopy,
            Toolbox.getMetadataEncoding(),
            Toolbox.getMetadataColumnSeparator(),
            Toolbox.getMetadataTextSeparator(),
            1)
}
119 |
|
120 |
// Maps "<text id>.xml" to the value of the "textorder" metadata of that text.
// The keys carry the ".xml" suffix because the sort step looks entries up by XML-TXM file name.
final HashMap<String, String> textordersInfo = new HashMap<String, String>()
if (metadatas != null) {
    for (String textId : metadatas.keySet()) {
        for (org.txm.metadatas.Entry entry : metadatas.get(textId)) {
            if (!"textorder".equals(entry.getId())) {
                continue
            }
            String xmlFileName = "" + textId + ".xml"
            textordersInfo.put(xmlFileName, entry.value)
        }
    }
}
132 |
// Optional overrides read from "import.properties" in the source directory.
// Recognised keys: removeInterviewer, ignoreTranscriberMetadata,
// metadataList ('|'-separated values) and csvHeaderNumber (integer).
File propertyFile = new File(srcDir, "import.properties") // default
Properties props = new Properties()
String[] metadatasToKeep
if (propertyFile.exists() && propertyFile.canRead()) {
    // withInputStream closes the stream even if load() throws.
    propertyFile.withInputStream { input -> props.load(input) }

    String removeInterviewerValue = props.getProperty("removeInterviewer")
    if (removeInterviewerValue != null) {
        removeInterviewer = Boolean.parseBoolean(removeInterviewerValue)
    }

    String ignoreTranscriberMetadataValue = props.getProperty("ignoreTranscriberMetadata")
    if (ignoreTranscriberMetadataValue != null) {
        ignoreTranscriberMetadata = Boolean.parseBoolean(ignoreTranscriberMetadataValue)
    }

    String metadataListValue = props.getProperty("metadataList")
    if (metadataListValue != null) {
        // String.split() takes a regular expression: a bare "|" is an empty
        // alternation and splits between every character, so the pipe is escaped.
        metadatasToKeep = metadataListValue.split("\\|")
    }

    String csvHeaderNumberValue = props.getProperty("csvHeaderNumber")
    if (csvHeaderNumberValue != null) {
        // csvHeaderNumber is an int: parse the value instead of assigning the
        // String[] returned by split(), which failed with a cast exception.
        try {
            csvHeaderNumber = Integer.parseInt(csvHeaderNumberValue.trim())
        } catch (NumberFormatException e) {
            println "Warning: csvHeaderNumber is not an integer: '" + csvHeaderNumberValue + "', keeping " + csvHeaderNumber
        }
    }
    // NOTE(review): the metadata table is built before this point with a literal
    // header count of 1, so csvHeaderNumber is only reported here - confirm intent.

    //if (props.getProperty("includeComments") != null)
    //	includeComments = props.get("includeComments").toString();

    println "import properties: "
    println " removeInterviewer: "+removeInterviewer
    println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata
    println " metadataToKeep: "+metadatasToKeep
    println " ignored csvHeaderSize: "+csvHeaderNumber
    //println " includeComments: "+includeComments
}
158 |
|
159 |
|
160 |
|
161 |
|
162 |
|
163 |
try {
|
164 |
if (!update) {
|
165 |
// Apply XSL
|
166 |
// Optional front XSL step: when the transformation succeeds, the rest of the
// import reads its sources from "<binDir>/src" instead of the original directory.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(1, "XSL")
}
if (xsl != null && !xsl.trim().isEmpty()) {
    File xslOutputDir = new File(binDir, "src")
    if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir)) {
        srcDir = xslOutputDir
    }
    println ""
}
173 |
|
174 |
// select only trs files
|
175 |
// Keep only the readable, non-hidden files of srcDir whose name ends with "trs".
String ext = "trs"
File[] sourceFiles = srcDir.listFiles()
if (sourceFiles == null) {
    println ("No files in "+srcDir.getAbsolutePath())
    return false
}

ArrayList<File> trsfiles = new ArrayList<File>()
for (File candidate : sourceFiles) {
    if (candidate.getName().endsWith(ext) && candidate.canRead() && !candidate.isHidden()) {
        trsfiles.add(candidate)
    }
}

if (trsfiles.isEmpty()) {
    println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
    return false
}
193 |
|
194 |
// IMPORTER step: runs the importer on the selected .trs files.
if (MONITOR != null) {
    MONITOR.worked(1, "IMPORTER")
    if (MONITOR.isCanceled()) { return MONITOR.done() }
}
println "-- IMPORTER"
def trsImporter = new importer(trsfiles, binDir, txmDir, metadatas, lang) // put result in the txm folder of binDir
boolean prepared = trsImporter.run()
if (!prepared) {
    println "Failed to prepare files - Aborting"
    return
}
MONITOR?.worked(20)
203 |
|
204 |
// Validation step: every file of txmDir that fails ValidateXml.test is reported and deleted.
println "-- Xml Validation"
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done() }
for (File candidate : txmDir.listFiles()) {
    if (ValidateXml.test(candidate)) {
        continue
    }
    println "${candidate} : Validation failed"
    candidate.delete()
}

MONITOR?.worked(5)
214 |
// Optional step: for each file of txmDir, run RemoveSpeaker (in place) for every
// metadata entry of that text whose first member starts with "enq".
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done() }
println "-- Remove interviewer: ${removeInterviewer}"
if (removeInterviewer && metadatas == null) {
    println "Can't remove interviewer without a metadata.csv file defining who are the interviewers."
} else if (removeInterviewer) {
    println "Removing some speakers in ${txmDir.listFiles().length} file(s)"
    for (File xmlFile : txmDir.listFiles()) {
        // The metadata key is the file name cut at its first ".xml".
        String textId = xmlFile.getName()
        int cut = textId.indexOf(".xml")
        if (cut > 0) {
            textId = textId.substring(0, cut)
        }

        // NOTE(review): elsewhere in this script the values of metadatas.get(...) are
        // iterated as org.txm.metadatas.Entry (getId()); here they are read as Pair
        // (getFirst()) - confirm that both views are valid for the same objects.
        ArrayList<Pair<String, String>> metas = metadatas.get(textId)
        for (Pair meta : metas) {
            def speakerId = meta.getFirst()
            if (speakerId.startsWith("enq")) {
                new RemoveSpeaker(xmlFile, xmlFile, speakerId)
            }
        }
    }
}
237 |
|
238 |
// ANNOTATE step: when annotation is enabled for the project, the "TreeTagger"
// annotation engine processes txmDir with the corpus language as model.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(20, "ANNOTATE")
}

boolean annotationSuccess = false
if (annotate) {
    println "-- ANNOTATE - Running NLP tools"
    def treeTagger = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
    Map annotationOptions = ["lang": model]
    annotationSuccess = treeTagger.processDirectory(txmDir, binDir, annotationOptions) as boolean
}
249 |
} // end of importer and annotate steps
|
250 |
|
251 |
// Collect the XML-TXM files and sort them: by their "textorder" metadata when that
// property exists, by natural File order otherwise.
// xmltxmFiles is deliberately left undeclared, as in the rest of the script.
xmltxmFiles = new ArrayList<File>(Arrays.asList(txmDir.listFiles()))
boolean sortByTextOrder = metadatas != null && metadatas.getPropertyNames().contains("textorder")
if (!sortByTextOrder) {
    Collections.sort(xmltxmFiles)
} else {
    Collections.sort(xmltxmFiles, new Comparator<File>() {
        // Files without a "textorder" value come last; ties fall back to File order.
        // NOTE(review): orders are compared as strings ("10" sorts before "2") - confirm.
        public int compare(File left, File right) {
            String leftOrder = textordersInfo.get(left.getName())
            String rightOrder = textordersInfo.get(right.getName())
            if (leftOrder == null) {
                return (rightOrder == null) ? left.compareTo(right) : 1
            }
            if (rightOrder == null) {
                return -1
            }
            int byOrder = leftOrder.compareTo(rightOrder)
            return (byOrder != 0) ? byOrder : left.compareTo(right)
        }
    })
}
273 |
|
274 |
// COMPILING step: runs the compiler on the sorted XML-TXM files.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(25, "COMPILING")
}
println "--COMPILING - Building Search Engine indexes"

// `debug` is not declared anywhere in this script. Resolve it defensively, like
// the project/monitor lookup at the top of the script, so that a missing variable
// means "no debug" instead of an exception that skips the compilation.
boolean debugEnabled = false
try {
    debugEnabled = debug
} catch (MissingPropertyException ignored) {
    // no `debug` variable available: keep the default
}

def comp = new compiler()
if (debugEnabled) {
    comp.setDebug()
}
comp.removeInterviewers(removeInterviewer)
comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata)
if (!comp.run(project, xmltxmFiles, corpusname, "default", binDir)) {
    println "Failed to compile files"
    return
}
286 |
|
287 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done() }

// The HTML directory of the corpus is recreated empty, whether or not an edition is built.
File htmlDir = new File(binDir, "HTML/$corpusname")
htmlDir.deleteDir()
htmlDir.mkdirs()

if (build_edition) {
    if (MONITOR != null) MONITOR.worked(20, "EDITION")
    println "-- EDITION - Building editions"

    // The spacing rules depend only on the corpus language: fetch them once, not per text.
    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang)
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang)

    println "Paginating "+xmltxmFiles.size()+" texts"
    ConsoleProgressBar cpb = new ConsoleProgressBar(xmltxmFiles.size())
    for (File txmFile : xmltxmFiles) {
        cpb.tick()

        // text name = file name without its last extension
        String txtname = txmFile.getName()
        int dot = txtname.lastIndexOf(".")
        if (dot > 0) txtname = txtname.substring(0, dot)

        // Reuse the Text already registered in the project, or create it.
        Text t = project.getText(txtname)
        if (t == null) {
            t = new Text(project)
            t.setName(txtname)
        }
        t.setSourceFile(txmFile)
        t.setTXMFile(txmFile)

        // Any previous "default" edition is deleted before a new one is created.
        Edition previousEdition = t.getEdition("default")
        if (previousEdition != null) {
            previousEdition.delete()
        }

        // presumably the pager generates the HTML pages of the text under htmlDir - confirm
        def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas)

        Edition edition = new Edition(t)
        edition.setName("default")
        edition.setIndex(htmlDir.getAbsolutePath())

        // Pages are numbered from 1; a page with no entry in the pager index gets "w_0".
        int pageCount = ed.getPageFiles().size()
        for (int page = 0; page < pageCount; page++) {
            String wordid = (page < ed.getIdx().size()) ? ed.getIdx().get(page) : "w_0"
            edition.addPage("" + (page + 1), wordid)
        }
    }
    cpb.done()

    // copy the style sheets
    File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css")
    File cssTXMFile = new File(Toolbox.getTxmHomePath(), "css/txm.css")
    if (cssfile.exists() && htmlDir.exists()) {
        FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"))
        // default/txm.css used to be a copy of transcriber.css while cssTXMFile was
        // never used: copy the real txm.css, falling back to the previous behaviour
        // when that file does not exist.
        File txmCssSource = cssTXMFile.exists() ? cssTXMFile : cssfile
        FileCopy.copy(txmCssSource, new File(htmlDir, "default/txm.css"))
        FileCopy.copy(cssfile, new File(htmlDir, "default/transcriber.css"))
    }

    // copy media files: the first existing "<text name>.<ext>" of the project
    // source directory goes to "<binDir>/media"
    println "Copying media files if any (mp3, wav, mp4 or avi) "+xmltxmFiles.size()+" texts"
    cpb = new ConsoleProgressBar(xmltxmFiles.size())
    for (File txmFile : xmltxmFiles) {
        cpb.tick()
        String txtname = txmFile.getName()
        int dot = txtname.lastIndexOf(".")
        if (dot > 0) txtname = txtname.substring(0, dot)

        File mediaFile = null
        for (String mediaExt : [".mp3", ".wav", ".mp4", ".avi"]) {
            File candidate = new File(project.getSrcdir(), txtname + mediaExt)
            if (candidate.exists()) {
                mediaFile = candidate
                break
            }
        }

        if (mediaFile != null) {
            File copy = new File(binDir, "media/" + mediaFile.getName())
            copy.getParentFile().mkdirs()
            FileCopy.copy(mediaFile, copy)
        }
    }
    cpb.done()
}
368 |
} |
369 |
catch (Exception e) {
    // Failures of the steps above are logged; the script then goes on to the final save.
    org.txm.utils.logger.Log.printStackTrace(e)
}

if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(20, "FINALIZING")
}
// readyToLoad is left undeclared on purpose, as in the original script.
readyToLoad = project.save()