root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / transcriberLoader.groovy @ 1000
History | View | Annotate | Download (11.4 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | |
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | // This file is part of the TXM platform.
|
24 | 321 | mdecorde | //
|
25 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it and/or modify
|
26 | 321 | mdecorde | // it under the terms of the GNU General Public License as published by
|
27 | 321 | mdecorde | // the Free Software Foundation, either version 3 of the License, or
|
28 | 321 | mdecorde | // (at your option) any later version.
|
29 | 321 | mdecorde | //
|
30 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be useful,
|
31 | 321 | mdecorde | // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 | 321 | mdecorde | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 | 321 | mdecorde | // GNU General Public License for more details.
|
34 | 321 | mdecorde | //
|
35 | 321 | mdecorde | // You should have received a copy of the GNU General Public License
|
36 | 321 | mdecorde | // along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 | 321 | mdecorde | //
|
38 | 321 | mdecorde | //
|
39 | 321 | mdecorde | //
|
40 | 321 | mdecorde | // $LastChangedDate:$
|
41 | 321 | mdecorde | // $LastChangedRevision:$
|
42 | 321 | mdecorde | // $LastChangedBy:$
|
43 | 321 | mdecorde | //
|
44 | 986 | mdecorde | package org.txm.scripts.importer.transcriber;
|
45 | 321 | mdecorde | |
46 | 1000 | mdecorde | import java.io.File; |
47 | 1000 | mdecorde | import org.txm.importer.* |
48 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.*; |
49 | 927 | mdecorde | import org.txm.*; |
50 | 927 | mdecorde | import org.txm.core.engines.*; |
51 | 321 | mdecorde | import org.txm.objects.*; |
52 | 321 | mdecorde | import org.txm.utils.i18n.*; |
53 | 321 | mdecorde | import org.txm.utils.*; |
54 | 986 | mdecorde | import org.txm.scripts.importer.*; |
55 | 321 | mdecorde | import org.txm.metadatas.*; |
56 | 479 | mdecorde | import org.txm.utils.io.FileCopy; |
57 | 321 | mdecorde | import org.w3c.dom.Element |
58 | 479 | mdecorde | import org.txm.utils.xml.DomUtils; |
59 | 321 | mdecorde | |
// PARAMETERS
// Default import options. removeInterviewer, ignoreTranscriberMetadata and
// csvHeaderNumber may be overridden later by an "import.properties" file.
boolean removeInterviewer = false; // if true, the interviewer speakers declared in the metadata file (names starting with "enq") are removed
boolean includeComments = false; // not read anywhere in the visible part of this script
boolean ignoreTranscriberMetadata = false;
int csvHeaderNumber = 1;
int maxlines = 200; // not read anywhere in the visible part of this script

String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
	// "paramsBinding" and "monitor" are not declared in this script:
	// they are presumably supplied by the caller through the script binding
	params = paramsBinding;
	MONITOR = monitor
} catch (Exception ignored) {
	// an exception here means the script is run standalone, for debugging
	println "DEV MODE";
	debug = true
	params = new BaseParameters(new File(userDir, "xml/anapovoas/import.xml"))
	params.load()
	if (!org.txm.Toolbox.isInitialized()) {
		// minimal Toolbox configuration for a standalone run
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, "\t");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
	}
}
if (params == null) {
	println "no parameters. Aborting";
	return;
}
88 | 321 | mdecorde | |
// Import settings read from the parameters (import.xml)
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// settings stored as attributes of the corpus element
String lang = corpusElem.getAttribute("lang");
String model = lang // the language code is also used as the annotation model name
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// optional XSL stylesheet to apply to the sources, and its parameters
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
String page_element = params.getPageElement("default")
boolean build_edition = params.getDoEdition("default")
102 | 321 | mdecorde | |
// Source directory, and binary corpus directory rebuilt from scratch
File srcDir = new File(rootDir);
def txmHome = Toolbox.getTxmHomePath()
File binDir = new File(txmHome, "corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}

// directory receiving the XML-TXM files of the corpus
File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
115 | 321 | mdecorde | |
// Get the text metadata values from the metadata file of the source directory.
// The file is copied into binDir and the copy is the one that is read.
Metadatas metadatas; // text metadata, stays null when there is no metadata file
File allMetadataFile = Metadatas.findMetadataFile(srcDir);
println "Trying to read metadatas values from: "+allMetadataFile
if (!allMetadataFile.exists()) {
	println "no metadata file: "+allMetadataFile
} else {
	File copy = new File(binDir, allMetadataFile.getName())
	boolean copied = FileCopy.copy(allMetadataFile, copy)
	if (!copied) {
		println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile();
		return;
	}
	// NOTE: the header line count is the literal 1 here, not csvHeaderNumber
	metadatas = new Metadatas(copy,
			Toolbox.getMetadataEncoding(),
			Toolbox.getMetadataColumnSeparator(),
			Toolbox.getMetadataTextSeparator(),
			1)
}
132 | 321 | mdecorde | |
// Read the optional "import.properties" file of the source directory.
// Recognized keys: removeInterviewer, ignoreTranscriberMetadata,
// metadataList (values separated by '|') and csvHeaderNumber (integer).
File propertyFile = new File(srcDir, "import.properties") // default location
Properties props = new Properties();
String[] metadatasToKeep;
if (propertyFile.exists() && propertyFile.canRead()) {
	// withInputStream closes the stream even when load() throws
	propertyFile.withInputStream { input -> props.load(input) }

	String removeInterviewerValue = props.getProperty("removeInterviewer")
	if (removeInterviewerValue != null)
		removeInterviewer = Boolean.parseBoolean(removeInterviewerValue);

	String ignoreTranscriberMetadataValue = props.getProperty("ignoreTranscriberMetadata")
	if (ignoreTranscriberMetadataValue != null)
		ignoreTranscriberMetadata = Boolean.parseBoolean(ignoreTranscriberMetadataValue);

	String metadataListValue = props.getProperty("metadataList")
	if (metadataListValue != null) {
		// String.split() takes a regular expression: an unescaped "|" is an
		// empty alternation and would split between every character
		metadatasToKeep = metadataListValue.split("\\|");
	}

	String csvHeaderNumberValue = props.getProperty("csvHeaderNumber")
	if (csvHeaderNumberValue != null) {
		// csvHeaderNumber is an int: parse the value, keep the current one if it is malformed
		try {
			csvHeaderNumber = Integer.parseInt(csvHeaderNumberValue.trim());
		} catch (NumberFormatException e) {
			println "Warning: csvHeaderNumber is not an integer: "+csvHeaderNumberValue
		}
	}
	//if (props.getProperty("includeComments") != null)
	// includeComments = props.get("includeComments").toString();

	println "import properties: "
	println " removeInterviewer: "+removeInterviewer
	println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata
	println " metadataToKeep: "+metadatasToKeep
	println " ignored csvHeaderSize: "+csvHeaderNumber
	//println " includeComments: "+includeComments
}
159 | 321 | mdecorde | |
// Apply the optional XSL stylesheet to the sources
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(1, "XSL")
}
if (xsl != null && !xsl.trim().isEmpty()) {
	// on success the transformed files in binDir/src become the new sources
	boolean transformed = ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"))
	if (transformed) {
		srcDir = new File(binDir, "src");
	}
	println ""
}
168 | 321 | mdecorde | |
169 | 321 | mdecorde | try {
|
// Select only the readable, non-hidden Transcriber files (*.trs) of the source directory.
// The extension includes the dot so that a name merely ending with the
// letters "trs" (e.g. "footrs") is not taken for a transcription.
String ext = ".trs";
File[] candidates = srcDir.listFiles();
if (candidates == null) { // listFiles() returned null: nothing can be listed in srcDir
	println ("No files in "+srcDir.getAbsolutePath())
	return false;
}
ArrayList<File> trsfiles = new ArrayList<File>();
for (File candidate : candidates) {
	if (candidate.getName().endsWith(ext) && candidate.canRead() && !candidate.isHidden()) {
		trsfiles.add(candidate);
	}
}

if (trsfiles.size() == 0) {
	println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
	return false;
}
189 | 321 | mdecorde | |
// IMPORTER step: the results are written in the txm folder of binDir
if (MONITOR != null) {
	MONITOR.worked(1, "IMPORTER")
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
}
println "-- IMPORTER"
def transcriptionImporter = new importer(trsfiles, binDir, txmDir, metadatas, lang)
boolean prepared = transcriptionImporter.run()
if (!prepared) {
	println "Failed to prepare files - Aborting";
	return;
}
if (MONITOR != null) MONITOR.worked(20)
199 | 321 | mdecorde | |
// XML validation step: files that fail the validation are deleted
println "-- Xml Validation"
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
for (File candidate : txmDir.listFiles()) {
	boolean valid = ValidateXml.test(candidate)
	if (valid) continue;
	println "$candidate : Validation failed";
	candidate.delete();
}
208 | 321 | mdecorde | |
// Optional step: remove the interviewer speakers from the XML-TXM files
if (MONITOR != null) {
	MONITOR.worked(5)
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
}
println "-- Remove interviewer: "+removeInterviewer
if (removeInterviewer && metadatas == null) {
	println "Can't remove interviewer without a metadata.csv file defining who are the interviewers."
} else if (removeInterviewer) {
	File[] txmFiles = txmDir.listFiles();
	println "Removing some speakers in "+txmFiles.length+" file(s)"
	for (File txmFile : txmFiles) {
		// the metadata of a text are looked up with the file name cut at ".xml"
		String textId = txmFile.getName();
		int extPos = textId.indexOf(".xml");
		if (extPos > 0) {
			textId = textId.substring(0, extPos);
		}

		ArrayList<Pair<String, String>> textMetas = metadatas.get(textId)
		for (Pair meta : textMetas) {
			// metadata whose name starts with "enq" designate interviewers
			// NOTE(review): the metadata name (first element of the pair) is what is given to RemoveSpeaker - confirm
			if (!meta.getFirst().startsWith("enq")) continue;
			new RemoveSpeaker(txmFile, txmFile, meta.getFirst()) // the file is rewritten in place
		}
	}
}
233 | 321 | mdecorde | |
// ANNOTATE step: run the TreeTagger annotation engine when requested
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"
boolean annotationSuccess = false; // stays false when no annotation is requested
if (annotate) {
	def treeTagger = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
	def taggerParameters = ["lang":model]
	annotationSuccess = treeTagger.processDirectory(txmDir, binDir, taggerParameters) ? true : false
}
244 | 321 | mdecorde | |
// COMPILING step: build the search engine indexes from the XML-TXM files
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(25, "COMPILING")
}
println "--COMPILING - Building Search Engine indexes"
trsfiles = txmDir.listFiles(); // from now on the list holds the files of txmDir

def comp = new compiler()
if (debug) {
	comp.setDebug();
}
comp.removeInterviewers(removeInterviewer);
comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata);
comp.setAnnotationSucces(annotationSuccess)
boolean compiled = comp.run(trsfiles, corpusname, "default", binDir)
if (!compiled) {
	println "Failed to compile files";
	return;
}
259 | 321 | mdecorde | |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML directory of the corpus is recreated even when no edition is built
File htmlDir = new File(binDir, "HTML/$corpusname");
htmlDir.deleteDir()
htmlDir.mkdirs();
if (build_edition) {

	if (MONITOR != null) MONITOR.worked(20, "EDITION")
	println "-- EDITION - Building editions"

	// texts are paginated in the natural order of their files
	List<File> filelist = txmDir.listFiles();
	Collections.sort(filelist);
	def second = 0 // unused

	println "Paginating texts: "
	for (File txmFile : filelist) {
		print "."
		// text name = file name without its last extension
		String txtname = txmFile.getName();
		int dot = txtname.lastIndexOf(".");
		if (dot > 0) {
			txtname = txtname.substring(0, dot);
		}

		List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
		List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

		// declare the text in the import parameters, then build its HTML pages
		Element text = params.addText(corpusElem, txtname, txmFile);

		def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas);
		Element edition = params.addEdition(text, "default", htmlDir.getAbsolutePath(), "html");

		// one page declaration per page file; pages are numbered from 1
		int pageNumber = 0
		while (pageNumber < ed.getPageFiles().size()) {
			File f = ed.getPageFiles().get(pageNumber);
			String wordid = ed.getIdx().get(pageNumber);
			pageNumber++
			params.addPage(edition, ""+pageNumber, wordid);
		}

		// the "onepage" edition has a single page
		// NOTE(review): getIndexes() is used here while getIdx() is used above - confirm both exist on pager
		if (ed.getPageFiles().size() > 0) {
			Element editionBD = params.addEdition(text, "onepage", htmlDir.getAbsolutePath(), "html");
			params.addPage(editionBD, "1", ed.getIndexes().get(0));
		}
	}

	// copy transcriber.css next to the HTML pages of each edition
	File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css")
	if (cssfile.exists() && htmlDir.exists()) {
		for (String cssTarget : ["transcriber.css", "onepage/transcriber.css", "default/transcriber.css"]) {
			FileCopy.copy(cssfile, new File(htmlDir, cssTarget));
		}
	}
}
309 | 321 | mdecorde | } |
310 | 321 | mdecorde | catch (Exception e){org.txm.utils.logger.Log.printStackTrace(e);} |
311 | 321 | mdecorde | |
// FINALIZING step: save the import parameters in the binary corpus directory
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml");
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile);
// readyToLoad is not declared in the visible script: presumably a binding variable read by the caller
readyToLoad = true;