Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / transcriberLoader.groovy @ 1000

History | View | Annotate | Download (11.4 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
22 321 mdecorde
//
23 321 mdecorde
// This file is part of the TXM platform.
24 321 mdecorde
//
25 321 mdecorde
// The TXM platform is free software: you can redistribute it and/or modif y
26 321 mdecorde
// it under the terms of the GNU General Public License as published by
27 321 mdecorde
// the Free Software Foundation, either version 3 of the License, or
28 321 mdecorde
// (at your option) any later version.
29 321 mdecorde
//
30 321 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
31 321 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32 321 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 321 mdecorde
// GNU General Public License for more details.
34 321 mdecorde
//
35 321 mdecorde
// You should have received a copy of the GNU General Public License
36 321 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37 321 mdecorde
//
38 321 mdecorde
//
39 321 mdecorde
//
40 321 mdecorde
// $LastChangedDate:$
41 321 mdecorde
// $LastChangedRevision:$
42 321 mdecorde
// $LastChangedBy:$
43 321 mdecorde
//
44 986 mdecorde
package org.txm.scripts.importer.transcriber;
45 321 mdecorde
46 1000 mdecorde
import java.io.File;
47 1000 mdecorde
import org.txm.importer.*
48 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*;
49 927 mdecorde
import org.txm.*;
50 927 mdecorde
import org.txm.core.engines.*;
51 321 mdecorde
import org.txm.objects.*;
52 321 mdecorde
import org.txm.utils.i18n.*;
53 321 mdecorde
import org.txm.utils.*;
54 986 mdecorde
import org.txm.scripts.importer.*;
55 321 mdecorde
import org.txm.metadatas.*;
56 479 mdecorde
import org.txm.utils.io.FileCopy;
57 321 mdecorde
import org.w3c.dom.Element
58 479 mdecorde
import org.txm.utils.xml.DomUtils;
59 321 mdecorde
60 321 mdecorde
//PARAMETERS
61 321 mdecorde
boolean removeInterviewer = false;//if true the transcription of speakers (en1 and enq2) defined in metadatas file will be ignored
62 321 mdecorde
boolean includeComments = false;
63 321 mdecorde
boolean ignoreTranscriberMetadata = false;
64 321 mdecorde
int csvHeaderNumber = 1;
65 321 mdecorde
int maxlines = 200;
66 321 mdecorde
67 321 mdecorde
String userDir = System.getProperty("user.home");
68 321 mdecorde
69 321 mdecorde
def MONITOR;
70 321 mdecorde
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
71 321 mdecorde
BaseParameters params;
72 321 mdecorde
try {params = paramsBinding;MONITOR=monitor} catch (Exception)
73 321 mdecorde
{        println "DEV MODE";//exception means we debug
74 321 mdecorde
        debug = true
75 321 mdecorde
        params = new BaseParameters(new File(userDir, "xml/anapovoas/import.xml"))
76 321 mdecorde
        params.load()
77 321 mdecorde
        if (!org.txm.Toolbox.isInitialized()) {
78 321 mdecorde
79 321 mdecorde
                Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
80 321 mdecorde
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models"));
81 321 mdecorde
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
82 321 mdecorde
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, "\t");
83 321 mdecorde
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "");
84 321 mdecorde
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
85 321 mdecorde
        }
86 321 mdecorde
}
87 321 mdecorde
if (params == null) { println "no parameters. Aborting"; return; }
88 321 mdecorde
89 321 mdecorde
String corpusname = params.getCorpusName();
90 321 mdecorde
Element corpusElem = params.corpora.get(corpusname);
91 321 mdecorde
String basename = params.name;
92 321 mdecorde
String rootDir = params.rootDir;
93 321 mdecorde
String lang = corpusElem.getAttribute("lang");
94 321 mdecorde
String model = lang
95 321 mdecorde
String encoding = corpusElem.getAttribute("encoding");
96 321 mdecorde
boolean annotate = "true" == corpusElem.getAttribute("annotate");
97 321 mdecorde
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
98 321 mdecorde
def xslParams = params.getXsltParams(corpusElem);
99 321 mdecorde
int wordsPerPage = params.getWordsPerPage("default")
100 321 mdecorde
String page_element = params.getPageElement("default")
101 321 mdecorde
boolean build_edition = params.getDoEdition("default")
102 321 mdecorde
103 321 mdecorde
File srcDir = new File(rootDir);
104 878 sjacqu01
File binDir = new File(Toolbox.getTxmHomePath(),"corpora/"+basename);
105 321 mdecorde
binDir.deleteDir();
106 321 mdecorde
binDir.mkdirs();
107 321 mdecorde
if (!binDir.exists()) {
108 321 mdecorde
        println "Could not create binDir "+binDir
109 321 mdecorde
        return;
110 321 mdecorde
}
111 321 mdecorde
112 321 mdecorde
File txmDir = new File(binDir,"txm/$corpusname");
113 321 mdecorde
txmDir.deleteDir();
114 321 mdecorde
txmDir.mkdirs();
115 321 mdecorde
116 321 mdecorde
//get metadata values from CSV
117 321 mdecorde
Metadatas metadatas; // text metadata
118 1000 mdecorde
File allMetadataFile = Metadatas.findMetadataFile(srcDir);
119 1000 mdecorde
println "Trying to read metadatas values from: "+allMetadataFile
120 1000 mdecorde
if (allMetadataFile.exists()) {
121 1000 mdecorde
        File copy = new File(binDir, allMetadataFile.getName())
122 1000 mdecorde
        if (!FileCopy.copy(allMetadataFile, copy)) {
123 1000 mdecorde
                println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile();
124 321 mdecorde
                return;
125 321 mdecorde
        }
126 945 mdecorde
        metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(),
127 945 mdecorde
                Toolbox.getMetadataColumnSeparator(),
128 945 mdecorde
                Toolbox.getMetadataTextSeparator(), 1)
129 321 mdecorde
}
130 321 mdecorde
else
131 1000 mdecorde
        println "no metadata file: "+allMetadataFile
132 321 mdecorde
133 321 mdecorde
File propertyFile = new File(srcDir, "import.properties")//default
134 321 mdecorde
Properties props = new Properties();
135 321 mdecorde
String[] metadatasToKeep;
136 321 mdecorde
if (propertyFile.exists() && propertyFile.canRead()) {
137 321 mdecorde
        FileInputStream input = new FileInputStream(propertyFile);
138 321 mdecorde
        props.load(input);
139 321 mdecorde
        input.close();
140 321 mdecorde
141 321 mdecorde
        if (props.getProperty("removeInterviewer") != null)
142 321 mdecorde
                removeInterviewer = Boolean.parseBoolean(props.get("removeInterviewer").toString());
143 321 mdecorde
        if (props.getProperty("ignoreTranscriberMetadata") != null)
144 321 mdecorde
                ignoreTranscriberMetadata = Boolean.parseBoolean(props.get("ignoreTranscriberMetadata").toString());
145 321 mdecorde
        if (props.getProperty("metadataList") != null)
146 321 mdecorde
                metadatasToKeep = props.get("metadataList").toString().split("|");
147 321 mdecorde
        if (props.getProperty("csvHeaderNumber") != null)
148 321 mdecorde
                csvHeaderNumber = props.get("csvHeaderNumber").toString().split("|");
149 321 mdecorde
        //if (props.getProperty("includeComments") != null)
150 321 mdecorde
        //        includeComments = props.get("includeComments").toString();
151 321 mdecorde
152 321 mdecorde
        println "import properties: "
153 321 mdecorde
        println " removeInterviewer: "+removeInterviewer
154 321 mdecorde
        println " ignoreTranscriberMetadata: "+ignoreTranscriberMetadata
155 321 mdecorde
        println " metadataToKeep: "+metadatasToKeep
156 321 mdecorde
        println " ignored csvHeaderSize: "+csvHeaderNumber
157 321 mdecorde
        //println " includeComments: "+includeComments
158 321 mdecorde
}
159 321 mdecorde
160 321 mdecorde
// Apply XSL
161 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
162 321 mdecorde
if (MONITOR != null) MONITOR.worked(1, "XSL")
163 321 mdecorde
if (xsl != null && xsl.trim().length() > 0) {
164 321 mdecorde
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src")))
165 321 mdecorde
                srcDir = new File(binDir, "src");
166 321 mdecorde
        println ""
167 321 mdecorde
}
168 321 mdecorde
169 321 mdecorde
try {
170 321 mdecorde
        // select only trs files
171 321 mdecorde
        String ext = "trs";
172 321 mdecorde
        ArrayList<File> trsfiles = srcDir.listFiles(); //find all trs files
173 321 mdecorde
        if (trsfiles  == null) {
174 321 mdecorde
                println ("No files in "+srcDir.getAbsolutePath())
175 321 mdecorde
                return false;
176 321 mdecorde
        }
177 321 mdecorde
        for (int i = 0 ; i < trsfiles.size() ; i++) {
178 321 mdecorde
                File f = trsfiles.get(i);
179 321 mdecorde
                if (!f.getName().endsWith(ext) || !f.canRead() || f.isHidden()) {
180 321 mdecorde
                        trsfiles.remove(i)
181 321 mdecorde
                        i--;
182 321 mdecorde
                }
183 321 mdecorde
        }
184 321 mdecorde
185 321 mdecorde
        if (trsfiles.size() == 0) {
186 321 mdecorde
                println ("No transcription file (*.trs) found in "+srcDir.getAbsolutePath()+". Aborting.")
187 321 mdecorde
                return false;
188 321 mdecorde
        }
189 321 mdecorde
190 321 mdecorde
        if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
191 321 mdecorde
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
192 321 mdecorde
        println "-- IMPORTER"
193 321 mdecorde
        def imp = new importer(trsfiles, binDir, txmDir, metadatas, lang) //put result in the txm folder of binDir
194 321 mdecorde
        if (!imp.run()) {
195 321 mdecorde
                println "Failed to prepare files - Aborting";
196 321 mdecorde
                return;
197 321 mdecorde
        }
198 321 mdecorde
        if (MONITOR != null) MONITOR.worked(20)
199 321 mdecorde
200 321 mdecorde
        println "-- Xml Validation"
201 321 mdecorde
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
202 321 mdecorde
        for (File infile : txmDir.listFiles()) {
203 321 mdecorde
                if (!ValidateXml.test(infile)) {
204 321 mdecorde
                        println "$infile : Validation failed";
205 321 mdecorde
                        infile.delete();
206 321 mdecorde
                }
207 321 mdecorde
        }
208 321 mdecorde
209 321 mdecorde
        if (MONITOR != null) MONITOR.worked(5)
210 321 mdecorde
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
211 321 mdecorde
        println "-- Remove interviewer: "+removeInterviewer
212 321 mdecorde
        if (removeInterviewer)         {
213 321 mdecorde
                if (metadatas == null) {
214 321 mdecorde
                        println "Can't remove interviewer without a metadata.csv file defining who are the interviewers."
215 321 mdecorde
                } else {
216 321 mdecorde
                        println "Removing some speakers in "+txmDir.listFiles().length+" file(s)"
217 321 mdecorde
                        for (File infile : txmDir.listFiles()) {
218 321 mdecorde
                                String filename = infile.getName();
219 321 mdecorde
                                int idx = filename.indexOf(".xml");
220 321 mdecorde
                                if (idx > 0)
221 321 mdecorde
                                        filename = filename.substring(0, idx);
222 321 mdecorde
223 321 mdecorde
                                ArrayList<Pair<String, String>> metas = metadatas.get(filename)
224 321 mdecorde
                                //println "filename=$filename metas= $metas"
225 321 mdecorde
                                for (Pair p : metas) {
226 321 mdecorde
                                        if (p.getFirst().startsWith("enq")) {
227 321 mdecorde
                                                new RemoveSpeaker(infile, infile, p.getFirst())
228 321 mdecorde
                                        }
229 321 mdecorde
                                }
230 321 mdecorde
                        }
231 321 mdecorde
                }
232 321 mdecorde
        }
233 321 mdecorde
234 321 mdecorde
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
235 321 mdecorde
        if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
236 321 mdecorde
        println "-- ANNOTATE - Running NLP tools"
237 321 mdecorde
        boolean annotationSuccess = false;
238 927 mdecorde
if (annotate) {
239 927 mdecorde
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
240 927 mdecorde
        if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
241 927 mdecorde
                annotationSuccess = true;
242 927 mdecorde
        }
243 927 mdecorde
}
244 321 mdecorde
245 321 mdecorde
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
246 321 mdecorde
        if (MONITOR != null) MONITOR.worked(25, "COMPILING")
247 321 mdecorde
        println "--COMPILING - Building Search Engine indexes"
248 321 mdecorde
        trsfiles = txmDir.listFiles();
249 321 mdecorde
250 321 mdecorde
        def comp = new compiler()
251 321 mdecorde
        if(debug) comp.setDebug();
252 321 mdecorde
        comp.removeInterviewers(removeInterviewer);
253 321 mdecorde
        comp.setIgnoreTranscriberMetadata(ignoreTranscriberMetadata);
254 321 mdecorde
        comp.setAnnotationSucces(annotationSuccess)
255 321 mdecorde
        if (!comp.run(trsfiles, corpusname, "default", binDir)) {
256 321 mdecorde
                println "Failed to compile files";
257 321 mdecorde
                return;
258 321 mdecorde
        }
259 321 mdecorde
260 321 mdecorde
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
261 321 mdecorde
262 321 mdecorde
        File htmlDir = new File(binDir,"HTML/$corpusname");
263 321 mdecorde
        htmlDir.deleteDir()
264 321 mdecorde
        htmlDir.mkdirs();
265 321 mdecorde
        if (build_edition) {
266 321 mdecorde
267 321 mdecorde
                if (MONITOR != null) MONITOR.worked(20, "EDITION")
268 321 mdecorde
                println "-- EDITION - Building editions"
269 321 mdecorde
270 321 mdecorde
                List<File> filelist = txmDir.listFiles();
271 321 mdecorde
                Collections.sort(filelist);
272 321 mdecorde
                def second = 0
273 321 mdecorde
274 321 mdecorde
                println "Paginating texts: "
275 321 mdecorde
                for (File txmFile : filelist) {
276 321 mdecorde
                        print "."
277 321 mdecorde
                        String txtname = txmFile.getName();
278 321 mdecorde
                        int i = txtname.lastIndexOf(".");
279 321 mdecorde
                        if(i > 0) txtname = txtname.substring(0, i);
280 321 mdecorde
281 321 mdecorde
                        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
282 321 mdecorde
                        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
283 321 mdecorde
284 321 mdecorde
                        Element text = params.addText(corpusElem, txtname, txmFile);
285 321 mdecorde
286 321 mdecorde
                        def ed = new pager(txmFile, htmlDir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, metadatas);
287 321 mdecorde
                        Element edition = params.addEdition(text, "default", htmlDir.getAbsolutePath(), "html");
288 321 mdecorde
289 321 mdecorde
                        for (i = 0 ; i < ed.getPageFiles().size();) {
290 321 mdecorde
                                File f = ed.getPageFiles().get(i);
291 321 mdecorde
                                String wordid = ed.getIdx().get(i);
292 321 mdecorde
                                params.addPage(edition, ""+(++i), wordid);
293 321 mdecorde
                        }
294 321 mdecorde
295 321 mdecorde
                        if (ed.getPageFiles().size() > 0) {
296 321 mdecorde
                                Element editionBD = params.addEdition(text, "onepage", htmlDir.getAbsolutePath(), "html");
297 321 mdecorde
                                params.addPage(editionBD, "1", ed.getIndexes().get(0));
298 321 mdecorde
                        }
299 321 mdecorde
                }
300 321 mdecorde
301 321 mdecorde
                //copy transcriber.css
302 878 sjacqu01
                File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css")
303 321 mdecorde
                if (cssfile.exists() && htmlDir.exists()) {
304 321 mdecorde
                        FileCopy.copy(cssfile, new File(htmlDir, "transcriber.css"));
305 321 mdecorde
                        FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"));
306 321 mdecorde
                        FileCopy.copy(cssfile, new File(htmlDir, "default/transcriber.css"));
307 321 mdecorde
                }
308 321 mdecorde
        }
309 321 mdecorde
}
310 321 mdecorde
catch (Exception e){org.txm.utils.logger.Log.printStackTrace(e);}
311 321 mdecorde
312 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
313 321 mdecorde
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
314 321 mdecorde
File paramFile = new File(binDir, "import.xml");
315 321 mdecorde
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true;