Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / doc / docLoader.groovy @ 966

History | View | Annotate | Download (12.5 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 321 mdecorde
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
25 321 mdecorde
// $LastChangedRevision: 2174 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 321 mdecorde
package org.txm.importer.doc;
29 321 mdecorde
30 321 mdecorde
import org.txm.importer.*;
31 321 mdecorde
import javax.xml.stream.XMLStreamReader;
32 321 mdecorde
import org.txm.importer.ApplyXsl2;
33 321 mdecorde
import org.txm.importer.ValidateXml;
34 321 mdecorde
import org.txm.importer.xml.importer;
35 321 mdecorde
import org.txm.importer.xml.compiler;
36 321 mdecorde
import org.txm.objects.*;
37 321 mdecorde
import org.txm.tokenizer.TokenizerClasses;
38 321 mdecorde
import org.txm.utils.*;
39 927 mdecorde
import org.txm.utils.io.*;
40 321 mdecorde
import org.txm.*;
41 927 mdecorde
import org.txm.core.preferences.TBXPreferences
42 927 mdecorde
import org.txm.core.engines.*;
43 927 mdecorde
import org.txm.importer.xmltxm.*;
44 321 mdecorde
import org.txm.utils.i18n.*;
45 321 mdecorde
import org.txm.metadatas.*;
46 321 mdecorde
import javax.xml.stream.*;
47 321 mdecorde
import org.w3c.dom.Element;
48 479 mdecorde
import org.txm.utils.xml.DomUtils;
49 321 mdecorde
import org.txm.sw.RemoveTag;
50 321 mdecorde
import org.txm.doc.*;
51 321 mdecorde
52 321 mdecorde
// Home directory of the current user; the developer-mode parameter file is
// looked up below it.
String userDir = System.getProperty("user.home")

// Progress monitor; only set when the caller provides one.
def MONITOR
boolean debug = org.txm.utils.logger.Log.isPrintingErrors()
BaseParameters params
try {
        // Neither name is declared in this script (presumably both come from the
        // script binding -- confirm against the caller). Any failure while reading
        // them switches the script to developer mode.
        params = paramsBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // An exception here means the script is being run for debugging.
        println "DEV MODE"
        debug = true
        params = new BaseParameters(new File(userDir, "xml/doc/import.xml"))
        params.load()
        if (!org.txm.Toolbox.isInitialized()) {
                def corpusNode = params.getCorpusElement()
                TokenizerClasses.loadFromNode(params.getTokenizerElement(corpusNode))
                // Linux location; on Windows use new File("C:\\Program Files\\TXM").
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
                // On Windows, Toolbox.TREETAGGER_MODELS_PATH can be pointed at
                // new File("C:\\Program Files\\treetagger\\models").
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"))
        }
}
if (params == null) {
        println "no parameters. Aborting"
        return
}
74 321 mdecorde
75 321 mdecorde
// Values read from the import parameters.
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir
String lang = corpusElem.getAttribute("lang")
// The annotation model is selected by the corpus language.
String model = lang
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")

// Source directory, and binary corpus directory (recreated empty on each run).
File srcDir = new File(rootDir)
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/"+basename)
binDir.deleteDir()
binDir.mkdirs()
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return
}

// Directory that receives the intermediate XML files (also recreated empty).
File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()

// Options handed to the later steps of the script.
String textSortAttribute = null // a property in metadata.csv file
String paginationElement = "pb"
boolean normalizeMetadata = true

File allmetadatasfile = Metadatas.findMetadataFile(srcDir)
103 321 mdecorde
104 321 mdecorde
// Stop early when the freshly created TEI directory cannot be listed.
if (txmDir.listFiles() == null) {
        println "No file to process"
        return;
}

// Collect the word-processor documents found directly in the source directory.
// Only regular, non-hidden files with a .doc, .docx, .odt or .rtf extension
// (case-insensitive) are kept; that whitelist already excludes the
// import.xml, *.properties and *.csv side files.
def srcfiles = [];
for (File f : srcDir.listFiles()) {
        if (f.isDirectory() || f.isHidden()) continue;
        String name = f.getName().toLowerCase();
        if (name.endsWith(".doc") || name.endsWith(".docx") || name.endsWith(".odt") || name.endsWith(".rtf")) {
                srcfiles.add(f)
        }
}

// size() returns an int, so the former comparison with null never matched and an
// empty corpus went on to the conversion step; test for emptiness instead.
if (srcfiles.isEmpty()) {
        println "No file to process"
        return;
}
126 321 mdecorde
127 321 mdecorde
// Conversion step: turn every collected source document into a TEI file in txmDir.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(20, "CREATING TEI FILES")
}
println "-- CONVERTER - Converting source files"
// Stylesheets are read from the "redist/xsl" folder of the install directory.
File xsldir = new File(Toolbox.getPreference(TBXPreferences.INSTALL_DIR), "redist/xsl")
def converted = DocumentToTei.processFiles(srcfiles, txmDir, xsldir)
if (!converted) {
        println "Abord import"
        return
}
println ""
137 321 mdecorde
138 321 mdecorde
// Move the data folders found in txmDir to binDir/docfiles, build a style.css
// in each of them, then patch the matching XML file with the page breaks and
// parent styles gathered while building the CSS.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "RETRIEVE STYLES")
println "Retrieving data folders and style files"
println txmDir.listFiles()
File docfiles = new File(binDir, "docfiles");
docfiles.mkdir()
for (File infile : txmDir.listFiles()) {
        // Only directories are handled here; plain files stay where they are.
        if (!infile.isDirectory()) continue;

        File unzipDir = new File(docfiles, infile.getName())
        // renameTo reports failure through its return value only. Without this
        // check a failed move would hand a non-existent folder to StylesToCSS.
        if (!infile.renameTo(unzipDir)) {
                println "WARNING: Failed to move $infile to $unzipDir"
                continue;
        }

        StylesToCSS converter = new StylesToCSS(unzipDir);
        if (!converter.process(new File(unzipDir, "style.css"))) {
                println "WARNING: Failed to build css file of $unzipDir"
        }
        // and get the soft page breaks and styles parents
        def parentStyles = converter.parentStyles;
        def beforebreaks = converter.beforebreaks
        def afterbreaks = converter.afterbreaks

        // Use them to insert <pb/> elements and to replace the automatic styles.
        // The XML file name is the folder name minus its first 6 characters
        // (presumably the "files-" prefix -- confirm against DocumentToTei).
        File xmlFile = new File(txmDir, unzipDir.getName().substring(6))
        new FixBreaksAndAutomaticStyles().run(xmlFile, beforebreaks, afterbreaks, parentStyles)
}
170 321 mdecorde
171 321 mdecorde
// Re-root every XML file of txmDir on the element matched by //tei:text.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(20, "FIXING TEI")
}
println "Setting new XML root element"
for (File infile : txmDir.listFiles()) {
        // Hidden files, dot files and directories are not documents.
        boolean skipped = infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")
        if (!skipped) {
                print "."
                def rerooted = SetNewXmlDocumentRoot.process(infile, "//tei:text")
                if (!rerooted) {
                        println "Failed to reset XML root $infile"
                        return
                }
        }
}
println ""
184 321 mdecorde
185 321 mdecorde
// Filtering: remove from every XML file the elements matched by the exclusion
// xpaths of the import parameters, plus //tei:term which is always excluded.
def xpaths = params.getExcludeXpaths()
if (xpaths == null) xpaths = ["//tei:term"]
else xpaths << "//tei:term"

// The list is never null at this point, so the former null test and the
// hard-coded message printed before the list was built have been dropped:
// the effective xpaths are now reported exactly once.
println "Filtering XML files with xpaths: $xpaths"
for (File infile : txmDir.listFiles()) {
        // Hidden files, dot files and directories are not documents.
        if (infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")) continue;
        print "."
        if (!RemoveTag.xpath(infile, xpaths)) {
                println "Failed to filter $infile"
                return
        }
}
println ""
203 321 mdecorde
204 321 mdecorde
// Read the text metadata from the metadata file of the source directory, if any.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(20, "READ METADATA")
}
Metadatas metadatas // text metadata; stays null when no metadata file exists
if (!allmetadatasfile.exists()) {
        println "no $allmetadatasfile metadata file found."
} else {
        // The copy placed in binDir is the file that is actually parsed.
        File copy = new File(binDir, allmetadatasfile.getName())
        def copied = FileCopy.copy(allmetadatasfile, copy)
        if (!copied) {
                println "Error: could not create a copy of the $allmetadatasfile file "+allmetadatasfile.getAbsoluteFile()
                return
        }
        metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(),
                Toolbox.getMetadataColumnSeparator(),
                Toolbox.getMetadataTextSeparator(), 1)
}

// Remove hidden files and dot files from txmDir before the import steps.
for (File infile : txmDir.listFiles()) {
        if (infile.isHidden()) {
                infile.delete()
        }
        if (infile.getName().startsWith(".")) {
                infile.delete()
        }
}
226 321 mdecorde
227 321 mdecorde
// Importer step: run the XML importer over the files of txmDir.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"
def imp = new importer()
imp.doValidation(true) // pass false to skip the XML validation
imp.doTokenize(true) // pass false to skip the tokenization
def imported = imp.run(srcDir, binDir, txmDir, basename, "", lang)
if (!imported) {
        println "import process stopped"
        return
}
237 321 mdecorde
238 321 mdecorde
// Metadata injection: only performed when a metadata file was read.
if (metadatas != null) {
        if (MONITOR != null) {
                if (MONITOR.isCanceled()) { return MONITOR.done() }
                MONITOR.worked(20, "INJECTING METADATA")
        }
        println "-- INJECTING METADATA - from csv file: "+allmetadatasfile

        println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
        for (File infile : txmDir.listFiles()) {
                // Hidden files, dot files and directories are not documents.
                boolean skipped = infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")
                if (skipped) continue
                print "."

                // The result is written to a temporary sibling file, which then
                // takes the place of the original file.
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())
                if (metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
                        boolean swapped = infile.delete() && outfile.renameTo(infile)
                        if (!swapped) {
                                println "Warning can't rename file "+outfile+" to "+infile
                        }
                        if (!infile.exists()) {
                                println "Error: could not replace $infile by $outfile"
                                return false
                        }
                } else {
                        // Nothing was injected: discard the temporary file.
                        outfile.delete()
                }
        }
        println ""
}
261 321 mdecorde
262 321 mdecorde
// Nothing left to annotate or compile when txmDir holds no file.
List<File> files = txmDir.listFiles()
if (files == null || files.isEmpty()) {
        return
}

// Annotation step: only run when the corpus asks for it.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"
boolean annotationSuccess = false
if (annotate) {
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
        // The flag is raised only when the engine reports success.
        def annotated = engine.processDirectory(txmDir, binDir, ["lang":model])
        if (annotated) {
                annotationSuccess = true
        }
}
277 321 mdecorde
278 321 mdecorde
// Compilation step: build the search engine indexes from the files of txmDir.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler()
if (debug) {
        c.setDebug()
}
// A custom CWB location could be set here with c.setCwbPath("~/TXM/cwb/bin").
c.setOptions(textSortAttribute, normalizeMetadata)
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang)
def compiled = c.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
        println "import process stopped"
        return
}

if (MONITOR != null && MONITOR.isCanceled()) {
        return MONITOR.done()
}
293 321 mdecorde
294 321 mdecorde
// Start from an empty HTML directory for this corpus, whether or not an
// edition is built.
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
if (build_edition) {

        println "-- EDITION - Building edition"
        if (MONITOR != null) MONITOR.worked(25, "EDITION")

        // All pages, style sheets and pictures of the "default" edition go here.
        File outdir = new File(binDir,"/HTML/$corpusname/default/");
        outdir.mkdirs();
        List<File> filelist = txmDir.listFiles();
        Collections.sort(filelist);

        println "Paginating texts: "

        for (File txmFile : filelist) {
                // Hidden files, dot files and directories are not documents.
                if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
                print "."

                // The text name is the file name without its last extension.
                String txtname = txmFile.getName();
                int dot = txtname.lastIndexOf(".");
                if (dot > 0) txtname = txtname.substring(0, dot);

                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                Element text = params.addText(corpusElem, txtname, txmFile);

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, "pb");
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

                // Declare each page with a 1-based page number and the word id
                // stored at the same index.
                for (int p = 0 ; p < ed.getPageFiles().size() ; p++) {
                        String wordid = ed.getIdx().get(p);
                        params.addPage(edition, ""+(p + 1), wordid);
                }
        }

        // Publish the style sheet built in each data folder as <text name>.css.
        for (File unzipDir : docfiles.listFiles()) {
                File css = new File(unzipDir, "style.css")
                String textname = unzipDir.getName();
                if (textname.length() < 6) {
                        // substring(6) below would throw on such a name
                        println "WARNING: unexpected data folder name $unzipDir"
                        continue;
                }
                // remove "files-" and ".xml"; a name without an extension after
                // the prefix is kept whole instead of raising an exception
                int ext = textname.lastIndexOf(".")
                textname = textname.substring(6, ext >= 6 ? ext : textname.length())
                File newcss = new File(outdir, textname+".css")
                // renameTo only signals failure through its return value
                if (!css.renameTo(newcss) && css.exists()) {
                        println "WARNING: Failed to move $css to $newcss"
                }
        }
        // Declared locally: an undeclared assignment would create a variable in
        // the script binding.
        File doc = new File(Toolbox.getTxmHomePath(), "css/doc.css")
        File doc_copy = new File(outdir, "doc.css")
        FileCopy.copy(doc, doc_copy);


        // copy images : externally linked images must be added manually in the html/CORPUS/default directory
        println "Copying internal images..."
        for (File txmFile : filelist) {
                if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
                print "."
                File picturesDirectory = new File(binDir, "docfiles/files-"+txmFile.getName()+"/Pictures")
                println picturesDirectory
                if (picturesDirectory.exists()) {
                        // Every text copies into the same "Pictures" folder.
                        File picturesDirectory_copy = new File(outdir, "Pictures")
                        FileCopy.copyFiles(picturesDirectory, picturesDirectory_copy)
                        if (!picturesDirectory_copy.exists()) {
                                println "Failed to copy $picturesDirectory to $picturesDirectory_copy"
                        }
                }
        }
        println ""
}
361 321 mdecorde
362 321 mdecorde
// Finalisation: save the import parameters next to the binary corpus.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml")
DomUtils.save(params.root.getOwnerDocument(), paramFile)
// Not declared in this script, so the flag ends up in the script binding
// (presumably read by the caller to know the corpus can be loaded -- confirm).
readyToLoad = true