Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / doc / docLoader.groovy @ 1000

History | View | Annotate | Download (12.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
25
// $LastChangedRevision: 2174 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.doc;
29

    
30
import org.txm.scripts.importer.*;
31
import javax.xml.stream.XMLStreamReader;
32
import org.txm.importer.ApplyXsl2;
33
import org.txm.importer.ValidateXml;
34
import org.txm.scripts.importer.xml.importer;
35
import org.txm.scripts.importer.xml.compiler;
36
import org.txm.objects.*;
37
import org.txm.tokenizer.TokenizerClasses;
38
import org.txm.utils.*;
39
import org.txm.utils.io.*;
40
import org.txm.*;
41
import org.txm.core.preferences.TBXPreferences
42
import org.txm.core.engines.*;
43
import org.txm.importer.scripts.xmltxm.*;
44
import org.txm.utils.i18n.*;
45
import org.txm.metadatas.*;
46
import javax.xml.stream.*;
47
import org.w3c.dom.Element;
48
import org.txm.utils.xml.DomUtils;
49
import org.txm.scripts.importer.RemoveTag;
50
import org.txm.scripts.doc.*;
51

    
52
// Home directory of the current user; the standalone fallback below resolves its paths from it.
String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
    // Normal run: the import parameters and the progress monitor are read from
    // the names `paramsBinding` and `monitor`.
    params = paramsBinding;
    MONITOR = monitor
} catch (Exception ignored) {
    // Reaching this point means the names above could not be read: run in
    // developer mode with parameters loaded from the user's home directory.
    println "DEV MODE";
    debug = true
    File devParameters = new File(userDir, "xml/doc/import.xml")
    params = new BaseParameters(devParameters)
    params.load()
    if (!org.txm.Toolbox.isInitialized()) {
        // The Toolbox is not running: set the minimal configuration by hand.
        TokenizerClasses.loadFromNode(params.getTokenizerElement(params.getCorpusElement()));
        Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
        // Windows alternative:
        //Toolbox.setParam(Toolbox.INSTALL_DIR,new File("C:\\Program Files\\TXM"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        // Windows alternative:
        //Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File("C:\\Program Files\\treetagger\\models"));
        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
    }
}
// Nothing can be imported without parameters.
if (params == null) {
    println "no parameters. Aborting";
    return;
}
74

    
75
// ---- Import settings read from the parameters ----
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang // the annotation step receives the corpus language as its "lang" option
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")

// ---- Fixed options of this loader ----
String textSortAttribute = null; // a property in metadata.csv file
String paginationElement = "pb";
boolean normalizeMetadata = true;

// ---- Directories: sources are read from srcDir, the corpus is rebuilt from scratch in binDir ----
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return;
}

// Working directory that receives the XML files of the corpus.
File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

// Metadata file looked up in the source directory.
File allMetadataFile = Metadatas.findMetadataFile(srcDir);
103

    
104
// Stop when the working directory cannot be listed (listFiles() returns null
// when the path is not a readable directory).
if (txmDir.listFiles() == null) {
    println "No file to process"
    return;
}

// Collect the word-processor documents located directly in the source directory.
// Sub-directories and hidden files are ignored; any other file (import.xml,
// *.csv, *.properties, ...) is left out because its extension is not supported.
def srcfiles = [];
def supportedExtensions = [".doc", ".docx", ".odt", ".rtf"]
File[] candidates = srcDir.listFiles()
if (candidates != null) { // null when srcDir does not exist or is not a directory
    for (File f : candidates) {
        if (f.isDirectory() || f.isHidden()) continue;
        String name = f.getName().toLowerCase();
        if (supportedExtensions.any { name.endsWith(it) }) {
            srcfiles.add(f)
        }
    }
}

// size() is never null: test for emptiness so an empty source set stops the import here.
if (srcfiles.isEmpty()) {
    println "No file to process"
    return;
}
126

    
127
// Step: convert the source documents to TEI files written in the working directory.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "CREATING TEI FILES")
println "-- CONVERTER - Converting source files"
// The stylesheets are looked up in the "redist/xsl" folder of the install directory.
File xsldir = new File(Toolbox.getPreference(TBXPreferences.INSTALL_DIR), "redist/xsl")
boolean converted = DocumentToTei.processFiles(srcfiles, txmDir, xsldir)
if (!converted) {
    println "Abord import"
    return;
}
println ""
137

    
138
// Step: move the data folders produced by the conversion out of the working
// directory, build one CSS file per folder and patch the matching XML file.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "RETRIEVE STYLES")
println "Retrieving data folders and style files"
println txmDir.listFiles()
File docfiles = new File(binDir, "docfiles");
docfiles.mkdir()

for (File infile : txmDir.listFiles()) {
    if (!infile.isDirectory()) continue

    File unzipDir = new File(docfiles, infile.getName())
    // renameTo() reports failure through its return value only: without this
    // check the steps below would run on a folder that does not exist.
    if (!infile.renameTo(unzipDir)) {
        println "WARNING: Failed to move $infile to $unzipDir"
        continue
    }

    StylesToCSS converter = new StylesToCSS(unzipDir);
    if (!converter.process(new File(unzipDir, "style.css"))) {
        println "WARNING: Failed to build css file of $unzipDir"
    }
    // and get the soft page breaks and styles parents
    def parentStyles = converter.parentStyles;
    def beforebreaks = converter.beforebreaks
    def afterbreaks = converter.afterbreaks

    // Use them to insert <pb/> elements and to replace the automatic styles.
    // The XML file name is the folder name without its first 6 characters
    // (presumably the "files-" prefix that the edition step also strips - to be confirmed).
    File xmlFile = new File(txmDir, unzipDir.getName().substring(6))
    new FixBreaksAndAutomaticStyles().run(xmlFile, beforebreaks, afterbreaks, parentStyles)
}
171

    
172
// Step: make the element selected by //tei:text the root of each XML file.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FIXING TEI")
println "Setting new XML root element"
for (File infile : txmDir.listFiles()) {
    // Only regular, visible files are processed.
    boolean skipped = infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")
    if (!skipped) {
        print "."
        if (!SetNewXmlDocumentRoot.process(infile, "//tei:text")) {
            println "Failed to reset XML root $infile"
            return
        }
    }
}
println ""
185

    
186
// Step: remove from every XML file the elements matched by the exclusion XPaths.
// The XPaths configured in the import parameters are copied into a new list so
// that the list owned by `params` is not modified, then "//tei:term" is always added.
def xpaths = []
def excludeXpaths = params.getExcludeXpaths()
if (excludeXpaths != null) xpaths.addAll(excludeXpaths)
xpaths << "//tei:term"

// Logged once, after the list is complete, so the message shows what is really applied.
println "Filtering XML files with xpaths: $xpaths"
for (File infile : txmDir.listFiles()) {
    if (infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")) continue;
    print "."
    if (!RemoveTag.xpath(infile, xpaths)) {
        println "Failed to filter $infile"
        return
    }
}
println ""
204

    
205
// Step: read the text metadata from the CSV file, when there is one.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "READ METADATA")
Metadatas metadatas; // text metadata, stays null when no metadata file exists
if (!allMetadataFile.exists()) {
    println "no $allMetadataFile metadata file found."
} else {
    // The metadata is read from a copy of the file stored in the corpus directory.
    File copy = new File(binDir, allMetadataFile.getName())
    boolean copied = FileCopy.copy(allMetadataFile, copy)
    if (!copied) {
        println "Error: could not create a copy of the $allMetadataFile file "+allMetadataFile.getAbsoluteFile();
        return;
    }
    metadatas = new Metadatas(
        copy,
        Toolbox.getMetadataEncoding(),
        Toolbox.getMetadataColumnSeparator(),
        Toolbox.getMetadataTextSeparator(),
        1)
}

// Delete hidden files and dot-files from the working directory.
for (File infile : txmDir.listFiles()) {
    if (infile.isHidden()) { infile.delete(); }
    if (infile.getName().startsWith(".")) { infile.delete(); }
}
227

    
228
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
229
if (MONITOR != null) MONITOR.worked(5, "IMPORTER")
230
println "-- IMPORTER - Reading source files"
231
def imp = new importer();
232
imp.doValidation(true) // change this to not validate xml
233
imp.doTokenize(true) // change this, to not tokenize xml
234
if (!imp.run(srcDir, binDir, txmDir, basename, "", lang)) {
235
        println "import process stopped";
236
        return;
237
}
238

    
239
// Step: inject the CSV metadata into each XML file (skipped when no metadata was read).
if (metadatas != null) {
    if (MONITOR?.isCanceled()) { return MONITOR.done(); }
    MONITOR?.worked(20, "INJECTING METADATA")
    println "-- INJECTING METADATA - from csv file: "+allMetadataFile

    println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
    for (File infile : txmDir.listFiles()) {
        if (infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")) continue;
        print "."

        // The result is written to a temporary file next to the input,
        // which then takes the place of the input file.
        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
        if (metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
            boolean replaced = infile.delete() && outfile.renameTo(infile)
            if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
            if (!infile.exists()) {
                println "Error: could not replace $infile by $outfile"
                return false;
            }
        } else {
            // Injection failed: the input file is kept and the temporary file dropped.
            outfile.delete();
        }
    }
    println ""
}
262

    
263
// Nothing left to process when the working directory is missing or empty.
List<File> files = txmDir.listFiles()
if (!files) {
    return;
}

// Step: annotate the texts with TreeTagger when the corpus asks for it.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools"
boolean annotationSuccess = false;
if (annotate) {
    def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
    // The flag is raised only when the engine reports success.
    annotationSuccess = engine.processDirectory(txmDir, binDir, ["lang":model]) ? true : false
}
278

    
279
// Step: compile the corpus (search engine indexes).
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
if (debug) {
    c.setDebug();
}
//c.setCwbPath("~/TXM/cwb/bin");
c.setOptions(textSortAttribute, normalizeMetadata);
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang);
boolean compiled = c.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
    println "import process stopped";
    return;
}

if (MONITOR?.isCanceled()) { return MONITOR.done(); }
294

    
295
// Step: reset the HTML output folder of the corpus and, when requested, build
// the "default" edition: paginate each text, register its pages in the import
// parameters, then retrieve the CSS files and the embedded pictures.
File htmlDir = new File(binDir,"HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(25, "EDITION")

    File outdir = new File(binDir,"/HTML/$corpusname/default/");
    outdir.mkdirs();
    List<File> filelist = txmDir.listFiles();
    Collections.sort(filelist);

    println "Paginating texts: "

    for (File txmFile : filelist) {
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."

        // Text name = file name without its extension.
        String txtname = txmFile.getName();
        int dot = txtname.lastIndexOf(".");
        if (dot > 0) txtname = txtname.substring(0, dot);

        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        Element text = params.addText(corpusElem, txtname, txmFile);

        def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, "pb");
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // Register one page per generated page file; page names start at "1"
        // and each page is declared with the entry of getIdx() at the same index.
        def pageFiles = ed.getPageFiles()
        def wordIds = ed.getIdx()
        for (int p = 0; p < pageFiles.size(); p++) {
            params.addPage(edition, ""+(p + 1), wordIds.get(p));
        }
    }

    // Move the CSS file built for each document next to the HTML pages.
    for (File unzipDir : docfiles.listFiles()) {
        String textname = unzipDir.getName();
        int extIdx = textname.lastIndexOf(".")
        // Expected name shape: "files-" + text name + ".xml". Any other entry
        // would make substring() throw, so it is reported and skipped.
        if (extIdx < 6) {
            println "WARNING: unexpected entry $unzipDir, no css file retrieved"
            continue
        }
        textname = textname.substring(6, extIdx) // remove "files-" and ".xml"
        File css = new File(unzipDir, "style.css")
        File newcss = new File(outdir, textname+".css")
        css.renameTo(newcss)
    }
    // Copy the shared stylesheet. doc_copy is declared locally so that it is
    // not stored in the script binding.
    File doc = new File(Toolbox.getTxmHomePath(), "css/doc.css")
    File doc_copy = new File(outdir, "doc.css")
    FileCopy.copy(doc, doc_copy);

    // copy images : images externaly linked must be added manually in the html/CORPUS/default directory
    println "Copying internal images..."
    for (File txmFile : filelist) {
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."
        File picturesDirectory = new File(binDir, "docfiles/files-"+txmFile.getName()+"/Pictures")
        if (picturesDirectory.exists()) {
            // All texts share the same "Pictures" folder in the edition directory.
            File picturesDirectory_copy = new File(outdir, "Pictures")
            FileCopy.copyFiles(picturesDirectory, picturesDirectory_copy)
            if (!picturesDirectory_copy.exists()) {
                println "Failed to copy $picturesDirectory to $picturesDirectory_copy"
            }
        }
    }
    println ""
}
362

    
363
// Step: save the import parameters in the corpus directory and flag the result.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// Undeclared on purpose: the assignment stores the flag in the script binding.
readyToLoad = true;