Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / doc / docLoader.groovy @ 966

History | View | Annotate | Download (12.5 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
// $LastChangedRevision: 2174 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.importer.doc;
29

    
30
import org.txm.importer.*;
31
import javax.xml.stream.XMLStreamReader;
32
import org.txm.importer.ApplyXsl2;
33
import org.txm.importer.ValidateXml;
34
import org.txm.importer.xml.importer;
35
import org.txm.importer.xml.compiler;
36
import org.txm.objects.*;
37
import org.txm.tokenizer.TokenizerClasses;
38
import org.txm.utils.*;
39
import org.txm.utils.io.*;
40
import org.txm.*;
41
import org.txm.core.preferences.TBXPreferences
42
import org.txm.core.engines.*;
43
import org.txm.importer.xmltxm.*;
44
import org.txm.utils.i18n.*;
45
import org.txm.metadatas.*;
46
import javax.xml.stream.*;
47
import org.w3c.dom.Element;
48
import org.txm.utils.xml.DomUtils;
49
import org.txm.sw.RemoveTag;
50
import org.txm.doc.*;
51

    
52
// Script bootstrap: resolve the import parameters and the progress monitor.
// "paramsBinding" and "monitor" are read as undeclared script variables; if reading
// either one throws, the script switches to "DEV MODE" and loads the parameters
// from <user.home>/xml/doc/import.xml instead.
String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
    params = paramsBinding;
    MONITOR = monitor;
} catch (Exception ignored) {
    // an exception here means we are debugging
    println "DEV MODE";
    debug = true;

    File devImportFile = new File(userDir, "xml/doc/import.xml");
    params = new BaseParameters(devImportFile);
    params.load();

    if (!org.txm.Toolbox.isInitialized()) {
        def tokenizerElement = params.getTokenizerElement(params.getCorpusElement());
        TokenizerClasses.loadFromNode(tokenizerElement);

        Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
        // For Windows: Toolbox.setParam(Toolbox.INSTALL_DIR, new File("C:\\Program Files\\TXM"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        // For Windows: Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File("C:\\Program Files\\treetagger\\models"));
        File userTxmHome = new File(System.getProperty("user.home"), "TXM");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, userTxmHome);
    }
}

// Nothing can be imported without parameters.
if (params == null) {
    println "no parameters. Aborting";
    return;
}
74

    
75
// Corpus-level settings read from the import parameters.
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// Language and annotation switch are attributes of the corpus element.
String lang = corpusElem.getAttribute("lang")
String model = lang // later passed as the "lang" option of the annotation engine
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// Edition settings of the "default" edition.
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
84

    
85
// Working directories: the source directory, the corpus binary directory
// (wiped and recreated) and its "txm/<corpus name>" sub-directory.
File srcDir = new File(rootDir)

File binDir = new File(Toolbox.getTxmHomePath(), "corpora/" + basename)
binDir.deleteDir()
binDir.mkdirs()
if (!binDir.exists()) {
    // mkdirs() failed: nothing can be written, give up
    println "Could not create binDir " + binDir
    return
}

File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
97

    
98
// Import options used by the later compile step.
String textSortAttribute = null; // a property in metadata.csv file
String paginationElement = "pb";
boolean normalizeMetadata = true;

File allmetadatasfile = Metadatas.findMetadataFile(srcDir);

// File.listFiles() returns null when the path is not a readable directory.
// The source directory is the one iterated below, so it is checked here as well
// as the freshly created txm directory.
File[] srcListing = srcDir.listFiles();
if (srcListing == null || txmDir.listFiles() == null) {
    println "No file to process"
    return;
}

// Collect the documents to convert: visible regular files whose lower-cased name
// ends with one of the supported extensions. Everything else (.properties, .csv,
// import.xml, ...) is excluded by the extension whitelist.
def supportedExtensions = [".doc", ".docx", ".odt", ".rtf"];
def srcfiles = [];
for (File f : srcListing) {
    if (f.isDirectory() || f.isHidden()) continue;
    String name = f.getName().toLowerCase();
    if (supportedExtensions.any { ext -> name.endsWith(ext) }) {
        srcfiles.add(f)
    }
}

// size() is never null: test for an empty list so the import really stops
// when the source directory contains no supported document.
if (srcfiles.isEmpty()) {
    println "No file to process"
    return;
}
126

    
127
// convert to TEI
// Hands the collected source documents to DocumentToTei, which writes into txmDir
// using the XSL directory found under the TXM install directory.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "CREATING TEI FILES")
println "-- CONVERTER - Converting source files"
File xsldir = new File(Toolbox.getPreference(TBXPreferences.INSTALL_DIR), "redist/xsl")
if (!DocumentToTei.processFiles(srcfiles, txmDir, xsldir)) {
    // conversion failed: stop the whole import
    println "Abort import"
    return;
}
println ""
137

    
138
// move data folders and build css file
// Every directory found in txmDir is moved to <binDir>/docfiles, a style.css is
// built inside it, and the matching XML file is patched with the page breaks and
// parent styles collected by the converter.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "RETRIEVE STYLES")
println "Retrieving data folders and style files"
println txmDir.listFiles()
File docfiles = new File(binDir, "docfiles");
docfiles.mkdir()
for (File infile : txmDir.listFiles()) {
    if (!infile.isDirectory()) continue;

    File unzipDir = new File(docfiles, infile.getName())
    // renameTo() reports failure through its return value only: do not go on
    // with a folder that could not be moved
    if (!infile.renameTo(unzipDir)) {
        println "WARNING: Failed to move $infile to $unzipDir"
        continue;
    }

    StylesToCSS converter = new StylesToCSS(unzipDir);
    if (!converter.process(new File(unzipDir, "style.css"))) {
        println "WARNING: Failed to build css file of $unzipDir"
    }
    // and get the soft page breaks and styles parents
    def parentStyles = converter.parentStyles;
    def beforebreaks = converter.beforebreaks
    def afterbreaks = converter.afterbreaks

    // use these to insert <pb/> and replace the automatic styles
    // the XML file name is the folder name without its first 6 characters
    // (presumably the "files-" prefix - confirm against DocumentToTei)
    File xmlFile = new File(txmDir, unzipDir.getName().substring(6))
    new FixBreaksAndAutomaticStyles().run(xmlFile, beforebreaks, afterbreaks, parentStyles)
}
170

    
171
// set new Root
// Each visible regular file of txmDir gets the element matched by "//tei:text"
// as its new document root; the import stops at the first failure.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FIXING TEI")
}
println "Setting new XML root element"
for (File teiFile : txmDir.listFiles()) {
    boolean visibleRegularFile = !teiFile.isHidden() && !teiFile.isDirectory() && !teiFile.getName().startsWith(".")
    if (visibleRegularFile) {
        print "."
        boolean rootChanged = SetNewXmlDocumentRoot.process(teiFile, "//tei:text")
        if (!rootChanged) {
            println "Failed to reset XML root $teiFile"
            return
        }
    }
}
println ""
184

    
185
// filtering
// Removes from every visible regular file of txmDir the elements matched by the
// exclusion xpaths of the import parameters, plus "//tei:term".
// NOTE(review): this first message is printed before the actual xpath list is known.
println "Filtering XML files with xpaths: [//tei:term]"
def xpaths = params.getExcludeXpaths()
if (xpaths == null) {
    xpaths = ["//tei:term"]
} else {
    xpaths << "//tei:term"
}

// xpaths cannot be null here, so the filtering always runs
println "Filtering XML files with xpaths: $xpaths"
for (File teiFile : txmDir.listFiles()) {
    boolean visibleRegularFile = !teiFile.isHidden() && !teiFile.isDirectory() && !teiFile.getName().startsWith(".")
    if (!visibleRegularFile) continue;
    print "."
    if (!RemoveTag.xpath(teiFile, xpaths)) {
        println "Failed to filter $teiFile"
        return
    }
}
println ""
203

    
204
//get metadata values from CSV
// Copies the metadata file into binDir and loads it; "metadatas" stays null when
// no metadata file is available.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "READ METADATA")
Metadatas metadatas; // text metadata
// guard against a null result of the metadata file lookup before calling exists()
if (allmetadatasfile != null && allmetadatasfile.exists()) {
    File copy = new File(binDir, allmetadatasfile.getName())
    if (!FileCopy.copy(allmetadatasfile, copy)) {
        println "Error: could not create a copy of the $allmetadatasfile file "+allmetadatasfile.getAbsoluteFile();
        return;
    }
    metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(),
        Toolbox.getMetadataColumnSeparator(),
        Toolbox.getMetadataTextSeparator(), 1)
} else {
    println "no $allmetadatasfile metadata file found."
}

// delete hidden files and dot-files left in txmDir (one delete per file)
for (File infile : txmDir.listFiles()) {
    if (infile.isHidden() || infile.getName().startsWith(".")) infile.delete();
}
226

    
227
// IMPORTER step: runs the XML importer on the prepared files, with validation
// and tokenization switched on.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"
def xmlImporter = new importer();
xmlImporter.doValidation(true) // change this to not validate xml
xmlImporter.doTokenize(true) // change this, to not tokenize xml
if (!xmlImporter.run(srcDir, binDir, txmDir, basename, "", lang)) {
    println "import process stopped";
    return;
}
237

    
238
// INJECTING METADATA step, only when a metadata file was loaded: each visible
// regular file of txmDir is rewritten into a temporary file carrying the
// metadata, which then replaces the original file.
if (metadatas != null) {
    if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "INJECTING METADATA")
    }
    println "-- INJECTING METADATA - from csv file: "+allmetadatasfile

    println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
    for (File infile : txmDir.listFiles()) {
        if (infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")) continue;
        print "."

        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
        if (metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
            // swap the temporary file in place of the original
            boolean swapped = infile.delete() && outfile.renameTo(infile)
            if (!swapped) {
                println "Warning can't rename file "+outfile+" to "+infile
            }
            if (!infile.exists()) {
                println "Error: could not replace $infile by $outfile"
                return false;
            }
        } else {
            // injection failed: drop the temporary file and keep the original
            outfile.delete();
        }
    }
    println ""
}
261

    
262
// Stop silently when txmDir holds nothing to annotate or compile.
List<File> files = txmDir.listFiles()
if (files == null || files.isEmpty()) {
    return;
}

// ANNOTATE step: when the corpus asks for annotation, the "TreeTagger" engine
// processes txmDir; annotationSuccess records whether it reported success.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"
boolean annotationSuccess = false;
if (annotate) {
    def treeTagger = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
    def engineOptions = ["lang":model]
    if (treeTagger.processDirectory(txmDir, binDir, engineOptions)) {
        annotationSuccess = true;
    }
}
277

    
278
// COMPILING step: configures the compiler with the import options, the
// annotation outcome and the corpus language, then runs it.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def corpusCompiler = new compiler();
if (debug) {
    corpusCompiler.setDebug();
}
//corpusCompiler.setCwbPath("~/TXM/cwb/bin");
corpusCompiler.setOptions(textSortAttribute, normalizeMetadata);
corpusCompiler.setAnnotationSuccess(annotationSuccess)
corpusCompiler.setLang(lang);
boolean compiled = corpusCompiler.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
    println "import process stopped";
    return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
293

    
294
// Recreate the HTML output directory of the corpus from scratch.
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();

// EDITION step: paginates every text into HTML pages, registers the texts,
// editions and pages in the import parameters, then moves the style sheets and
// copies the embedded pictures next to the pages.
if (build_edition) {

    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(25, "EDITION")

    File outdir = new File(binDir,"/HTML/$corpusname/default/");
    outdir.mkdirs();
    List<File> filelist = txmDir.listFiles();
    Collections.sort(filelist);

    // these lists only depend on the corpus language: fetch them once
    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

    println "Paginating texts: "

    for (File txmFile : filelist) {
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."

        // text name = file name without its last extension
        String txtname = txmFile.getName();
        int extensionStart = txtname.lastIndexOf(".");
        if (extensionStart > 0) txtname = txtname.substring(0, extensionStart);

        Element text = params.addText(corpusElem, txtname, txmFile);

        def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, "pb");
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // register one page per page file; page numbers are 1-based and each page
        // is paired with the entry of ed.getIdx() at the same position
        int pageCount = ed.getPageFiles().size();
        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
            String wordid = ed.getIdx().get(pageIndex);
            params.addPage(edition, ""+(pageIndex + 1), wordid);
        }
    }

    // move each text's style.css to <outdir>/<text name>.css
    for (File unzipDir : docfiles.listFiles()) {
        String textname = unzipDir.getName();
        int lastDot = textname.lastIndexOf(".");
        if (lastDot < 6) {
            // name too short or without extension: substring(6, lastDot) would throw
            println "WARNING: unexpected entry in $docfiles: $unzipDir"
            continue;
        }
        textname = textname.substring(6, lastDot) // remove "files-" and ".xml"
        File css = new File(unzipDir, "style.css")
        File newcss = new File(outdir, textname+".css")
        css.renameTo(newcss)
    }

    File doc = new File(Toolbox.getTxmHomePath(), "css/doc.css")
    // NOTE(review): doc_copy is not declared, so Groovy stores it in the script
    // binding; kept as is in case a caller reads it - confirm and declare it locally
    doc_copy = new File(outdir, "doc.css")
    FileCopy.copy(doc, doc_copy);

    // copy images : images externally linked must be added manually in the html/CORPUS/default directory
    println "Copying internal images..."
    for (File txmFile : filelist) {
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."
        File picturesDirectory = new File(binDir, "docfiles/files-"+txmFile.getName()+"/Pictures")
        println picturesDirectory
        if (picturesDirectory.exists()) {
            File picturesDirectory_copy = new File(outdir, "Pictures")
            FileCopy.copyFiles(picturesDirectory, picturesDirectory_copy)
            if (!picturesDirectory_copy.exists()) {
                println "Failed to copy $picturesDirectory to $picturesDirectory_copy"
            }
        }
    }
    println ""
}
361

    
362
// FINALIZING step: saves the (updated) import parameters document as
// <binDir>/import.xml and raises the readyToLoad flag.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml");
def paramsDocument = params.root.getOwnerDocument();
DomUtils.save(paramsDocument, paramFile);
// readyToLoad is not declared in this script: the assignment goes to the script
// binding (presumably read by whoever launched the script - confirm)
readyToLoad = true;