Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / doc / docLoader.groovy @ 187

History | View | Annotate | Download (12.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
25
// $LastChangedRevision: 2174 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.doc;
29

    
30
import org.txm.importer.*;
31

    
32
import javax.xml.stream.XMLStreamReader;
33

    
34
import org.txm.importer.ApplyXsl2;
35
import org.txm.importer.ValidateXml;
36
import org.txm.importer.xml.importer;
37
import org.txm.importer.xml.compiler;
38
import org.txm.objects.*;
39
import org.txm.tokenizer.TokenizerClasses;
40
import org.txm.utils.*;
41
import org.txm.*;
42
import org.txm.scripts.teitxm.*;
43
import org.txm.utils.i18n.*;
44
import org.txm.metadatas.*;
45

    
46
import javax.xml.stream.*;
47

    
48
import org.w3c.dom.Element;
49
import org.txm.sw.RemoveTag;
50
import org.txm.doc.*;
51

    
52
// Home directory of the current user; the standalone (dev mode) paths below
// are all resolved against it.
String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
    // Both names are expected to come from the script binding. Reading an
    // unbound name throws, which is how a standalone run is detected.
    params = paramsBinding;
    MONITOR = monitor;
} catch (Exception bindingMissing) {
    // exception means we debug
    println "DEV MODE";
    debug = true

    File devImportFile = new File(userDir, "xml/doc/import.xml")
    params = new BaseParameters(devImportFile)
    params.load()

    if (!org.txm.Toolbox.isInitialized()) {
        def tokenizerNode = params.getTokenizerElement(params.getCorpusElement())
        TokenizerClasses.loadFromNode(tokenizerNode);

        // Linux defaults. On Windows the original script suggested
        // "C:\\Program Files\\TXM", "C:\\Program Files\\treetagger" and
        // "C:\\Program Files\\treetagger\\models" for the three paths below.
        File treetaggerHome = new File(userDir, "treetagger")
        File treetaggerModels = new File(userDir, "treetagger/models")
        Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
        Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, treetaggerHome);
        Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, treetaggerModels);

        // Format of the metadata.csv file: encoding, column and text separators.
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");

        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
    }
}

// Nothing can be imported without parameters.
if (params == null) {
    println "no parameters. Aborting";
    return;
}
77

    
78
// --- Import settings read from the parameters ---
String corpusname = params.getCorpusName();
String basename = params.name;
String rootDir = params.rootDir;
Element corpusElem = params.corpora.get(corpusname);

String lang = corpusElem.getAttribute("lang");
// The annotation model name is the corpus language; ".par" is appended at annotation time.
String model = lang
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")

// --- Working directories ---
// srcDir holds the documents to import; binDir is recreated from scratch
// under <USER_TXM_HOME>/corpora/<basename>.
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/" + basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return;
}

// txmDir receives the XML files produced and patched by the following steps.
File txmDir = new File(binDir, "txm/" + corpusname);
txmDir.deleteDir();
txmDir.mkdirs();

// --- Fixed options of this import module ---
String textSortAttribute = null; // a property in metadata.csv file
String paginationElement = "pb";
boolean normalizeMetadata = true;

// Optional metadata file, looked up next to the source documents.
File allmetadatasfile = new File(srcDir, "metadata.csv");
106

    
107
// Collect the documents to import from the source directory.
//
// File.listFiles() returns null when the path is not a readable directory.
// The original guard only looked at txmDir, so an unreadable srcDir made the
// loop below fail with a NullPointerException. Both directories are checked
// now; the txmDir check is kept so the previous abort condition still holds.
File[] srcEntries = srcDir.listFiles()
if (srcEntries == null || txmDir.listFiles() == null) {
    println "No file to process"
    return;
}

// Only regular, visible word-processor documents are kept. This whitelist also
// excludes the .properties, .csv and import.xml files that may sit in the
// source directory.
def srcfiles = [];
for (File f : srcEntries) {
    if (f.isDirectory() || f.isHidden()) continue;
    String name = f.getName().toLowerCase();
    if (name.endsWith(".doc") || name.endsWith(".docx") || name.endsWith(".odt") || name.endsWith(".rtf")) {
        srcfiles.add(f)
    }
}

// size() is never null: the original "== null" test could not fire, so an
// empty selection went on to the conversion step instead of stopping here.
if (srcfiles.isEmpty()) {
    println "No file to process"
    return;
}
129

    
130
// Step: convert the collected source documents to TEI files written in txmDir.
if (MONITOR != null) {
    // Stop right away when the monitor reports a cancellation.
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "CREATING TEI FILES")
}
println "-- CONVERTER - Converting source files"

// The "xsl" directory of the user's TXM home is handed to the converter.
File xsldir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "xsl")
boolean converted = DocumentToTei.processFiles(srcfiles, txmDir, xsldir) ? true : false
if (!converted) {
    println "Abord import"
    return;
}
println ""
140

    
141
// Step: move the data folders found in txmDir to <binDir>/docfiles, build one
// CSS file per folder, then patch the matching XML file.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "RETRIEVE STYLES")
}
println "Retrieving data folders and style files"
println txmDir.listFiles()

File docfiles = new File(binDir, "docfiles");
docfiles.mkdir()

for (File infile : txmDir.listFiles()) {
    // Only the directories of txmDir are handled here.
    if (!infile.isDirectory()) continue;

    File unzipDir = new File(docfiles, infile.getName())
    infile.renameTo(unzipDir)

    // Build the stylesheet of this document inside its data folder.
    StylesToCSS converter = new StylesToCSS(unzipDir);
    File cssFile = new File(unzipDir, "style.css")
    if (!converter.process(cssFile)) {
        println "WARNING: Failed to build css file of $unzipDir"
    }

    // and get the soft page breaks and styles parents
    def styleParents = converter.parentStyles;
    def breaksBefore = converter.beforebreaks
    def breaksAfter = converter.afterbreaks

    // Use them to insert <pb/> and to replace the automatic styles.
    // The XML file name is the folder name without its first 6 characters
    // (the "files-" prefix, according to the edition step of this script).
    String xmlName = unzipDir.getName().substring(6)
    File xmlFile = new File(txmDir, xmlName)
    new FixBreaksAndAutomaticStyles().run(xmlFile, breaksBefore, breaksAfter, styleParents)
}
173

    
174
// Step: make the element selected by //tei:text the root of every XML file of txmDir.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FIXING TEI")
}
println "Setting new XML root element"
for (File infile : txmDir.listFiles()) {
    // Process visible regular files only.
    boolean candidate = !infile.isHidden() && !infile.isDirectory() && !infile.getName().startsWith(".")
    if (candidate) {
        print "."
        // A single failure stops the whole import.
        if (!SetNewXmlDocumentRoot.process(infile, "//tei:text")) {
            println "Failed to reset XML root $infile"
            return
        }
    }
}
println ""
187

    
188
// Step: remove from every XML file of txmDir the elements matched by the
// exclusion XPaths. "//tei:term" is always excluded, in addition to the
// XPaths configured in the import parameters (if any).
def xpaths = params.getExcludeXpaths()
if (xpaths == null) xpaths = ["//tei:term"]
else xpaths << "//tei:term"

// The list is never null at this point, so the former null guard was dead
// code. The message is printed once, after the list is complete; it used to
// be printed a first time before the configured XPaths were known.
println "Filtering XML files with xpaths: $xpaths"
for (File infile : txmDir.listFiles()) {
    // Skip hidden files, dot-files and directories.
    if (infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")) continue;
    print "."
    // A single failure stops the whole import.
    if (!RemoveTag.xpath(infile, xpaths)) {
        println "Failed to filter $infile"
        return
    }
}
println ""
206

    
207
// Step: load the text metadata from metadata.csv when the file exists.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "READ METADATA")
}
Metadatas metadatas; // text metadata, stays null when there is no metadata file
if (!allmetadatasfile.exists()) {
    println "no $allmetadatasfile metadata file found."
} else {
    // The metadata are read from a copy stored in binDir, not from the source file.
    File copy = new File(binDir, "metadata.csv")
    if (!FileCopy.copy(allmetadatasfile, copy)) {
        println "Error: could not create a copy of the metadata.csv file "+allmetadatasfile.getAbsoluteFile();
        return;
    }
    def csvEncoding = Toolbox.getParam(Toolbox.METADATA_ENCODING)
    def csvColSeparator = Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR)
    def csvTxtSeparator = Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR)
    metadatas = new Metadatas(copy, csvEncoding, csvColSeparator, csvTxtSeparator, 1)
}

// Remove hidden files and dot-files from txmDir before the importer runs.
for (File infile : txmDir.listFiles()) {
    if (infile.isHidden() || infile.getName().startsWith(".")) {
        infile.delete();
    }
}
227

    
228
// Step: run the XML importer on the prepared files.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"

def imp = new importer();
imp.doValidation(true) // pass false to skip the XML validation
imp.doTokenize(true)   // pass false to skip the tokenization

boolean imported = imp.run(srcDir, binDir, txmDir, basename, "", lang) ? true : false
if (!imported) {
    println "import process stopped";
    return;
}
238

    
239
// Step: inject the CSV metadata into each XML file of txmDir. Skipped when no
// metadata file was loaded.
if (metadatas != null) {
    if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "INJECTING METADATA")
    }
    println "-- INJECTING METADATA - from csv file: "+allmetadatasfile

    println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
    for (File infile : txmDir.listFiles()) {
        // Skip hidden files, dot-files and directories.
        if (infile.isHidden() || infile.isDirectory() || infile.getName().startsWith(".")) continue;
        print "."

        // The result is written to a temporary sibling file, which then
        // takes the place of the original file.
        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
        if (metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
            boolean swapped = infile.delete() && outfile.renameTo(infile)
            if (!swapped) println "Warning can't rename file "+outfile+" to "+infile
            if (!infile.exists()) {
                println "Error: could not replace $infile by $outfile"
                return false;
            }
        } else {
            // Injection failed: keep the original file and drop the temporary one.
            outfile.delete();
        }
    }
    println ""
}
262

    
263
// Stop silently when txmDir holds nothing to annotate or compile.
List<File> files = txmDir.listFiles()
if (!files) {
    return;
}

// Step: run the annotation tools when the corpus asks for it.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"
// True only when annotation was requested and Annotate.run() reported success;
// Annotate is not run at all when annotation is disabled.
boolean annotationSuccess = annotate && new Annotate().run(binDir, txmDir, model+".par");
275

    
276
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
277
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
278
println "-- COMPILING - Building Search Engine indexes"
279
def c = new compiler();
280
if(debug) c.setDebug();
281
//c.setCwbPath("~/TXM/cwb/bin");
282
c.setOptions(textSortAttribute, normalizeMetadata);
283
c.setAnnotationSuccess(annotationSuccess)
284
c.setLang(lang);
285
if (!c.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)) {
286
        println "import process stopped";
287
        return;
288
}
289

    
290
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML/<corpus> directory is always recreated empty, whether or not an
// edition is built afterwards.
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
if (build_edition) {

    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(25, "EDITION")

    File outdir = new File(binDir,"/HTML/$corpusname/default/");
    outdir.mkdirs();
    // Sorted so that the texts are paginated in a stable order.
    List<File> filelist = txmDir.listFiles();
    Collections.sort(filelist);

    println "Paginating texts: "

    for (File txmFile : filelist) {
        // Skip hidden files, dot-files and directories.
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."

        // Text name = file name without its last extension.
        String txtname = txmFile.getName();
        int extIndex = txtname.lastIndexOf(".");
        if (extIndex > 0) txtname = txtname.substring(0, extIndex);

        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        Element text = params.addText(corpusElem, txtname, txmFile);

        // paginationElement ("pb") is the setting declared with the other
        // import options; the literal used to be repeated here.
        def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, paginationElement);
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // Register one page per page file: pages are numbered from 1 and
        // each is paired with the entry of ed.getIdx() at the same position.
        for (int pageIndex = 0; pageIndex < ed.getPageFiles().size(); pageIndex++) {
            String wordid = ed.getIdx().get(pageIndex);
            params.addPage(edition, "" + (pageIndex + 1), wordid);
        }
    }

    // Move each document stylesheet next to the HTML pages as <text name>.css.
    // listFiles() returns null when docfiles could not be listed; treat that
    // as "no data folder".
    for (File unzipDir : (docfiles.listFiles() ?: [])) {
        String dirName = unzipDir.getName();
        // Data folders are named "files-<text name>.xml"; anything else is
        // ignored instead of failing in substring().
        if (!dirName.startsWith("files-")) continue;
        int dot = dirName.lastIndexOf(".")
        // remove "files-" and ".xml"
        String textname = (dot >= 6) ? dirName.substring(6, dot) : dirName.substring(6)

        File css = new File(unzipDir, "style.css")
        File newcss = new File(outdir, textname+".css")
        // The stylesheet is missing when its generation failed earlier;
        // only report a move that was possible and did not happen.
        if (css.exists() && !css.renameTo(newcss)) {
            println "Warning: could not move $css to $newcss"
        }
    }

    // Shared stylesheet, copied from the user's TXM home. The copy target is
    // now a local variable: it used to be an undeclared name, which created
    // a stray variable in the script binding.
    File doc = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "css/doc.css")
    File docCopy = new File(outdir, "doc.css")
    if (!FileCopy.copy(doc, docCopy)) {
        println "Warning: could not copy $doc to $docCopy"
    }

    // copy images : images externally linked must be added manually in the html/CORPUS/default directory
    println "Copying internal images..."
    for (File txmFile : filelist) {
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."
        File picturesDirectory = new File(binDir, "docfiles/files-"+txmFile.getName()+"/Pictures")
        // Printed in debug mode only, so the dot progress line stays readable.
        if (debug) println picturesDirectory
        if (picturesDirectory.exists()) {
            // All texts share the same "Pictures" directory in the edition.
            File picturesDirectory_copy = new File(outdir, "Pictures")
            if (!FileCopy.copyFiles(picturesDirectory, picturesDirectory_copy)) {
                println "Failed to copy $picturesDirectory to $picturesDirectory_copy"
            }
        }
    }
    println ""
}
358

    
359
// Final step: save the import parameters in the binary corpus directory.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// Undeclared on purpose: the assignment stores the flag in the script binding.
// NOTE(review): presumably read by the caller to know the corpus can be loaded - confirm.
readyToLoad = true;
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true;