Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / doc / docLoader.groovy @ 1488

History | View | Annotate | Download (11.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
25
// $LastChangedRevision: 2174 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.doc;
29

    
30
import org.txm.scripts.importer.*;
31
import javax.xml.stream.XMLStreamReader;
32
import org.txm.importer.ApplyXsl2;
33
import org.txm.importer.ValidateXml;
34
import org.txm.scripts.importer.xml.importer;
35
import org.txm.scripts.importer.xml.compiler;
36
import org.txm.objects.*;
37
import org.txm.tokenizer.TokenizerClasses;
38
import org.txm.utils.*;
39
import org.txm.utils.io.*;
40
import org.txm.*;
41
import org.txm.core.preferences.TBXPreferences
42
import org.txm.core.engines.*;
43
import org.txm.importer.scripts.xmltxm.*;
44
import org.txm.utils.i18n.*;
45
import org.txm.metadatas.*;
46
import javax.xml.stream.*;
47
import org.w3c.dom.Element;
48
import org.txm.utils.xml.DomUtils;
49
import org.txm.scripts.importer.RemoveTag;
50
import org.txm.scripts.doc.*;
51

    
52
// ---- Import set-up: read the parameters, prepare the directories, list the sources ----

// Not referenced again in the visible part of this script.
String userDir = System.getProperty("user.home");

def MONITOR;
Project project;

// "projectBinding" and "monitor" are not declared in this script; they are looked up
// as script variables. When one of them is missing the lookup throws: the exception is
// deliberately ignored and the null checks (here for the project, before each
// MONITOR call for the monitor) take over.
try {
    project = projectBinding;
    MONITOR = monitor;
} catch (Exception ignored) {
}
if (project == null) { println "no project set. Aborting"; return; }

// Import parameters read from the project
String corpusname = project.getName();
String basename = corpusname
String rootDir = project.getSrcdir();
String lang = project.getLang()
String model = lang
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL();
def xslParams = project.getXsltParameters();
int wordsPerPage = project.getEditionDefinition("default").getWordsPerPage()
String page_element = project.getEditionDefinition("default").getPageElement()
boolean build_edition = project.getEditionDefinition("default").getBuildEdition()

// Source directory and project (binary) directory
File srcDir = new File(rootDir);
File binDir = project.getProjectDirectory();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return;
}

// Working directory of the XML files: always rebuilt from scratch
File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

String textSortAttribute = null; // a property in metadata.csv file
String paginationElement = "pb";
boolean normalizeMetadata = true;

// File.listFiles() returns null when the path is not a listable directory.
// Both the working directory and the source directory must be listable before
// going further, otherwise iterating over the source entries below would fail.
File[] sourceEntries = srcDir.listFiles();
if (txmDir.listFiles() == null || sourceEntries == null) {
    println "No file to process"
    return;
}

File allMetadataFile = Metadatas.findMetadataFile(srcDir);

// Keep the visible, regular files whose lower-cased name has a word processor
// extension. Any other entry (*.properties, import.xml, *.csv, ...) is left out
// because its name does not end with one of these extensions.
def acceptedExtensions = [".doc", ".docx", ".odt", ".rtf"];
def srcfiles = [];
for (File f : sourceEntries) {
    if (f.isDirectory() || f.isHidden()) continue;
    String name = f.getName().toLowerCase();
    if (acceptedExtensions.any { name.endsWith(it) }) {
        srcfiles.add(f)
    }
}

// size() never returns null: test for emptiness to detect a source directory
// that contains no document to import.
if (srcfiles.isEmpty()) {
    println "No file to process"
    return;
}
114

    
115
// Convert the source documents to TEI files written in txmDir.
// The import stops when the monitor is canceled or when the conversion reports a failure.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "CREATING TEI FILES")
println "-- CONVERTER - Converting source files"
// the "xsl" directory of the TXM home directory is handed to the converter
File xsldir = new File(Toolbox.getTxmHomePath(), "xsl")
if (!DocumentToTei.processFiles(srcfiles, txmDir, xsldir)) {
    println "Abort import"
    return;
}
println ""
125

    
126
// Move the data folders found in txmDir to binDir/docfiles, build a css file for each
// of them and patch the matching XML file (page breaks and automatic styles).
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "RETRIEVE STYLES")
println "Retrieving data folders and style files"
println txmDir.listFiles()
File docfiles = new File(binDir, "docfiles");
docfiles.mkdir()

for (File infile : txmDir.listFiles()) {
    if (!infile.isDirectory()) continue; // only the data folders are handled here

    File unzipDir = new File(docfiles, infile.getName())
    // File.renameTo() reports a failure through its return value only: make it visible
    if (!infile.renameTo(unzipDir)) {
        println "WARNING: Failed to move $infile to $unzipDir"
    }

    StylesToCSS converter = new StylesToCSS(unzipDir);
    if (!converter.process(new File(unzipDir, "style.css"))) {
        println "WARNING: Failed to build css file of $unzipDir"
    }
    // and get the soft page breaks and styles parents
    def parentStyles = converter.parentStyles;
    def beforebreaks = converter.beforebreaks
    def afterbreaks = converter.afterbreaks

    // use them to insert <pb/> elements and to replace the automatic styles.
    // The XML file name is the folder name without its first 6 characters
    // (presumably a "files-" prefix - TODO confirm against the converter output).
    File xmlFile = new File(txmDir, unzipDir.getName().substring(6))
    new FixBreaksAndAutomaticStyles().run(xmlFile, beforebreaks, afterbreaks, parentStyles)
}
159

    
160
// Replace the root of each XML file of txmDir with the element selected by "//tei:text".
// Hidden files, directories and dot files are skipped; the first failure stops the import.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FIXING TEI")
}
println "Setting new XML root element"
for (File infile : txmDir.listFiles()) {
    boolean processable = !(infile.isHidden() || infile.isDirectory() || infile.getName().startsWith("."))
    if (processable) {
        print "." // one dot per processed file
        if (!SetNewXmlDocumentRoot.process(infile, "//tei:text")) {
            println "Failed to reset XML root $infile"
            return
        }
    }
}
println ""
173

    
174
// Filter the XML files of txmDir with the XPaths listed in "xpaths"
// (RemoveTag.xpath is called once per file; a failure stops the import).
println "Filtering XML files with xpaths: [//tei:term]"
def xpaths = ["//tei:term"]

if (xpaths != null) {
    println "Filtering XML files with xpaths: $xpaths"
    for (File infile : txmDir.listFiles()) {
        boolean processable = !(infile.isHidden() || infile.isDirectory() || infile.getName().startsWith("."))
        if (processable) {
            print "." // one dot per processed file
            if (!RemoveTag.xpath(infile, xpaths)) {
                println "Failed to filter $infile"
                return
            }
        }
    }
    println ""
}
190

    
191
// Read the text metadata: the metadata file is first copied into binDir, then the
// copy is loaded. "metadatas" stays null when the metadata file does not exist.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "READ METADATA")
}
Metadatas metadatas = null; // text metadata
if (!allMetadataFile.exists()) {
    println "no $allMetadataFile metadata file found."
} else {
    File copy = new File(binDir, allMetadataFile.getName())
    if (!FileCopy.copy(allMetadataFile, copy)) {
        println "Error: could not create a copy of the $allMetadataFile file "+allMetadataFile.getAbsoluteFile();
        return;
    }
    def metadataEncoding = Toolbox.getMetadataEncoding()
    def columnSeparator = Toolbox.getMetadataColumnSeparator()
    def textSeparator = Toolbox.getMetadataTextSeparator()
    metadatas = new Metadatas(copy, metadataEncoding, columnSeparator, textSeparator, 1)
}

// Remove the hidden files and the dot files from the working directory
for (File infile : txmDir.listFiles()) {
    if (infile.isHidden() || infile.getName().startsWith(".")) {
        infile.delete();
    }
}
213

    
214
// Run the XML importer on the working directory, with validation and tokenization
// switched on. The import stops when the importer reports a failure.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"
def imp = new importer();
imp.doValidation(true) // pass false to skip the XML validation
imp.doTokenize(true) // pass false to skip the tokenization
if (!imp.run(srcDir, binDir, txmDir, basename, "", lang)) {
    println "import process stopped";
    return;
}
224

    
225
// Inject the metadata read from the CSV file into each XML file of txmDir.
// Each file is rewritten into a temporary sibling file that then replaces it.
if (metadatas != null) {
    if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "INJECTING METADATA")
    }
    println "-- INJECTING METADATA - from csv file: "+allMetadataFile

    println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
    for (File infile : txmDir.listFiles()) {
        boolean processable = !(infile.isHidden() || infile.isDirectory() || infile.getName().startsWith("."))
        if (!processable) continue;
        print "." // one dot per processed file

        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
        if (metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
            // swap the original file with the injected one
            boolean replaced = infile.delete() && outfile.renameTo(infile)
            if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
            if (!infile.exists()) {
                println "Error: could not replace $infile by $outfile"
                return false;
            }
        } else {
            // injection failed: drop the temporary file and keep the original one
            outfile.delete();
        }
    }
    println ""
}
248

    
249
// Stop here when the working directory holds no file at all
List<File> files = txmDir.listFiles()
if (files == null || files.isEmpty()) {
    return;
}

if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "ANNOTATE")
}

// Optional annotation step: the "TreeTagger" annotation engine processes txmDir.
// annotationSuccess is true only when the annotation was requested and succeeded.
boolean annotationSuccess = false;
if (annotate) {
    println "-- ANNOTATE - Running NLP tools"
    def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
    annotationSuccess = engine.processDirectory(txmDir, binDir, ["lang":model]) ? true : false;
}
265

    
266
// Compile the XML files of txmDir with the XML compiler.
// The import stops when the compiler reports a failure.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();

// "debug" is not declared in this script: it is only available when the caller
// supplies it. Fall back to "no debug" instead of failing when it is missing
// (same approach as the one used for "projectBinding" and "monitor").
boolean debugRequested = false;
try {
    debugRequested = debug;
} catch (MissingPropertyException ignored) {
}
if (debugRequested) c.setDebug();

//c.setCwbPath("~/TXM/cwb/bin");
c.setOptions(textSortAttribute, normalizeMetadata);
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang);
if (!c.run(project, binDir, txmDir, corpusname, null, srcfiles, metadatas)) {
    println "import process stopped";
    return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
281

    
282
// Reset the HTML directory of the corpus, then, when the "default" edition must be
// built: paginate each text, publish the css files and copy the internal images.
File htmlDir = new File(binDir, "HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(25, "EDITION")

    File outdir = new File(binDir,"/HTML/$corpusname/default/");
    outdir.mkdirs();
    List<File> filelist = txmDir.listFiles();
    Collections.sort(filelist);

    println "Paginating texts: "

    for (File txmFile : filelist) {
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."

        // text name = file name without its extension
        String txtname = txmFile.getName();
        int dot = txtname.lastIndexOf(".");
        if (dot > 0) txtname = txtname.substring(0, dot);

        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        Text t = new Text(project);
        t.setName(txtname);
        t.setSourceFile(txmFile)
        t.setTXMFile(txmFile)

        def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, "pb");
        Edition edition = new Edition(t);
        edition.setName("default");
        edition.setIndex(outdir.getAbsolutePath());

        // Declare one page per page file, numbered from 1. The word id comes from the
        // pager index when it has an entry for that page, and defaults to "w_0".
        int pageCount = ed.getPageFiles().size();
        for (int p = 0 ; p < pageCount ; p++) {
            String wordid = "w_0";
            if (p < ed.getIdx().size()) wordid = ed.getIdx().get(p);
            edition.addPage(""+(p + 1), wordid);
        }
    }

    // Publish the css file of each data folder as <text name>.css in the edition directory
    for (File unzipDir : (docfiles.listFiles() ?: [])) {
        String textname = unzipDir.getName();
        int extension = textname.lastIndexOf(".");
        // skip the entries that are not data folders or whose name cannot be split
        // (substring(6, extension) needs an extension dot at index 6 or later)
        if (!unzipDir.isDirectory() || extension < 6) continue;

        File css = new File(unzipDir, "style.css")
        textname = textname.substring(6, extension) // remove "files-" and ".xml"
        File newcss = new File(outdir, textname+".css")
        if (!css.renameTo(newcss)) {
            println "WARNING: Failed to move $css to $newcss"
        }
    }
    // declared locally so that it does not become a variable of the script binding
    File doc = new File(Toolbox.getTxmHomePath(), "css/doc.css")
    File doc_copy = new File(outdir, "doc.css")
    FileCopy.copy(doc, doc_copy);

    // copy images : images externally linked must be added manually in the html/CORPUS/default directory
    println "Copying internal images..."
    for (File txmFile : filelist) {
        if (txmFile.isHidden() || txmFile.isDirectory() || txmFile.getName().startsWith(".")) continue;
        print "."
        File picturesDirectory = new File(binDir, "docfiles/files-"+txmFile.getName()+"/Pictures")
        println picturesDirectory
        if (picturesDirectory.exists()) {
            File picturesDirectory_copy = new File(outdir, "Pictures")
            FileCopy.copyFiles(picturesDirectory, picturesDirectory_copy)
            if (!picturesDirectory_copy.exists()) {
                println "Failed to copy $picturesDirectory to $picturesDirectory_copy"
            }
        }
    }
    println ""
}
354

    
355
// Last step: save the project. The result is stored in "readyToLoad", which is
// intentionally left undeclared so that it is set as a script variable.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FINALIZING")
}

readyToLoad = project.save();