Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macroproto / importer / XTZImporterMacro.groovy @ 499

History | View | Annotate | Download (14.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2013-05-15 14:30:48 +0200 (mer., 15 mai 2013) $
25
// $LastChangedRevision: 2392 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.macroproto.importer;
29

    
30
import javax.xml.stream.XMLStreamReader;
31

    
32
import org.txm.sw.RemoveTag;
33
import org.txm.importer.ApplyXsl2;
34
import org.txm.importer.ValidateXml;
35
import org.txm.importer.xml.importer;
36
import org.txm.importer.xml.compiler;
37
import org.txm.importer.xml.pager;
38
import org.txm.objects.*;
39
import org.txm.tokenizer.TokenizerClasses;
40
import org.txm.utils.*;
41
import org.txm.*;
42
import org.txm.scripts.teitxm.*;
43
import org.txm.utils.i18n.*;
44
import org.txm.metadatas.*;
45
import javax.xml.stream.*;
46
import org.w3c.dom.Element
47
import org.txm.utils.xml.DomUtils;
48

    
49
import org.txm.macro.edition.EditionUpdaterMacro
50
import org.txm.macro.edition.AddFacsAttributeMacro
51
import org.txm.macro.edition.FacsEditionBuilderMacro
52

    
53
import org.kohsuke.args4j.*
54
import groovy.transform.Field
55
import org.txm.rcp.swt.widget.parameters.*
56

    
57
import groovy.lang.Binding;
58
import groovy.util.GroovyScriptEngine;
59
import org.txm.rcp.commands.workspace.*
60
import org.txm.rcp.commands.*
61

    
62
// Home directory of the current user.
String userDir = System.getProperty("user.home")

// BEGINNING OF PARAMETERS

// Source directory; the script expects to find an "import.xml" file in it.
@Field @Option(name="srcDirectory", usage="the directory containing the DOC/ODT/RTF files to convert", widget="Folder", required=true, def="srcDirectory")
File srcDirectory

// When true, the EditionUpdater macro is run at the end of the import with:
// xslEdition=txm-edition-xtz.xsl, xslPages=txm-edition-page-split.xsl,
// editionname=default, useTokenizedDirectory=false
@Field @Option(name="SpannedEdition", usage="edition xsl", widget="Boolean", required=false, def="true")
Boolean SpannedEdition

// When true, the AddFacsAttribute macro is run before the importer step and the
// FacsEditionBuilder macro after it.
// AddFacsAttribute receives: imageDirectory=<source dir>/img, outputDirectory=<bin dir>/facs_src,
// element=<pagination element>, attribute=facs, prefix=mediaPath
// FacsEditionBuilder receives: editionName=facsEditionName, tag=<pagination element>, attribute=facs
@Field @Option(name="synoptic", usage="syn", widget="Boolean", required=false, def="true")
Boolean synoptic

@Field @Option(name="mediaPath", usage="absolute or relative path to images directory", widget="String", required=false, def="img")
String mediaPath

@Field @Option(name="facsEditionName", usage="facs edition name to create", widget="String", required=false, def="facs")
String facsEditionName

// Open the parameters input dialog box; stop the script when it reports failure
if (!ParametersDialog.open(this)) {
	return
}

// The media path is later used as a prefix, so make sure it ends with a slash
if (!mediaPath.endsWith("/")) {
	mediaPath = mediaPath + "/"
}

// Echo the effective parameter values
println "Parameters: "
[
	srcDirectory   : srcDirectory,
	SpannedEdition : SpannedEdition,
	synoptic       : synoptic,
	mediaPath      : mediaPath,
	facsEditionName: facsEditionName
].each { label, value ->
	println "        $label: $value"
}
96

    
97
// Progress monitor handle. It is only read (never assigned) in the visible script,
// so every "MONITOR != null" test below evaluates to false unless it is set elsewhere.
def MONITOR
boolean debug = false

// The import parameters are read from the "import.xml" file of the source directory
File importFile = new File(srcDirectory, "import.xml")
if (!importFile.exists()) {
	println "No import.xml file found in $srcDirectory"
	println "Aborting."
	return
}

BaseParameters params = new BaseParameters(importFile)
// Guard moved ahead of the first dereference: it previously sat after params.load(),
// params.rootDir and params.getCorpusElement() had already been called on the object.
if (params == null) { println "no parameters. Aborting"; return }
params.load()
params.rootDir = srcDirectory.getAbsolutePath()
// Configure the tokenizer from the tokenizer element of the corpus element
TokenizerClasses.loadFromNode(params.getTokenizerElement(params.getCorpusElement()))

// Values extracted from the parameters and reused by the following import steps
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir
String lang = corpusElem.getAttribute("lang")
String model = lang // the annotation model name is derived from the language value
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true" == corpusElem.getAttribute("annotate")
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)
121

    
122
// Directory the sources are read from (may be switched to a derived directory later)
File srcDir = new File(rootDir)

// Binary corpus directory: <TXM user home>/corpora/<basename>, recreated from scratch
File txmUserCorpora = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/" + basename)
File binDir = txmUserCorpora
binDir.deleteDir()
binDir.mkdirs()
boolean binDirReady = binDir.exists()
if (!binDirReady) {
	println "Could not create binDir " + binDir
	return
}

// Directory receiving the XML-TXM files of the corpus, also recreated from scratch
File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
134

    
135
// Optional "import.properties" file located in the source root directory
File propertyFile = new File(rootDir, "import.properties") // default
Properties props = new Properties()
String[] metadatasToKeep

// Default values, used when the property file is missing or does not define the key
String textSortAttribute = null
String paginationElement = "pb"
boolean normalizeMetadata = false
String ignoredElements = null
boolean stopIfMalformed = false

println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
	// withReader closes the reader even when props.load() throws
	// (the previous explicit close() was skipped on failure)
	propertyFile.withReader("UTF-8") { reader ->
		props.load(reader)
	}

	// Each key overrides its default only when it is present in the file
	String sortValue = props.getProperty("sortmetadata")
	if (sortValue != null) {
		textSortAttribute = sortValue
	}
	String pageValue = props.getProperty("editionpage")
	if (pageValue != null) {
		paginationElement = pageValue
	}
	String normalizeValue = props.getProperty("normalizemetadata")
	if (normalizeValue != null) {
		normalizeMetadata = Boolean.parseBoolean(normalizeValue)
	}
	String ignoredValue = props.getProperty("ignoredelements")
	if (ignoredValue != null) {
		ignoredElements = ignoredValue
	}
	String malformedValue = props.getProperty("stopifmalformed")
	if (malformedValue != null) {
		stopIfMalformed = Boolean.parseBoolean(malformedValue)
	}

	println "import properties: "
	println " sort metadata: "+textSortAttribute
	println " edition page tag: "+paginationElement
	println " normalize attributes: "+normalizeMetadata
	println " ignored elements: "+ignoredElements
	println " stop if a XML source is malformed: "+stopIfMalformed
}
168

    
169
// The metadata file is looked up in the original source directory (before any switch of srcDir)
File allmetadatasfile = new File(srcDir, "metadata.csv")

// Apply XSL
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(1, "APPLYING XSL")

boolean xslRequested = xsl != null && xslParams != null && xsl.trim().length() > 0
if (xslRequested) {
	// NOTE(review): the original "if" had no braces and its "return" statement was commented
	// out ("error during process"), so the condition governs the srcDir switch below.
	// Confirm that a true result of processImportSources() is the case where srcDir must
	// be switched to the transformed sources.
	if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"), xslParams)) {
		srcDir = new File(binDir, "src")
	}
	println ""
}
180

    
181
// Script engine used to run the edition macros from <user home>/TXM/scripts/macro/
String[] roots = [ System.getProperty("user.home") + "/TXM/scripts/macro/" ]
ClassLoader parentLoader = monitor.getClass().getClassLoader()
GroovyScriptEngine gse = new GroovyScriptEngine(roots, parentLoader)

// Every ".jar" found in <user home>/TXM/scripts/lib is added to the engine class loader
File jardir = new File(System.getProperty("user.home") + "/TXM/scripts/lib") //$NON-NLS-1$
if (jardir.exists() && jardir.isDirectory()) {
	FilenameFilter jarsOnly = { File dir, String name -> name.endsWith(".jar") } as FilenameFilter //$NON-NLS-1$
	for (File jar : jardir.listFiles(jarsOnly)) {
		gse.getGroovyClassLoader().addURL(jar.toURI().toURL())
	}
}

// Call AddFacsAttribute - must be called before starting the importer step
// Values passed to AddFacsAttribute: imageDirectory=<srcDir>/img, outputDirectory=<binDir>/facs_src,
// element=<pagination element>, attribute=facs, prefix=mediaPath
if (synoptic) {
	println "-- Adding the $paginationElement@facs attributes"

	// new source directory that receives the modified source files
	File facsSourceDir = new File(binDir, "facs_src")
	facsSourceDir.mkdir()

	def macroArgs = [
		"monitor"        : monitor,
		"sourceDirectory": srcDir,
		"imageDirectory" : new File(srcDir, "img"),
		"outputDirectory": facsSourceDir,
		"element"        : paginationElement,
		"attribute"      : "facs",
		"prefix"         : mediaPath
	]
	Binding facsBinding = new Binding()
	facsBinding.setVariable("args", macroArgs)
	gse.run("org/txm/macro/edition/AddFacsAttributeMacro.groovy", facsBinding)

	// the following steps read the sources from the directory written by the macro
	println "switching srcDir to $facsSourceDir"
	srcDir = facsSourceDir
}
205

    
206
// copy xml+dtd files
if (MONITOR?.isCanceled()) { return MONITOR.done() }

// Files of the source directory. The import configuration files are removed from this
// list; the remaining entries are checked with ValidateXml and copied into txmDir when valid.
List<File> srcfiles = srcDir.listFiles()
if (srcfiles != null) {
	Iterator<File> remaining = srcfiles.iterator()
	while (remaining.hasNext()) {
		File candidate = remaining.next()
		String fileName = candidate.getName()

		boolean isConfigFile = fileName.equals("import.xml") ||
				fileName.equals("metadata.csv") ||
				fileName.endsWith(".properties")
		if (isConfigFile) {
			remaining.remove()
			continue
		}

		if (!ValidateXml.test(candidate)) {
			println "Won't process file "+candidate
		} else {
			FileCopy.copy(candidate, new File(txmDir, fileName))
		}
	}
}

// NOTE(review): File.listFiles() returns null only when the path is not a readable directory;
// an existing but empty txmDir yields an empty array and passes this test - confirm the intent.
if (txmDir.listFiles() == null) {
	println "No txm file to process"
	return
}
228

    
229
// Read the text metadata values from the CSV file (stays null when there is no such file)
Metadatas metadatas // text metadatas

println "Trying to read metadatas from: "+allmetadatasfile
if (MONITOR?.isCanceled()) { return MONITOR.done() }
if (!allmetadatasfile.exists()) {
	println "no metadata file: "+allmetadatasfile
} else {
	// work on a copy stored in the binary corpus directory
	File metadataCopy = new File(binDir, "metadata.csv")
	boolean copied = FileCopy.copy(allmetadatasfile, metadataCopy)
	if (!copied) {
		println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile()
		return
	}
	metadatas = new Metadatas(
			metadataCopy,
			Toolbox.getParam(Toolbox.METADATA_ENCODING),
			Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR),
			Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR),
			1)
}

if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(5, "IMPORTER")
println "-- IMPORTER - Reading source files"
def imp = new importer()
imp.doValidation(true) // change this to not validate xml
imp.doTokenize(true) // change this, to not tokenize xml
imp.setStopIfMalformed(stopIfMalformed)

// Stop the whole script when the importer step reports a failure
boolean importerOk = imp.run(srcDir, binDir, txmDir, basename, ignoredElements, lang)
if (!importerOk) {
	println "import process stopped"
	return
}
256

    
257
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "INJECTING METADATA")

// Metadata injection: each file of txmDir is rewritten into a temporary file which then
// replaces the original. The step is skipped when no metadata file was read.
if (metadatas != null) {
	println("-- INJECTING METADATA - "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
	for (File target : txmDir.listFiles()) {
		print "."
		File injected = File.createTempFile("temp", ".xml", target.getParentFile())

		boolean injectionOk = metadatas.injectMetadatasInXml(target, injected, "text", null)
		if (!injectionOk) {
			// nothing usable was produced: drop the temporary file and keep the original
			injected.delete()
			continue
		}

		boolean swapped = target.delete() && injected.renameTo(target)
		if (!swapped) {
			println "Warning can't rename file "+injected+" to "+target
		}
		if (!target.exists()) {
			// the original is gone and could not be replaced: abort the script
			println "Error: could not replace $target by $injected"
			return false
		}
	}
	println ""
}

// Nothing left to annotate, compile or paginate when txmDir holds no file
List<File> files = txmDir.listFiles()
if (!files) {
	return
}
280

    
281
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools"
// The annotation step only runs when the corpus asks for it; the model file name is
// "<model>.par". The outcome is forwarded to the compiler below.
boolean annotationSuccess = annotate && new Annotate().run(binDir, txmDir, model+".par")

if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def corpusCompiler = new compiler()
if (debug) {
	corpusCompiler.setDebug()
}
//corpusCompiler.setCwbPath("~/TXM/cwb/bin");
corpusCompiler.setOptions(textSortAttribute, normalizeMetadata)
corpusCompiler.setAnnotationSuccess(annotationSuccess)
corpusCompiler.setLang(lang)

// Stop the whole script when the compiler step reports a failure
boolean compiled = corpusCompiler.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
	println "import process stopped"
	return
}
302

    
303
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "EDITION")
println "-- EDITION - Building edition"

// The HTML directory of the corpus is recreated from scratch; the pages of the
// "default" edition are written into its "default" sub-directory.
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// Texts are processed in sorted file order
List<File> filelist = txmDir.listFiles();
Collections.sort(filelist);

// These two lists depend only on the corpus language, so they are fetched once
// instead of once per text.
List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

println "Paginating texts: "+filelist
for (File txmFile : filelist) {
	print "."

	// The text name is the file name without its last extension
	String txtname = txmFile.getName();
	int dot = txtname.lastIndexOf(".");
	if (dot > 0) txtname = txtname.substring(0, dot);

	Element text = params.addText(corpusElem, txtname, txmFile);

	def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 30000, basename, paginationElement);
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

	// Declare one page per generated page file: page numbers start at 1 and each page is
	// registered with the entry of ed.getIdx() found at the same position.
	for (int pageIndex = 0 ; pageIndex < ed.getPageFiles().size() ; pageIndex++) {
		String wordid = ed.getIdx().get(pageIndex);
		params.addPage(edition, String.valueOf(pageIndex + 1), wordid);
	}
}
335

    
336
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")

// Save the updated import parameters (texts, editions, pages) in the binary corpus directory
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// "readyToLoad" is not declared in this script: the assignment goes to the script binding
readyToLoad = true;

// Call FacsEditionBuilder
// Values passed: editionName=facsEditionName, tag=<pagination element>, attribute=facs
if (synoptic) {
	println "-- Building the '$facsEditionName' image edition..."
	// Declared locally, like the bindings of the other macro calls of this script
	// (it was previously assigned without declaration and leaked into the script binding)
	Binding b = new Binding();
	b.setVariable("args", ["monitor":monitor,"params":params, "binDirectory":binDir, "txmDirectory":txmDir,"corpus":srcDirectory, "editionName":facsEditionName, "tag":paginationElement, "attribute":"facs"]);
	gse.run "org/txm/macro/edition/FacsEditionBuilderMacro.groovy", b

	//copy images into the binary corpus
	// NOTE(review): mediaPath had a "/" appended earlier in this script when it was missing,
	// so its length is always > 0 here and only the "http" prefix can select the else branch.
	if (mediaPath.length() > 0 && !mediaPath.startsWith("http")) {
		// NOTE(review): this path is built with $basename whereas the HTML directory of the
		// default edition is built with $corpusname - confirm both name the same directory.
		File binImageDirectory = new File(binDir,"HTML/$basename/$facsEditionName/$mediaPath")
		println "copying images into binary corpus: $binImageDirectory ..."
		FileCopy.copyFiles(new File(srcDirectory, "img"), binImageDirectory)
		println "copy done"
	} else {
		println "Note: the $facsEditionName edition images are not embedded in the binary corpus"
	}
}
359

    
360
// Call EditionUpdater
// Values passed: corpus=source directory, xslEdition=txm-edition-xtz.xsl,
// xslPages=txm-edition-page-split.xsl, editionname=default, useTokenizedDirectory=false
if (SpannedEdition) {
	println "-- Building Spanned edition..."

	def updaterArgs = [
		"monitor"              : monitor,
		"params"               : params,
		"binDirectory"         : binDir,
		"txmDirectory"         : txmDir,
		"corpus"               : srcDirectory,
		"xslEdition"           : "txm-edition-xtz.xsl",
		"xslPages"             : "txm-edition-page-split.xsl",
		"editionname"          : "default",
		"useTokenizedDirectory": false
	]
	Binding updaterBinding = new Binding()
	updaterBinding.setVariable("args", updaterArgs)
	gse.run("org/txm/macro/edition/EditionUpdaterMacro.groovy", updaterBinding)
}
369
// // create the ".txm" file
370
// try {
371
//         File exportzip = new File(binDir.getParent(), binDir.getName()+".txm")
372
//         println "creating the $exportzip binary corpus file..."
373
//         Zip.compress(binDir, exportzip);
374
//         println "done."
375
// } catch (IOException e) {
376
//         println "Error during binary file creation: $e"
377
// }
378

    
379
// load corpus
println "Loading corpus..."
try {
	LoadBinaryCorpus.loadBase(binDir)
} catch (Exception e777) {
	// LoadBinaryCorpus does not exist if TXM version is < 0.7.7: fall back to AddBases.
	// Any other exception raised by the first call takes the same fallback.
	AddBases.loadBase(binDir, monitor)
}

// Restart the workspace, then the search engine
Toolbox.restartWorkspace()
Toolbox.restartSearchEngine()

// The views are reloaded through monitor.syncExec()
Runnable reloadViews = new Runnable() {
	@Override
	public void run() {
		println "Reloading corpora view..."
		RestartTXM.reloadViews();
		println "import done."
	}
}
monitor.syncExec(reloadViews)