Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macroproto / importer / XTZImporterMacro.groovy @ 967

History | View | Annotate | Download (14.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2013-05-15 14:30:48 +0200 (mer., 15 mai 2013) $
25
// $LastChangedRevision: 2392 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.macroproto.importer;
29

    
30
import javax.xml.stream.XMLStreamReader;
31

    
32
import org.txm.sw.RemoveTag;
33
import org.txm.importer.ApplyXsl2;
34
import org.txm.importer.ValidateXml;
35
import org.txm.importer.xml.importer;
36
import org.txm.importer.xml.compiler;
37
import org.txm.importer.xml.pager;
38
import org.txm.objects.*;
39
import org.txm.tokenizer.TokenizerClasses;
40
import org.txm.utils.*
41
import org.txm.utils.io.*;
42
import org.txm.*;
43
import org.txm.importer.xmltxm.*;
44
import org.txm.utils.i18n.*;
45
import org.txm.metadatas.*;
46
import javax.xml.stream.*;
47
import org.w3c.dom.Element
48
import org.txm.utils.xml.DomUtils;
49

    
50
import org.txm.macro.edition.EditionUpdaterMacro
51
import org.txm.macro.edition.AddFacsAttributeMacro
52
import org.txm.macro.edition.FacsEditionBuilderMacro
53

    
54
import org.kohsuke.args4j.*
55
import groovy.transform.Field
56
import org.txm.rcp.swt.widget.parameters.*
57

    
58
import groovy.lang.Binding;
59
import groovy.util.GroovyScriptEngine;
60
import org.txm.rcp.commands.workspace.*
61
import org.txm.rcp.commands.*
62

    
63
String userDir = System.getProperty("user.home");

// BEGINNING OF PARAMETERS

// Directory holding the sources; the script expects to find an "import.xml" file in it.
@Field @Option(name="srcDirectory", usage="the directory containing the DOC/ODT/RTF files to convert", widget="Folder", required=true, def="srcDirectory")
File srcDirectory

// When true, EditionUpdaterMacro is run near the end of the script with:
// corpus=srcDirectory, xslEdition=txm-edition-xtz.xsl, xslPages=txm-edition-page-split.xsl,
// editionname=default, useTokenizedDirectory=false
@Field @Option(name="SpannedEdition", usage="edition xsl", widget="Boolean", required=false, def="true")
Boolean SpannedEdition

// When true, AddFacsAttributeMacro is run before the importer step and
// FacsEditionBuilderMacro after the default edition is built.
// AddFacsAttribute receives: imageDirectory=<current source dir>/img, outputDirectory=<binDir>/facs_src,
// element=<pagination element>, attribute=facs, prefix=mediaPath
// FacsEditionBuilder receives: editionName=facsEditionName, tag=<pagination element>, attribute=facs
@Field @Option(name="synoptic", usage="syn", widget="Boolean", required=false, def="true")
Boolean synoptic

// Prefix handed to AddFacsAttributeMacro; also used as the sub-directory images are copied to
@Field @Option(name="mediaPath", usage="absolute or relative path to images directory", widget="String", required=false, def="img")
String mediaPath

// Name of the edition created by FacsEditionBuilderMacro
@Field @Option(name="facsEditionName", usage="facs edition name to create", widget="String", required=false, def="facs")
String facsEditionName

// Open the parameters input dialog box; stop the script when it reports failure
if (!ParametersDialog.open(this)) {
        return;
}

// The media path is always used with a trailing slash
if (!mediaPath.endsWith("/")) {
        mediaPath = mediaPath + "/"
}

println("Parameters: ")
println("        srcDirectory: $srcDirectory")
println("        SpannedEdition: $SpannedEdition")
println("        synoptic: $synoptic")
println("        mediaPath: $mediaPath")
println("        facsEditionName: $facsEditionName")
97

    
98
// NOTE(review): MONITOR is declared here and never assigned in the visible script, so the
// "MONITOR != null" guards of the following steps are always false; the steps that need a
// monitor use the "monitor" variable instead - confirm whether MONITOR should alias it.
def MONITOR;
boolean debug = false;

// The import parameters are read from the import.xml file of the source directory
File importFile = new File(srcDirectory, "import.xml")
if (!importFile.exists()) {
        println "No import.xml file found in $srcDirectory"
        println "Aborting."
        return;
}
BaseParameters params = new BaseParameters(importFile);
params.load()
params.rootDir = srcDirectory.getAbsolutePath();
TokenizerClasses.loadFromNode(params.getTokenizerElement(params.getCorpusElement()));

String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
if (corpusElem == null) {
        // every value below is read from this element: stop with a message instead of a NullPointerException
        println "No corpus element found for '$corpusname' in $importFile. Aborting"
        return;
}

// Values read from the parameters and from the corpus element attributes
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang // the annotation step appends ".par" to this value
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);
122

    
123
// Source directory: later steps may switch it to a transformed copy of the sources
File srcDir = new File(rootDir);

// Binary corpus directory: any previous content is deleted before it is recreated
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
boolean binDirCreated = binDir.exists()
if (!binDirCreated) {
        println "Could not create binDir "+binDir
        return;
}

// Directory receiving the files of the corpus, also recreated from scratch
File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

// Optional import.properties file of the source directory
File propertyFile = new File(rootDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;

// Option values used when import.properties is missing or does not set them
String textSortAttribute = null;
String paginationElement = "pb";
boolean normalizeMetadata = false;
String ignoredElements = null;
boolean stopIfMalformed = false;
145

    
146
// Override the default options with the values of the import.properties file, when it is readable
println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
        // withReader() closes the reader even when Properties.load() throws
        propertyFile.withReader("UTF-8") { reader -> props.load(reader) }

        // A key missing from the file leaves the corresponding default untouched
        String sortMetadataValue = props.getProperty("sortmetadata")
        if (sortMetadataValue != null) {
                textSortAttribute = sortMetadataValue
        }
        String editionPageValue = props.getProperty("editionpage")
        if (editionPageValue != null) {
                paginationElement = editionPageValue
        }
        String normalizeMetadataValue = props.getProperty("normalizemetadata")
        if (normalizeMetadataValue != null) {
                normalizeMetadata = Boolean.parseBoolean(normalizeMetadataValue)
        }
        String ignoredElementsValue = props.getProperty("ignoredelements")
        if (ignoredElementsValue != null) {
                ignoredElements = ignoredElementsValue
        }
        String stopIfMalformedValue = props.getProperty("stopifmalformed")
        if (stopIfMalformedValue != null) {
                stopIfMalformed = Boolean.parseBoolean(stopIfMalformedValue)
        }

        println "import properties: "
        println " sort metadata: "+textSortAttribute
        println " edition page tag: "+paginationElement
        println " normalize attributes: "+normalizeMetadata
        println " ignored elements: "+ignoredElements
        println " stop if a XML source is malformed: "+stopIfMalformed
}
169

    
170
// The metadata file is looked up in the source directory before any XSL is applied
File allmetadatasfile = Metadatas.findMetadataFile(srcDir);

// Apply XSL
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(1, "APPLYING XSL")
}
if (xsl != null && xslParams != null && xsl.trim().length() > 0) {
        boolean xslApplied = ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"), xslParams)
        // The sources are switched to the XSL output only when the call returns true;
        // otherwise the import goes on with the untouched source directory
        if (xslApplied) {
                srcDir = new File(binDir, "src");
        }
        println ""
}
181

    
182
// Script engine used to run the edition macros found under <user home>/TXM/scripts/macro/
String[] roots = [ System.getProperty("user.home")+"/TXM/scripts/macro/" ];
GroovyScriptEngine gse = new GroovyScriptEngine(roots, monitor.getClass().getClassLoader());

// Every .jar file of <user home>/TXM/scripts/lib is added to the engine class loader
File jardir = new File(System.getProperty("user.home")+"/TXM/scripts/lib"); //$NON-NLS-1$
if (jardir.exists() && jardir.isDirectory()) {
        FilenameFilter jarsOnly = { File dir, String name -> name.endsWith(".jar") } as FilenameFilter //$NON-NLS-1$
        for (File jar : jardir.listFiles(jarsOnly)) {
                gse.getGroovyClassLoader().addURL(jar.toURI().toURL());
        }
}
194
// Call AddFacsAttribute - must be called before starting the importer step
// Arguments: imageDirectory=<srcDir>/img, outputDirectory=<binDir>/facs_src, element=pagination element, attribute=facs, prefix=mediaPath
if (synoptic) {
        println "-- Adding the $paginationElement@facs attributes"

        // new source directory that receives the modified source files
        File facsSrcDir = new File(binDir, "facs_src")
        facsSrcDir.mkdir()

        def facsArgs = ["monitor":monitor, "sourceDirectory":srcDir, "imageDirectory":new File(srcDir, "img"), "outputDirectory":facsSrcDir, "element":paginationElement, "attribute":"facs", "prefix":mediaPath]
        Binding facsBinding = new Binding();
        facsBinding.setVariable("args", facsArgs);
        gse.run "org/txm/macro/edition/AddFacsAttributeMacro.groovy", facsBinding

        // the following steps read the sources from the new directory
        println "switching srcDir to $facsSrcDir"
        srcDir = facsSrcDir;
}
206

    
207
// copy xml+dtd files
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
if (srcfiles != null) {
        // check XML format, and copy file into txmDir
        Iterator<File> candidates = srcfiles.iterator()
        while (candidates.hasNext()) {
                File candidate = candidates.next()
                String candidateName = candidate.getName()

                // import.xml, the metadata file and the .properties files are not sources:
                // they are dropped from the list that is later given to the compiler
                boolean importResource = candidateName.equals("import.xml") || candidateName.matches("metadata\\.....?") || candidateName.endsWith(".properties")
                if (importResource) {
                        candidates.remove()
                        continue
                }

                // a rejected file stays in the list but is not copied
                if (!ValidateXml.test(candidate)) {
                        println "Won't process file "+candidate;
                } else {
                        FileCopy.copy(candidate, new File(txmDir, candidateName));
                }
        }
}

if (txmDir.listFiles() == null) {
        println "No txm file to process"
        return;
}
229

    
230
//get metadatas values from CSV
Metadatas metadatas; // text metadatas, left unset when there is no metadata file

println "Trying to read metadatas from: "+allmetadatasfile
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (!allmetadatasfile.exists()) {
        println "no metadata file: "+allmetadatasfile
} else {
        // the metadata are read from a copy stored in the binary corpus directory
        File copy = new File(binDir, allmetadatasfile.getName())
        boolean copied = FileCopy.copy(allmetadatasfile, copy)
        if (!copied) {
                println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
                return;
        }
        metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), Toolbox.getMetadataColumnSeparator(), Toolbox.getMetadataTextSeparator(), 1)
}
245

    
246
// IMPORTER step: read the source files
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"
def imp = new importer();
imp.doValidation(true) // change this to not validate xml
imp.doTokenize(true) // change this, to not tokenize xml
imp.setStopIfMalformed(stopIfMalformed);
boolean sourcesImported = imp.run( srcDir, binDir, txmDir, basename, ignoredElements, lang)
if (!sourcesImported) {
        println "import process stopped";
        return;
}
257

    
258
// INJECTING METADATA step: only done when a metadata file was loaded
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "INJECTING METADATA")
}
if (metadatas != null) {
        println("-- INJECTING METADATA - "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
        for (File infile : txmDir.listFiles()) {
                print "."
                // the result is written to a temporary file created next to the input file
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
                boolean injected = metadatas.injectMetadatasInXml(infile, outfile, "text", null)
                if (!injected) {
                        // nothing to keep: drop the temporary file and leave the input untouched
                        outfile.delete();
                        continue
                }

                // replace the input file with the temporary file
                boolean replaced = infile.delete() && outfile.renameTo(infile)
                if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
                if (!infile.exists()) {
                        println "Error: could not replace $infile by $outfile"
                        return false;
                }
        }
        println ""
}

// Stop silently when txmDir holds no file
List<File> files = txmDir.listFiles()
if (!files) {
        return;
}
281

    
282
// ANNOTATE step
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"
// Annotate is only run when the corpus element asks for it; its result is passed to the compiler
boolean annotationSuccess = annotate && new Annotate().run(binDir, txmDir, model+".par")
289

    
290
// COMPILING step
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
if (debug) {
        c.setDebug();
}
//c.setCwbPath("~/TXM/cwb/bin");
c.setOptions(textSortAttribute, normalizeMetadata);
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang);
boolean compiled = c.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
        println "import process stopped";
        return;
}
303

    
304
// EDITION step: build the "default" HTML edition and register texts, editions and pages in the parameters
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "EDITION")
println "-- EDITION - Building edition"

// the HTML directory of the corpus is recreated from scratch
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();
List<File> filelist = txmDir.listFiles();
Collections.sort(filelist);

println "Paginating texts: "+filelist
for (File txmFile : filelist) {
        print "."

        // the text name is the file name without its last extension
        String txtname = txmFile.getName();
        int extensionStart = txtname.lastIndexOf(".");
        if (extensionStart > 0) txtname = txtname.substring(0, extensionStart);

        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        Element text = params.addText(corpusElem, txtname, txmFile);

        def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 30000, basename, paginationElement);
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // one page entry per page file: pages are numbered from 1, the matching id is read at the 0-based index
        for (int page = 0 ; page < ed.getPageFiles().size() ; page++) {
                String wordid = ed.getIdx().get(page);
                params.addPage(edition, ""+(page + 1), wordid);
        }
}
336

    
337
// FINALIZING step: save the updated parameters in the binary corpus directory
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// readyToLoad is not declared in this script: the assignment goes to the script binding
readyToLoad = true;
341

    
342
// Call FacsEditionBuilder
// Arguments: editionName=facsEditionName, tag=pagination element, attribute=facs
if (synoptic) {
        println "-- Building the '$facsEditionName' image edition..."
        // declared locally so that the variable does not end up in the script binding
        Binding facsBuilderBinding = new Binding();
        facsBuilderBinding.setVariable("args", ["monitor":monitor,"params":params, "binDirectory":binDir, "txmDirectory":txmDir,"corpus":srcDirectory, "editionName":facsEditionName, "tag":paginationElement, "attribute":"facs"]);
        gse.run "org/txm/macro/edition/FacsEditionBuilderMacro.groovy", facsBuilderBinding

        //copy images into the binary corpus
        // NOTE(review): this path is built with basename while the default edition uses
        // HTML/<corpusname> - confirm that both names always match
        if (mediaPath.length() > 0 && !mediaPath.startsWith("http")) {
                File binImageDirectory = new File(binDir,"HTML/$basename/$facsEditionName/$mediaPath")
                println "copying images into binary corpus: $binImageDirectory ..."
                FileCopy.copyFiles(new File(srcDirectory, "img"), binImageDirectory)
                println "copy done"
        } else {
                println "Note: the $facsEditionName edition images are not embedded in the binary corpus"
        }
}
360

    
361
// Call EditionUpdater
// Arguments: corpus=srcDirectory, xslEdition=txm-edition-xtz.xsl, xslPages=txm-edition-page-split.xsl, editionname=default, useTokenizedDirectory=false
if (SpannedEdition) {
        println "-- Building Spanned edition..."

        def updaterArgs = ["monitor":monitor,"params":params, "binDirectory":binDir, "txmDirectory":txmDir,"corpus":srcDirectory, "xslEdition":"txm-edition-xtz.xsl", "xslPages":"txm-edition-page-split.xsl", "editionname":"default", "useTokenizedDirectory":false]
        Binding updaterBinding = new Binding();
        updaterBinding.setVariable("args", updaterArgs);
        gse.run "org/txm/macro/edition/EditionUpdaterMacro.groovy", updaterBinding
}
370
// // create the ".txm" file
371
// try {
372
//         File exportzip = new File(binDir.getParent(), binDir.getName()+".txm")
373
//         println "creating the $exportzip binary corpus file..."
374
//         Zip.compress(binDir, exportzip);
375
//         println "done."
376
// } catch (IOException e) {
377
//         println "Error during binary file creation: $e"
378
// }
379

    
380
// load corpus
println "Loading corpus..."
try {
        LoadBinaryCorpus.loadBase(binDir)
} catch(Exception e777) {
        // LoadBinaryCorpus does not exist if TXM version is < 0.7.7
        // (any other exception raised by the first call also ends up here)
        AddBases.loadBase(binDir, monitor)
}

// Restart the workspace and the search engine
Toolbox.restartWorkspace();
Toolbox.restartSearchEngine();

// Reload the views through the monitor's syncExec
monitor.syncExec(new Runnable() {
        @Override
        public void run() {
                println "Reloading corpora view..."
                RestartTXM.reloadViews();
                println "import done."
        }
});