Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / macroproto / importer / XTZImporterMacro.groovy @ 187

History | View | Annotate | Download (14.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2013-05-15 14:30:48 +0200 (mer., 15 mai 2013) $
25
// $LastChangedRevision: 2392 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.macroproto.importer;
29

    
30
import groovy.transform.Field
31

    
32
import javax.xml.stream.*
33

    
34
import org.kohsuke.args4j.*
35
import org.txm.*
36
import org.txm.importer.ApplyXsl2
37
import org.txm.importer.ValidateXml
38
import org.txm.importer.xml.compiler
39
import org.txm.importer.xml.importer
40
import org.txm.importer.xml.pager
41
import org.txm.metadatas.*
42
import org.txm.objects.*
43
import org.txm.rcpapplication.commands.*
44
import org.txm.rcpapplication.commands.workspace.*
45
import org.txm.rcpapplication.swt.widget.parameters.*
46
import org.txm.scripts.teitxm.*
47
import org.txm.tokenizer.TokenizerClasses
48
import org.txm.utils.*
49
import org.txm.utils.i18n.*
50
import org.txm.utils.xml.DomUtils
51
import org.w3c.dom.Element
52

    
53
String userDir = System.getProperty("user.home");

// BEGINNING OF PARAMETERS
// Source directory of the corpus: the script reads its import.xml file, its XML sources
// and, when present, its metadata.csv, import.properties and img/ directory.
@Field @Option(name="srcDirectory", usage="the directory containing the import.xml file and the XML source files to import", widget="Folder", required=true, def="srcDirectory")
File srcDirectory

// When true, EditionUpdaterMacro is called at the end of the import with:
// xslEdition=txm-edition-xtz.xsl, xslPages=txm-edition-page-split.xsl, editionname=default, useTokenizedDirectory=false
@Field @Option(name="SpannedEdition", usage="edition xsl", widget="Boolean", required=false, def="true")
Boolean SpannedEdition

// When true, AddFacsAttributeMacro is called before the importer step and FacsEditionBuilderMacro after the default edition is built.
// AddFacsAttribute arguments: imageDirectory=<sources>/img, outputDirectory=<binary corpus>/facs_src, element=<pagination element>, attribute=facs, prefix=mediaPath
// FacsEditionBuilder arguments: editionName=facsEditionName, tag=<pagination element>, attribute=facs
@Field @Option(name="synoptic", usage="syn", widget="Boolean", required=false, def="true")
Boolean synoptic

// Prefix given to AddFacsAttributeMacro; a trailing "/" is appended by the script when missing
@Field @Option(name="mediaPath", usage="absolute or relative path to images directory", widget="String", required=false, def="img")
String mediaPath

// Name of the edition built by FacsEditionBuilderMacro
@Field @Option(name="facsEditionName", usage="facs edition name to create", widget="String", required=false, def="facs")
String facsEditionName
75

    
76
// Open the parameters input dialog box; the script stops when open() returns false
if (!ParametersDialog.open(this)) {
    return
}

// mediaPath is used as a prefix: make sure it ends with a slash
if (!mediaPath.endsWith("/")) {
    mediaPath = mediaPath + "/"
}

// Echo the parameter values, one per line
println "Parameters: "
["srcDirectory": srcDirectory, "SpannedEdition": SpannedEdition, "synoptic": synoptic, "mediaPath": mediaPath, "facsEditionName": facsEditionName].each { label, value ->
    println "        $label: $value"
}
87

    
88
// NOTE(review): MONITOR is declared but never assigned in this script, so every
// "MONITOR != null" guard below evaluates to false. The macros called later receive
// the `monitor` variable instead - confirm whether MONITOR was meant to reference it.
def MONITOR;
boolean debug = false;

// The import parameters are read from <srcDirectory>/import.xml; nothing can be done without it
File importFile = new File(srcDirectory, "import.xml")
if (!importFile.exists()) {
    println "No import.xml file found in $srcDirectory"
    println "Aborting."
    return;
}

// Load the parameters and use the selected directory as the import root.
// (The former "params == null" test was removed: it came after params had already been
// dereferenced and a freshly constructed object can never be null.)
BaseParameters params = new BaseParameters(importFile);
params.load()
params.rootDir = srcDirectory.getAbsolutePath();
TokenizerClasses.loadFromNode(params.getTokenizerElement(params.getCorpusElement()));
101

    
102
// Corpus settings read from the loaded import parameters
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir
String lang = corpusElem.getAttribute("lang")
String encoding = corpusElem.getAttribute("encoding")
String model = lang
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// Source directory and binary corpus directory (<TXM home>/corpora/<basename>)
File srcDir = new File(rootDir)
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/"+basename)

// The binary corpus directory is deleted then re-created
binDir.deleteDir()
binDir.mkdirs()
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return
}

// Directory receiving the files handled by the importer, annotation and compiler steps
File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
125

    
126
File propertyFile = new File(rootDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;

// Values used when import.properties is missing or does not define the key
String textSortAttribute = null;
String paginationElement = "pb";
boolean normalizeMetadata = false;
String ignoredElements = null;
boolean stopIfMalformed = false;

println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
    // withReader closes the reader even when Properties.load throws
    propertyFile.withReader("UTF-8") { reader -> props.load(reader) }

    // each key overrides its default only when it is present in the file
    String sortmetadata = props.getProperty("sortmetadata")
    if (sortmetadata != null) {
        textSortAttribute = sortmetadata
    }
    String editionpage = props.getProperty("editionpage")
    if (editionpage != null) {
        paginationElement = editionpage
    }
    String normalizemetadata = props.getProperty("normalizemetadata")
    if (normalizemetadata != null) {
        normalizeMetadata = Boolean.parseBoolean(normalizemetadata)
    }
    String ignoredelements = props.getProperty("ignoredelements")
    if (ignoredelements != null) {
        ignoredElements = ignoredelements
    }
    String stopifmalformed = props.getProperty("stopifmalformed")
    if (stopifmalformed != null) {
        stopIfMalformed = Boolean.parseBoolean(stopifmalformed)
    }

    println "import properties: "
    println " sort metadata: "+textSortAttribute
    println " edition page tag: "+paginationElement
    println " normalize attributes: "+normalizeMetadata
    println " ignored elements: "+ignoredElements
    println " stop if a XML source is malformed: "+stopIfMalformed
}
159

    
160
File allmetadatasfile = new File(srcDir, "metadata.csv");

// Apply XSL
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "APPLYING XSL")
boolean xslDeclared = xsl != null && xslParams != null && xsl.trim().length() > 0
if (xslDeclared) {
    File xslOutputDir = new File(binDir, "src")
    // The source directory is switched to the XSL output only when processImportSources
    // returns true; otherwise the import goes on with the untouched sources.
    // NOTE(review): a former "return" on error was commented out here - confirm this is intended.
    if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir, xslParams)) {
        srcDir = xslOutputDir;
    }
    println ""
}
171

    
172
// Script engine used to run the edition macros, rooted at <user home>/TXM/scripts/macro/
String txmScripts = System.getProperty("user.home")+"/TXM/scripts"
String[] roots = [ txmScripts+"/macro/" ];
GroovyScriptEngine gse = new GroovyScriptEngine(roots, monitor.getClass().getClassLoader());

// Every *.jar file of <user home>/TXM/scripts/lib is added to the engine class loader
File jardir = new File(txmScripts+"/lib"); //$NON-NLS-1$
if (jardir.exists() && jardir.isDirectory()) {
    FilenameFilter jarsOnly = { File dir, String name -> name.endsWith(".jar") } as FilenameFilter //$NON-NLS-1$
    for (File jar : jardir.listFiles(jarsOnly)) {
        gse.getGroovyClassLoader().addURL(jar.toURI().toURL());
    }
}
184
// Call AddFacsAttribute - must be called before starting the importer step
// Arguments: imageDirectory=<srcDir>/img, outputDirectory=<binDir>/facs_src, element=paginationElement, attribute=facs, prefix=mediaPath
if (synoptic) {
    println "-- Adding the $paginationElement@facs attributes"

    // directory given to the macro as its output directory
    File srcDirTemp = new File(binDir, "facs_src")
    srcDirTemp.mkdir()

    def facsArgs = [
        "monitor": monitor,
        "sourceDirectory": srcDir,
        "imageDirectory": new File(srcDir, "img"),
        "outputDirectory": srcDirTemp,
        "element": paginationElement,
        "attribute": "facs",
        "prefix": mediaPath
    ]
    Binding facsBinding = new Binding()
    facsBinding.setVariable("args", facsArgs)
    gse.run "org/txm/macro/edition/AddFacsAttributeMacro.groovy", facsBinding

    // the following steps read their sources from the macro output directory
    println "switching srcDir to $srcDirTemp"
    srcDir = srcDirTemp;
}
196

    
197
// copy xml+dtd files
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
if (srcfiles != null) {
    // check XML format, and copy file into txmDir
    Iterator<File> candidates = srcfiles.iterator()
    while (candidates.hasNext()) {
        File f = candidates.next()
        String name = f.getName()
        // import parameters, metadata and properties files are not corpus sources:
        // they are removed from srcfiles, which is later handed to the compiler
        if (name.equals("import.xml") || name.equals("metadata.csv") || name.endsWith(".properties")) {
            candidates.remove()
            continue
        }
        if (ValidateXml.test(f)) {
            FileCopy.copy(f, new File(txmDir, f.getName()));
        } else {
            println "Won't process file "+f;
        }
    }
}

// listFiles() returns an empty array (not null) for an existing empty directory,
// so emptiness must be tested too or the abort below never triggers
File[] copiedFiles = txmDir.listFiles()
if (copiedFiles == null || copiedFiles.length == 0) {
    println "No txm file to process"
    return;
}
219

    
220
//get metadatas values from CSV
Metadatas metadatas = null; // text metadatas, stays null when there is no metadata.csv

println "Trying to read metadatas from: "+allmetadatasfile
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (!allmetadatasfile.exists()) {
    println "no metadata file: "+allmetadatasfile
} else {
    // the CSV is read from a copy stored in the binary corpus directory
    File copy = new File(binDir, "metadata.csv")
    boolean copied = FileCopy.copy(allmetadatasfile, copy)
    if (!copied) {
        println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
        return;
    }
    String csvEncoding = Toolbox.getParam(Toolbox.METADATA_ENCODING)
    String colSeparator = Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR)
    String txtSeparator = Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR)
    metadatas = new Metadatas(copy, csvEncoding, colSeparator, txtSeparator, 1)
}
235

    
236
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(5, "IMPORTER")

// IMPORTER step: validation and tokenization are both switched on
println "-- IMPORTER - Reading source files"
def imp = new importer();
imp.doValidation(true) // change this to not validate xml
imp.doTokenize(true) // change this, to not tokenize xml
imp.setStopIfMalformed(stopIfMalformed);

boolean imported = imp.run(srcDir, binDir, txmDir, basename, ignoredElements, lang)
if (!imported) {
    println "import process stopped";
    return;
}
247

    
248
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "INJECTING METADATA")
if (metadatas != null) {
    println("-- INJECTING METADATA - "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
    for (File xmlFile : txmDir.listFiles()) {
        print "."
        // the result is written to a temporary file created next to the text
        File injected = File.createTempFile("temp", ".xml", xmlFile.getParentFile());
        if (!metadatas.injectMetadatasInXml(xmlFile, injected, "text", null)) {
            // injection reported false: drop the temporary file and keep the text as is
            injected.delete();
            continue;
        }

        // replace the text by the temporary file
        boolean replaced = xmlFile.delete() && injected.renameTo(xmlFile)
        if (!replaced) println "Warning can't rename file "+injected+" to "+xmlFile
        if (!xmlFile.exists()) {
            println "Error: could not replace $xmlFile by $injected"
            return false;
        }
    }
    println ""
}

// nothing left to process: stop silently
List<File> files = txmDir.listFiles()
if (files == null || files.size() == 0) {
    return;
}
271

    
272
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools"
// Annotate is only run when the corpus "annotate" attribute is "true"; the model file is "<lang>.par"
boolean annotationSuccess = annotate && new Annotate().run(binDir, txmDir, model+".par")

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
if (debug) {
    c.setDebug();
}
//c.setCwbPath("~/TXM/cwb/bin");
c.setOptions(textSortAttribute, normalizeMetadata);
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang);

boolean compiled = c.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
    println "import process stopped";
    return;
}
293

    
294
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "EDITION")
println "-- EDITION - Building edition"

// HTML/<corpusname> is deleted then re-created; the pages go to its "default" sub-directory
File htmlDir = new File(binDir,"HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

List<File> filelist = txmDir.listFiles();
Collections.sort(filelist);

println "Paginating texts: "+filelist
for (File txmFile : filelist) {
    print "."

    // text name = file name without its last extension
    String txtname = txmFile.getName();
    int dot = txtname.lastIndexOf(".");
    if (dot > 0) {
        txtname = txtname.substring(0, dot);
    }

    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

    // register the text, paginate it, then register its "default" edition
    Element text = params.addText(corpusElem, txtname, txmFile);
    def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 30000, basename, paginationElement);
    Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

    // pages are numbered from 1 and registered with the matching entry of ed.getIdx()
    for (int page = 0 ; page < ed.getPageFiles().size() ; page++) {
        String wordid = ed.getIdx().get(page);
        params.addPage(edition, ""+(page + 1), wordid);
    }
}
326

    
327
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) {
    MONITOR.worked(20, "FINALIZING")
}

// Save the updated parameters into the binary corpus directory
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// readyToLoad is not declared in this script: the assignment sets a script binding variable
readyToLoad = true;
331

    
332
// Call FacsEditionBuilder
// Arguments: editionName=facsEditionName, tag=paginationElement, attribute=facs
if (synoptic) {
    println "-- Building the '$facsEditionName' image edition..."
    // declared locally, like the other macro calls of this script, so that no stray
    // "b" variable is left in the script binding
    Binding facsEditionBinding = new Binding();
    facsEditionBinding.setVariable("args", ["monitor":monitor,"params":params, "binDirectory":binDir, "txmDirectory":txmDir,"corpus":srcDirectory, "editionName":facsEditionName, "tag":paginationElement, "attribute":"facs"]);
    gse.run "org/txm/macro/edition/FacsEditionBuilderMacro.groovy", facsEditionBinding

    //copy images into the binary corpus, unless mediaPath is an http(s) URL
    // NOTE(review): this path uses basename whereas the default edition uses HTML/<corpusname> - confirm both names match
    if (mediaPath.length() > 0 && !mediaPath.startsWith("http")) {
        File binImageDirectory = new File(binDir,"HTML/$basename/$facsEditionName/$mediaPath")
        println "copying images into binary corpus: $binImageDirectory ..."
        FileCopy.copyFiles(new File(srcDirectory, "img"), binImageDirectory)
        println "copy done"
    } else {
        println "Note: the $facsEditionName edition images are not embedded in the binary corpus"
    }
}
350

    
351
// Call EditionUpdater
// Arguments: xslEdition=txm-edition-xtz.xsl, xslPages=txm-edition-page-split.xsl, editionname=default, useTokenizedDirectory=false
if (SpannedEdition) {
    println "-- Building Spanned edition..."

    def updaterArgs = [
        "monitor": monitor,
        "params": params,
        "binDirectory": binDir,
        "txmDirectory": txmDir,
        "corpus": srcDirectory,
        "xslEdition": "txm-edition-xtz.xsl",
        "xslPages": "txm-edition-page-split.xsl",
        "editionname": "default",
        "useTokenizedDirectory": false
    ]
    Binding updaterBinding = new Binding();
    updaterBinding.setVariable("args", updaterArgs);
    gse.run "org/txm/macro/edition/EditionUpdaterMacro.groovy", updaterBinding
}
360
// // create the ".txm" file
361
// try {
362
//         File exportzip = new File(binDir.getParent(), binDir.getName()+".txm")
363
//         println "creating the $exportzip binary corpus file..."
364
//         Zip.compress(binDir, exportzip);
365
//         println "done."
366
// } catch (IOException e) {
367
//         println "Error during binary file creation: $e"
368
// }
369

    
370
// load corpus
println "Loading corpus..."
try {
    LoadBinaryCorpus.loadBase(binDir)
} catch (Exception e777) {
    // LoadBinaryCorpus does not exist if TXM version is < 0.7.7: fall back to AddBases
    AddBases.loadBase(binDir, monitor)
}

Toolbox.restartWorkspace();
Toolbox.restartSearchEngine();

// The views are reloaded through monitor.syncExec
Runnable reloadViews = new Runnable() {
    @Override
    public void run() {
        println "Reloading corpora view..."
        RestartTXM.reloadViews();
        println "import done."
    }
};
monitor.syncExec(reloadViews);