Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xml / xmlLoader.groovy @ 187

History | View | Annotate | Download (10.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-11-29 16:47:07 +0100 (Tue, 29 Nov 2016) $
25
// $LastChangedRevision: 3349 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xml;
29

    
30
import javax.xml.stream.XMLStreamReader;
31

    
32
import org.txm.sw.RemoveTag;
33
import org.txm.importer.ApplyXsl2;
34
import org.txm.importer.ValidateXml;
35
import org.txm.importer.xml.importer;
36
import org.txm.importer.xml.compiler;
37
import org.txm.importer.xml.pager_old;
38
import org.txm.objects.*;
39
import org.txm.tokenizer.TokenizerClasses;
40
import org.txm.utils.*;
41
import org.txm.*;
42
import org.txm.scripts.teitxm.*;
43
import org.txm.utils.i18n.*;
44
import org.txm.metadatas.*;
45

    
46
import javax.xml.stream.*;
47

    
48
import org.w3c.dom.Element
49

    
50
// Home directory of the current user; base of the developer-mode paths below.
String userDir = System.getProperty("user.home");

def MONITOR; // stays null when no monitor is supplied
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
        // paramsBinding and monitor are not declared in this script: they are
        // resolved from the script binding, and a missing name raises an exception.
        params = paramsBinding;
        MONITOR = monitor;
} catch (Exception notBound) {
        // The exception means the script is being run for debugging:
        // fall back to a hard-coded developer configuration.
        println "DEV MODE";
        debug = true
        params = new BaseParameters(new File(userDir, "xml/bvh/bugnames/import.xml"))
        params.load()
        if (!org.txm.Toolbox.isInitialized()) {
                TokenizerClasses.loadFromNode(params.getTokenizerElement(params.getCorpusElement()));

                // Installation paths (Linux layout). For Windows the original script suggested
                // "C:\\Program Files\\TXM", "C:\\Program Files\\treetagger" and
                // "C:\\Program Files\\treetagger\\models" instead.
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
                Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));

                // Format of the metadata CSV file.
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");

                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
        }
}
if (params == null) {
        println "no parameters. Aborting";
        return;
}
76

    
77
// Corpus identity and its DOM element, taken from the import parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// Settings stored as attributes of the corpus element.
String lang = corpusElem.getAttribute("lang");
String model = lang // same value as lang; used later to build the "<model>.par" name
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// Optional XSL stylesheet and its parameters.
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Edition settings, all looked up under the "default" key.
int wordsPerPage = params.getWordsPerPage("default")
String page_element = params.getPageElement("default")
boolean build_edition = params.getDoEdition("default")

File srcDir = new File(rootDir);

// The binary directory of the corpus is wiped and recreated on every import.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return;
}

// Working directory receiving one XML file per text.
File txmDir = new File(binDir, "txm/" + corpusname);
txmDir.deleteDir();
txmDir.mkdirs();
103

    
104
// Optional settings are read from "import.properties" in the source root directory.
File propertyFile = new File(rootDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep; // not referenced anywhere else in the visible script

// Defaults, kept when the file or a given key is missing.
String textSortAttribute = null;
boolean normalizeMetadata = false;
String ignoredElements = null;
boolean stopIfMalformed = false;

println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
        // withReader closes the underlying stream even when load() throws,
        // which the former explicit open/load/close sequence did not.
        propertyFile.withReader("UTF-8") { reader -> props.load(reader) }

        // String options: an absent key leaves the default untouched.
        textSortAttribute = props.getProperty("sortmetadata", textSortAttribute);
        ignoredElements = props.getProperty("ignoredelements", ignoredElements);

        // Boolean options: only overridden when the key is present.
        String normalizeValue = props.getProperty("normalizemetadata");
        if (normalizeValue != null) {
                normalizeMetadata = Boolean.parseBoolean(normalizeValue);
        }
        String stopValue = props.getProperty("stopifmalformed");
        if (stopValue != null) {
                stopIfMalformed = Boolean.parseBoolean(stopValue);
        }

        println "import properties: "
        println " sort metadata: "+textSortAttribute
        println " normalize attributes: "+normalizeMetadata
        println " ignored elements: "+ignoredElements
        println " stop if a XML source is malformed: "+stopIfMalformed
}
133

    
134
// The metadata table is located in the source directory as it is at this point,
// i.e. before srcDir may be redirected to the XSL output below.
File allmetadatasfile = new File(srcDir, "metadata.csv");

// Apply XSL
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(1, "APPLYING XSL")
}
if (xsl != null && xslParams != null && !xsl.trim().isEmpty()) {
        File xslOutputDir = new File(binDir, "src");
        // NOTE(review): the statement that used to follow this test ("return; // error
        // during process") was commented out, so the test now guards the srcDir
        // reassignment: srcDir switches to the XSL output only when
        // processImportSources() returns true. Confirm this is the intended meaning
        // of the return value.
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir, xslParams)) {
                srcDir = xslOutputDir;
        }
        println ""
}
145

    
146
// copy xml+dtd files
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
if (srcfiles != null) {
        // An iterator is used so that configuration files can be dropped from
        // srcfiles while the list is being traversed.
        Iterator<File> cursor = srcfiles.iterator();
        while (cursor.hasNext()) {
                File candidate = cursor.next();
                String fileName = candidate.getName();

                // import.xml, metadata.csv and *.properties are not corpus sources.
                boolean isConfigFile = fileName.equals("import.xml") || fileName.equals("metadata.csv") || fileName.endsWith(".properties");
                if (isConfigFile) {
                        cursor.remove();
                        continue;
                }

                // check XML format, and copy file into binDir
                if (!ValidateXml.test(candidate)) {
                        println "Won't process file "+candidate;
                        continue;
                }
                FileCopy.copy(candidate, new File(txmDir, fileName));
        }
}

// NOTE(review): java.io.File.listFiles() returns an empty array (not null) for an
// empty directory, so this guard does not fire when simply no file was copied.
if (txmDir.listFiles() == null) {
        println "No txm file to process"
        return;
}
168

    
169
// filtering
170
/*def xpaths = params.getExcludeXpaths()
171
 if (xpaths != null) {
172
 println "Filtering XML files with xpaths: $xpaths"
173
 for (File infile : txmDir.listFiles()) {
174
 print "."
175
 if (!RemoveTag.xpath(infile, xpaths)) {
176
 println "Failed to filter $infile"
177
 return
178
 }
179
 }
180
 println ""
181
 }*/
182

    
183
//get metadata values from CSV
Metadatas metadatas; // text metadata, stays null when there is no metadata file

println "Trying to read metadata from: "+allmetadatasfile
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (!allmetadatasfile.exists()) {
        println " No metadata file: "+allmetadatasfile
} else {
        println " Metadata file found: "+allmetadatasfile

        // The table is read from a copy stored in the binary directory.
        File metadataCopy = new File(binDir, "metadata.csv")
        def copied = FileCopy.copy(allmetadatasfile, metadataCopy)
        if (!copied) {
                println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
                return;
        }

        // CSV format settings come from the Toolbox parameters.
        def csvEncoding = Toolbox.getParam(Toolbox.METADATA_ENCODING)
        def columnSeparator = Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR)
        def textSeparator = Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR)
        metadatas = new Metadatas(metadataCopy, csvEncoding, columnSeparator, textSeparator, 1)
}
199

    
200
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"

def sourceImporter = new importer();
sourceImporter.doValidation(true) // change this to not validate xml

// Tokenization runs unless the import parameters ask to skip it.
boolean tokenize = !params.getSkipTokenization()
sourceImporter.doTokenize(tokenize) // change this, to not tokenize xml
sourceImporter.setStopIfMalformed(stopIfMalformed);

def imported = sourceImporter.run( srcDir, binDir, txmDir, basename, ignoredElements, lang)
if (!imported) {
        println "import process stopped";
        return;
}
212

    
213
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "INJECTING METADATA")
}
if (metadatas != null) {
        println("-- INJECTING METADATA - "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
        for (File infile : txmDir.listFiles()) {
                print "."
                // The result is written to a temporary sibling file, then swapped in.
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
                def injected = metadatas.injectMetadatasInXml(infile, outfile, "text", null)
                if (!injected) {
                        // Injection failed: discard the temporary file, keep the original.
                        outfile.delete();
                        continue;
                }

                boolean swapped = infile.delete() && outfile.renameTo(infile)
                if (!swapped) println "Warning can't rename file "+outfile+" to "+infile
                if (!infile.exists()) {
                        println "Error: could not replace $infile by $outfile"
                        return false;
                }
        }
        println ""
}

// Nothing left to process: stop silently.
List<File> files = txmDir.listFiles()
if (!files) {
        return;
}
236

    
237
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"
// Annotate.run() is only invoked when annotation was requested (&& short-circuits);
// annotationSuccess is true only if it was requested and the run reported success.
boolean annotationSuccess = annotate && new Annotate().run(binDir, txmDir, model+".par");
244

    
245
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"

def indexCompiler = new compiler();
if (debug) {
        indexCompiler.setDebug();
}
//indexCompiler.setCwbPath("~/TXM/cwb/bin");
indexCompiler.setOptions(textSortAttribute, normalizeMetadata);
indexCompiler.setAnnotationSuccess(annotationSuccess)
indexCompiler.setLang(lang);

def compiled = indexCompiler.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
        println "import process stopped";
        return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
260

    
261
// The HTML directory of the corpus is wiped and recreated, whether or not an edition is built.
File htmlDir = new File(binDir,"HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

        println "-- EDITION - Building edition"
        if (MONITOR != null) MONITOR.worked(25, "EDITION")

        File outdir = new File(binDir,"/HTML/$corpusname/default/");
        outdir.mkdirs();
        List<File> filelist = txmDir.listFiles();
        Collections.sort(filelist);

        println "Paginating texts: "+filelist

        for (File txmFile : filelist) {
                print "."
                // Text name = file name without its last extension.
                String txtname = txmFile.getName();
                int dot = txtname.lastIndexOf(".");
                if (dot > 0) txtname = txtname.substring(0, dot);

                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                // Register the text, build its pages, then register the edition.
                Element text = params.addText(corpusElem, txtname, txmFile);

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

                // One page entry per generated page file; page names are 1-based.
                def pageFiles = ed.getPageFiles();
                def wordIds = ed.getIdx();
                for (int page = 0 ; page < pageFiles.size() ; page++) {
                        String wordid = wordIds.get(page);
                        params.addPage(edition, ""+(page + 1), wordid);
                }
        }
}
297

    
298
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "FINALIZING")
}

// Save the parameter document as import.xml inside the binary directory.
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);

// readyToLoad is not declared in this script, so this assignment
// is stored in the script binding.
readyToLoad = true;