Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xml / xmlLoader.groovy @ 2554

History | View | Annotate | Download (9.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-11-29 16:47:07 +0100 (mar. 29 nov. 2016) $
25
// $LastChangedRevision: 3349 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.xml;
29

    
30
import javax.xml.stream.XMLStreamReader;
31

    
32
import org.txm.scripts.importer.RemoveTag;
33
import org.txm.importer.ApplyXsl2;
34
import org.txm.importer.ValidateXml;
35
import org.txm.scripts.importer.xml.importer;
36
import org.txm.scripts.importer.xml.compiler;
37
import org.txm.scripts.importer.xml.pager_old;
38
import org.txm.objects.*;
39
import org.txm.tokenizer.TokenizerClasses;
40
import org.txm.utils.*
41
import org.txm.utils.io.*;
42
import org.txm.*;
43
import org.txm.core.engines.*;
44
import org.txm.importer.scripts.xmltxm.*;
45
import org.txm.utils.i18n.*;
46
import org.txm.metadatas.*;
47
import javax.xml.stream.*;
48
import org.w3c.dom.Element
49
import org.txm.utils.xml.DomUtils;
50

    
51
String userDir = System.getProperty("user.home");

// The project and the monitor are read from the script binding
// ("projectBinding" and "monitor"). The monitor is used below for
// cancellation checks and progress reporting and may be absent.
def MONITOR;
Project project;

try {
        project = projectBinding;
        MONITOR = monitor;
} catch (MissingPropertyException ignored) {
        // A variable is absent from the binding: a missing project is handled by
        // the null check below, a missing monitor leaves MONITOR null.
} catch (Exception e) {
        // Any other failure (e.g. a value of an unexpected type) used to be lost silently.
        println "Could not read the import bindings: "+e
}
if (project == null) { println "no project set. Aborting"; return; }
59

    
60
// Corpus identity: the project name is used both as corpus name and as base name
String corpusname = project.getName();
String basename = corpusname

// Source location, language and encoding; the annotation model is the language itself
String rootDir = project.getSrcdir();
String lang = project.getLang()
String model = lang
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()

// Optional front XSL stylesheet and its parameters
String xsl = project.getFrontXSL();
def xslParams = project.getXsltParameters();

// Settings of the "default" edition
def defaultEdition = project.getEditionDefinition("default")
int wordsPerPage = defaultEdition.getWordsPerPage()
String page_element = defaultEdition.getPageElement()
boolean build_edition = defaultEdition.getBuildEdition()

boolean doTokenizeStep = project.getDoTokenizerStep()
73

    
74
// Source directory and project (binary) directory
File srcDir = new File(rootDir);
File binDir = project.getProjectDirectory()
binDir.mkdirs();
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return;
}

// Start from an empty txm/<corpus name> directory
File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// Same verification as for binDir: the following steps write into txmDir
if (!txmDir.isDirectory()) {
        println "Could not create txmDir "+txmDir
        return;
}
85

    
86
File propertyFile = new File(rootDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;

// Defaults used when import.properties is absent or does not define the key
String textSortAttribute = null;
boolean normalizeMetadata = false;
String ignoredElements = null;
boolean stopIfMalformed = false;

println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
        // withReader closes the reader even when props.load() throws
        propertyFile.withReader("UTF-8") { reader -> props.load(reader) }

        String sortValue = props.getProperty("sortmetadata")
        if (sortValue != null)
                textSortAttribute = sortValue;

        String normalizeValue = props.getProperty("normalizemetadata")
        if (normalizeValue != null)
                normalizeMetadata = Boolean.parseBoolean(normalizeValue);

        String ignoredValue = props.getProperty("ignoredelements")
        if (ignoredValue != null)
                ignoredElements = ignoredValue;

        String malformedValue = props.getProperty("stopifmalformed")
        if (malformedValue != null)
                stopIfMalformed = Boolean.parseBoolean(malformedValue);

        println "import properties: "
        println " sort metadata: "+textSortAttribute
        println " normalize attributes: "+normalizeMetadata
        println " ignored elements: "+ignoredElements
        println " stop if a XML source is malformed: "+stopIfMalformed
}
115

    
116
// The metadata file is looked up in the source directory before the XSL step
// below may redirect srcDir to the transformed copies.
File allMetadataFile = Metadatas.findMetadataFile(srcDir);

// Apply XSL
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "APPLYING XSL")
if (xsl != null && xslParams != null && xsl.trim().length() > 0) {
        File xslOutputDir = new File(binDir, "src")
        xslOutputDir.deleteDir() // remove old outputed files if any
        // The transformed files become the sources only when the XSL step reports
        // success; otherwise the import goes on with the untransformed sources.
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir, xslParams)) {
                srcDir = xslOutputDir;
        } else {
                println "Warning: the XSL step did not succeed, continuing with the sources of "+srcDir
        }
        println ""
}
128

    
129
// copy xml+dtd files
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
if (srcfiles != null) {
        // check XML format, and copy file into txmDir
        for (Iterator<File> iter = srcfiles.iterator() ; iter.hasNext() ;) {
                File f = iter.next()
                String name = f.getName()
                // import.xml, metadata.* and *.properties files are removed from the source list
                if (name.equals("import.xml") || name.matches("metadata\\.....?") || name.endsWith(".properties")) {
                        iter.remove();
                        continue;
                }
                if (!ValidateXml.test(f)) {
                        println "Won't process file "+f;
                        continue;
                }
                // Report a failed copy instead of ignoring the result
                if (!FileCopy.copy(f, new File(txmDir, name))) {
                        println "Warning: could not copy "+f+" into "+txmDir
                }
        }
}
146

    
147
// listFiles() returns null only when the directory cannot be listed; an existing
// but empty directory yields an empty array, so both cases are tested.
File[] copiedFiles = txmDir.listFiles()
if (copiedFiles == null || copiedFiles.length == 0) {
        println "No txm file to process"
        return;
}
151

    
152
// filtering
153
/*def xpaths = params.getExcludeXpaths()
154
 if (xpaths != null) {
155
 println "Filtering XML files with xpaths: $xpaths"
156
 for (File infile : txmDir.listFiles()) {
157
 print "."
158
 if (!RemoveTag.xpath(infile, xpaths)) {
159
 println "Failed to filter $infile"
160
 return
161
 }
162
 }
163
 println ""
164
 }*/
165

    
166
//get metadata values from CSV
Metadatas metadatas = null; // text metadata, stays null when no metadata file exists

println "Trying to read metadata from: "+allMetadataFile
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (!allMetadataFile.exists()) {
        println " No metadata file: "+allMetadataFile
} else {
        println " Metadata file found: "+allMetadataFile
        // The metadata are read from a copy stored in the project directory
        File metadataCopy = new File(binDir, allMetadataFile.getName())
        boolean copied = FileCopy.copy(allMetadataFile, metadataCopy)
        if (!copied) {
                println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile();
                return;
        }
        metadatas = new Metadatas(
                        metadataCopy,
                        Toolbox.getMetadataEncoding(),
                        Toolbox.getMetadataColumnSeparator(),
                        Toolbox.getMetadataTextSeparator(),
                        1)
}
182

    
183
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"

// Configure the importer, then run it on the source, project and txm directories
def xmlImporter = new importer();
xmlImporter.doValidation(true) // change this to not validate xml
xmlImporter.doTokenize(doTokenizeStep) // change this, to not tokenize xml
xmlImporter.setStopIfMalformed(stopIfMalformed);

boolean imported = xmlImporter.run( srcDir, binDir, txmDir, basename, ignoredElements, lang)
if (!imported) {
        println "import process stopped";
        return;
}
195

    
196
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "INJECTING METADATA")
}
if (metadatas != null) {
        println("-- INJECTING METADATA - "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))

        def txmFiles = txmDir.listFiles()
        ConsoleProgressBar progress = new ConsoleProgressBar(txmFiles.size())
        for (File infile : txmFiles) {
                progress.tick()
                // The result is written to a temporary file next to the text file
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
                boolean injected = metadatas.injectMetadatasInXml(infile, outfile, "text", null)
                if (!injected) {
                        // keep the original text file, drop the temporary one
                        outfile.delete();
                        continue;
                }
                // Replace the text file with the temporary file
                boolean replaced = infile.delete() && outfile.renameTo(infile)
                if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
                if (!infile.exists()) {
                        println "Error: could not replace $infile by $outfile"
                        return false;
                }
        }
        progress.done()
}

// Stop here when the txm directory holds nothing to process
List<File> files = txmDir.listFiles()
if (files == null || files.size() == 0) {
        return;
}
224

    
225
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")

// annotationSuccess is handed to the compiler step; it stays false when the
// annotation is disabled, unavailable or not reported as successful.
boolean annotationSuccess = false;
if (annotate) {
        println "-- ANNOTATE - Running NLP tools"
        // Null-safe lookup: a missing engine must not abort the import with a NullPointerException
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION)?.getEngine("TreeTagger")
        if (engine == null) {
                println "Warning: no \"TreeTagger\" annotation engine available, the texts will not be annotated"
        } else if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
                annotationSuccess = true;
        }
}
236

    
237
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
// "debug" is not declared in this script: it only exists when the caller put it
// in the binding, so test for it instead of failing with MissingPropertyException.
if (binding.hasVariable("debug") && debug) c.setDebug();
//c.setCwbPath("~/TXM/cwb/bin");
c.setOptions(textSortAttribute, normalizeMetadata);
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang);
if (!c.run(project, binDir, txmDir, corpusname, null, srcfiles, metadatas)) {
        println "import process stopped";
        return;
}
250

    
251
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// Reset the HTML directory of the corpus, whether or not an edition is built
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
if (build_edition) {

        println "-- EDITION - Building edition"
        if (MONITOR != null) MONITOR.worked(25, "EDITION")

        File outdir = new File(binDir,"/HTML/$corpusname/default/");
        outdir.mkdirs();
        List<File> filelist = txmDir.listFiles();
        Collections.sort(filelist);

        println "Paginating "+filelist.size()+" texts"
        ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
        for (File txmFile : filelist) {
                cpb.tick()
                // Text name = file name without its last extension
                String txtname = txmFile.getName();
                int dot = txtname.lastIndexOf(".");
                if (dot > 0) txtname = txtname.substring(0, dot);

                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                Text t = new Text(project);
                t.setName(txtname);
                t.setSourceFile(txmFile)
                t.setTXMFile(txmFile)

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
                Edition edition = new Edition(t);
                edition.setName("default");
                edition.setIndex(outdir.getAbsolutePath());

                // Register one page per page file, numbered from 1. Each page gets the id
                // found at the same position in ed.getIdx(), or "w_0" when there is none.
                def wordIds = ed.getIdx()
                int pageCount = ed.getPageFiles().size()
                for (int p = 0 ; p < pageCount ; p++) {
                        String wordid = (p < wordIds.size()) ? wordIds.get(p) : "w_0";
                        edition.addPage(""+(p + 1), wordid);
                }
        }
        cpb.done()
}
296

    
297
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done(); }
        MONITOR.worked(20, "FINALIZING")
}

// The result of saving the project is exposed to the caller through the
// undeclared (binding) variable readyToLoad.
readyToLoad = project.save();