Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xml / xmlLoader.groovy @ 1488

History | View | Annotate | Download (9.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-11-29 16:47:07 +0100 (mar. 29 nov. 2016) $
25
// $LastChangedRevision: 3349 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.xml;
29

    
30
import javax.xml.stream.XMLStreamReader;
31

    
32
import org.txm.scripts.importer.RemoveTag;
33
import org.txm.importer.ApplyXsl2;
34
import org.txm.importer.ValidateXml;
35
import org.txm.scripts.importer.xml.importer;
36
import org.txm.scripts.importer.xml.compiler;
37
import org.txm.scripts.importer.xml.pager_old;
38
import org.txm.objects.*;
39
import org.txm.tokenizer.TokenizerClasses;
40
import org.txm.utils.*
41
import org.txm.utils.io.*;
42
import org.txm.*;
43
import org.txm.core.engines.*;
44
import org.txm.importer.scripts.xmltxm.*;
45
import org.txm.utils.i18n.*;
46
import org.txm.metadatas.*;
47
import javax.xml.stream.*;
48
import org.w3c.dom.Element
49
import org.txm.utils.xml.DomUtils;
50

    
51
// Home directory of the current user (not referenced again in the visible part of this script).
String userDir = System.getProperty("user.home");

// Progress monitor and project, both read from the names "monitor" and
// "projectBinding" that this script does not declare itself.
def MONITOR;
Project project;

try {
    project = projectBinding;
    MONITOR = monitor;
} catch (Exception ignored) {
    // Deliberately best-effort: if either name cannot be resolved the
    // corresponding variable stays null. A missing project is reported just below;
    // a missing monitor is tolerated (every later use is guarded by a null check).
}
if (project == null) {
    println "no project set. Aborting";
    return;
}
59

    
60
// Corpus identity and source location, read from the project.
String corpusname = project.getName();
String basename = corpusname
String rootDir = project.getSrcdir();

// Language settings; the annotation model is set to the same value as the language.
String lang = project.getLang()
String model = lang
String encoding = project.getEncoding()

// Switches controlling optional steps of the import.
boolean annotate = project.getAnnotate()
boolean doTokenizeStep = project.getDoTokenizerStep()

// Front XSL stylesheet path and its parameters (used by the "Apply XSL" step).
String xsl = project.getFrontXSL();
def xslParams = project.getXsltParameters();

// Settings of the edition named "default".
def defaultEdition = project.getEditionDefinition("default")
int wordsPerPage = defaultEdition.getWordsPerPage()
String page_element = defaultEdition.getPageElement()
boolean build_edition = defaultEdition.getBuildEdition()
73

    
74
// Source directory (input) and project directory (output root of the import).
File srcDir = new File(rootDir);
File binDir = project.getProjectDirectory()
binDir.mkdirs();
// mkdirs() result is not trusted; the directory is checked for existence instead.
if (!binDir.exists()) {
    println("Could not create binDir " + binDir)
    return;
}

// Working directory txm/<corpus name>: any previous content is discarded
// and the directory is created again empty.
File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
85

    
86
// Optional import settings, read from "import.properties" in the source directory.
File propertyFile = new File(rootDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep; // declared but neither assigned nor read in the visible script

// Values used when the file is missing, unreadable, or does not define the key.
String textSortAttribute = null;
boolean normalizeMetadata = false;
String ignoredElements = null;
boolean stopIfMalformed = false;

println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
    // withReader closes the underlying stream even when Properties.load throws,
    // so a malformed file cannot leak the file handle.
    propertyFile.withReader("UTF-8") { reader -> props.load(reader) }

    // getProperty returns null when the key is absent, so each setting is
    // overridden only when the file actually defines it.
    String sortValue = props.getProperty("sortmetadata");
    if (sortValue != null) {
        textSortAttribute = sortValue;
    }
    String normalizeValue = props.getProperty("normalizemetadata");
    if (normalizeValue != null) {
        normalizeMetadata = Boolean.parseBoolean(normalizeValue);
    }
    String ignoredValue = props.getProperty("ignoredelements");
    if (ignoredValue != null) {
        ignoredElements = ignoredValue;
    }
    String stopValue = props.getProperty("stopifmalformed");
    if (stopValue != null) {
        stopIfMalformed = Boolean.parseBoolean(stopValue);
    }

    println "import properties: "
    println " sort metadata: "+textSortAttribute
    println " normalize attributes: "+normalizeMetadata
    println " ignored elements: "+ignoredElements
    println " stop if a XML source is malformed: "+stopIfMalformed
}
115

    
116
// Looked up in the source directory as it is BEFORE the XSL step below,
// which may point srcDir at a different directory.
File allMetadataFile = Metadatas.findMetadataFile(srcDir);

// Apply XSL
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(1, "APPLYING XSL")
}
boolean frontXslConfigured = xsl != null && xslParams != null && xsl.trim().length() > 0
if (frontXslConfigured) {
    File xslOutputDir = new File(binDir, "src")
    // The source directory is switched to the XSL output only when
    // processImportSources returns a true value; otherwise the import
    // continues with the untouched sources and no message is printed.
    // NOTE(review): a "return" on error was commented out here at some point -
    // confirm that silently continuing is the intended behaviour.
    if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir, xslParams)) {
        srcDir = xslOutputDir;
    }
    println ""
}
127

    
128
// copy xml+dtd files
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
if (srcfiles != null) {
    // Check XML format and copy each accepted file into txmDir.
    Iterator<File> candidates = srcfiles.iterator();
    while (candidates.hasNext()) {
        File candidate = candidates.next();
        String candidateName = candidate.getName();

        // Import settings, metadata and properties files are not corpus texts:
        // they are dropped from srcfiles (which is handed to the compiler later).
        boolean isAuxiliaryFile = candidateName.equals("import.xml") ||
                candidateName.matches("metadata\\.....?") ||
                candidateName.endsWith(".properties");
        if (isAuxiliaryFile) {
            candidates.remove();
            continue;
        }

        // Files rejected by the validation are reported and not copied,
        // but they remain listed in srcfiles.
        if (!ValidateXml.test(candidate)) {
            println "Won't process file "+candidate;
        } else {
            FileCopy.copy(candidate, new File(txmDir, candidateName));
        }
    }
}

// NOTE(review): java.io.File.listFiles() yields null only when txmDir is not a
// listable directory; an empty directory yields an empty array and passes this check.
if (txmDir.listFiles() == null) {
    println "No txm file to process"
    return;
}
150

    
151
// filtering
152
/*def xpaths = params.getExcludeXpaths()
153
 if (xpaths != null) {
154
 println "Filtering XML files with xpaths: $xpaths"
155
 for (File infile : txmDir.listFiles()) {
156
 print "."
157
 if (!RemoveTag.xpath(infile, xpaths)) {
158
 println "Failed to filter $infile"
159
 return
160
 }
161
 }
162
 println ""
163
 }*/
164

    
165
//get metadata values from CSV
Metadatas metadatas; // text metadata; stays null when no metadata file exists

println "Trying to read metadata from: "+allMetadataFile
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (!allMetadataFile.exists()) {
    println " No metadata file: "+allMetadataFile
} else {
    println " Metadata file found: "+allMetadataFile
    // The metadata is read from a copy placed in the project directory,
    // not from the file in the source directory.
    File metadataCopy = new File(binDir, allMetadataFile.getName())
    boolean copied = FileCopy.copy(allMetadataFile, metadataCopy)
    if (!copied) {
        println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile();
        return;
    }
    metadatas = new Metadatas(
            metadataCopy,
            Toolbox.getMetadataEncoding(),
            Toolbox.getMetadataColumnSeparator(),
            Toolbox.getMetadataTextSeparator(),
            1)
}
181

    
182
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(5, "IMPORTER")
}
println "-- IMPORTER - Reading source files"
def sourceImporter = new importer();
sourceImporter.doValidation(true) // change this to not validate xml
sourceImporter.doTokenize(doTokenizeStep) // change this, to not tokenize xml
sourceImporter.setStopIfMalformed(stopIfMalformed);

// The whole import is abandoned when the importer reports a failure.
boolean importerSucceeded = sourceImporter.run(srcDir, binDir, txmDir, basename, ignoredElements, lang)
if (!importerSucceeded) {
    println "import process stopped";
    return;
}
194

    
195
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "INJECTING METADATA")
}
// Only done when a metadata file was loaded earlier.
if (metadatas != null) {
    println("-- INJECTING METADATA - "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
    for (File infile : txmDir.listFiles()) {
        print "."
        // The result is written to a temporary sibling file first, then swapped with the original.
        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
        boolean injected = metadatas.injectMetadatasInXml(infile, outfile, "text", null)
        if (!injected) {
            // Injection failed: keep the original file and drop the temporary one.
            outfile.delete();
            continue;
        }

        // The rename is attempted only if the original file could be deleted.
        boolean replaced = infile.delete() && outfile.renameTo(infile)
        if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
        if (!infile.exists()) {
            println "Error: could not replace $infile by $outfile"
            return false;
        }
    }
    println ""
}

// Nothing left to process: stop without a message.
List<File> files = txmDir.listFiles()
if (files == null || files.isEmpty()) {
    return;
}
218

    
219
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "ANNOTATE")
}

// Stays false when annotation is disabled or when the engine reports a failure;
// the value is handed to the compiler afterwards.
boolean annotationSuccess = false;
if (annotate) {
    println "-- ANNOTATE - Running NLP tools"
    def annotationEngine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
    def engineParameters = ["lang":model]
    annotationSuccess = annotationEngine.processDirectory(txmDir, binDir, engineParameters) ? true : false;
}
229
}
230

    
231
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def corpusCompiler = new compiler();
// "debug" is not declared in this script; it has to be resolvable from outside (e.g. the binding).
if (debug) {
    corpusCompiler.setDebug();
}
//corpusCompiler.setCwbPath("~/TXM/cwb/bin");
corpusCompiler.setOptions(textSortAttribute, normalizeMetadata);
corpusCompiler.setAnnotationSuccess(annotationSuccess)
corpusCompiler.setLang(lang);

// The import is abandoned when the compiler reports a failure.
boolean compiled = corpusCompiler.run(project, binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
    println "import process stopped";
    return;
}
244

    
245
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML/<corpus name> directory is always emptied and re-created,
// whether or not an edition is built afterwards.
File htmlDir = new File(binDir,"HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(25, "EDITION")

    File outdir = new File(binDir,"/HTML/$corpusname/default/");
    outdir.mkdirs();
    // Texts are paginated in the natural (sorted) order of their files.
    List<File> filelist = txmDir.listFiles();
    Collections.sort(filelist);

    println "Paginating texts: "+filelist

    for (File txmFile : filelist) {
        print "."
        // Text name = file name without its last extension (a leading dot is kept).
        String txtname = txmFile.getName();
        int extensionStart = txtname.lastIndexOf(".");
        if (extensionStart > 0) txtname = txtname.substring(0, extensionStart);

        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        // The same file is registered as both the source file and the TXM file of the text.
        Text t = new Text(project);
        t.setName(txtname);
        t.setSourceFile(txmFile)
        t.setTXMFile(txmFile)

        def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
        Edition edition = new Edition(t);
        edition.setName("default");
        edition.setIndex(outdir.getAbsolutePath());

        // One edition page per file produced by the pager, numbered from 1.
        // presumably getIdx() holds the id of the first word of each page - TODO confirm;
        // "w_0" is used when the pager provides no id for a page.
        def pageFiles = ed.getPageFiles();
        def pageWordIds = ed.getIdx();
        for (int pageIndex = 0 ; pageIndex < pageFiles.size() ; pageIndex++) {
            String wordid = "w_0";
            if (pageIndex < pageWordIds.size()) wordid = pageWordIds.get(pageIndex);
            edition.addPage(""+(pageIndex + 1), wordid);
        }
    }
}
289

    
290
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FINALIZING")
}

// readyToLoad is not declared in this script, so the result of saving the
// project is stored as a script binding variable, readable by whoever ran the script.
readyToLoad = project.save();