Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / factiva / factivaLoader.groovy @ 479

History | View | Annotate | Download (10.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-02-13 11:42:24 +0100 (lun., 13 févr. 2012) $
25
// $LastChangedRevision: 2126 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.factiva;
29

    
30
import javax.xml.stream.XMLStreamReader;
31
import org.txm.importer.ApplyXsl2;
32
import org.txm.importer.Factiva2XmlTxm;
33
import org.txm.importer.ValidateXml;
34
import org.txm.importer.xml.importer;
35
import org.txm.importer.xml.compiler;
36
import org.txm.importer.xml.pager;
37
import org.txm.objects.*;
38
import org.txm.utils.*;
39
import org.txm.*;
40
import org.txm.scripts.teitxm.*;
41
import org.txm.utils.i18n.*;
42
import org.txm.metadatas.*;
43
import javax.xml.stream.*;
44
import org.w3c.dom.Element
45
import org.txm.utils.xml.DomUtils;
46

    
47
// Home directory of the current user; the development fallbacks below are resolved against it.
String userDir = System.getProperty("user.home")

def MONITOR // progress monitor, filled from the script binding when one is supplied
boolean debug = org.txm.utils.logger.Log.isPrintingErrors()
BaseParameters params
try {
        // Both names are expected in the script binding; a failed lookup on either
        // one lands in the catch block below.
        params = paramsBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // No usable binding: switch to hard-coded development settings.
        println "DEV MODE"
        debug = true
        File devImportFile = new File(userDir, "xml/TESTS/factiva/import.xml")
        params = new BaseParameters(devImportFile)
        params.load()

        boolean toolboxReady = org.txm.Toolbox.isInitialized()
        if (!toolboxReady) {
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
                Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"))
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"))
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"))
        }
}

// Nothing can be imported without parameters.
if (params == null) {
        println "no parameters. Aborting"
        return
}
69

    
70
// Corpus identity and source location, read from the import parameters.
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// Per-corpus attributes stored on the corpus element.
String lang = corpusElem.getAttribute("lang")
String model = lang // the model name is simply the language code
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// Optional XSL stylesheet and its parameters.
Element xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// Settings of the "default" edition.
int wordsPerPage = params.getWordsPerPage("default")
String page_element = params.getPageElement("default")
boolean build_edition = params.getDoEdition("default")
83

    
84
// Source directory given by the parameters, and a freshly recreated binary
// directory under the user's TXM home.
File srcDir = new File(rootDir)
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/" + basename)
binDir.deleteDir()
binDir.mkdirs()
boolean binDirCreated = binDir.exists()
if (!binDirCreated) {
        println "Could not create binDir "+binDir
        return
}

// Directory receiving the per-corpus files, also recreated from scratch.
File txmDir = new File(binDir, "txm/" + corpusname)
txmDir.deleteDir()
txmDir.mkdirs()
96

    
97
// Optional per-corpus settings are read from "import.properties" in the source root.
File propertyFile = new File(rootDir, "import.properties")//default
Properties props = new Properties();
String[] metadatasToKeep;

// Defaults used when the properties file is missing or does not define a key.
String textSortAttribute = null;
String paginationElement = null;
boolean normalizeMetadata = false;
String ignoredElements = null;
boolean stopIfMalformed = false;

println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
        // Close the underlying stream even when loading fails, so that a malformed
        // properties file does not leak a file handle.
        FileInputStream propertyStream = new FileInputStream(propertyFile);
        try {
                props.load(new InputStreamReader(propertyStream, "UTF-8"));
        } finally {
                propertyStream.close();
        }

        // Each key overrides its default only when it is present in the file.
        String sortValue = props.getProperty("sortmetadata");
        if (sortValue != null) {
                textSortAttribute = sortValue;
        }
        String pageValue = props.getProperty("editionpage");
        if (pageValue != null) {
                paginationElement = pageValue;
        }
        String normalizeValue = props.getProperty("normalizemetadata");
        if (normalizeValue != null) {
                normalizeMetadata = Boolean.parseBoolean(normalizeValue);
        }
        String ignoredValue = props.getProperty("ignoredelements");
        if (ignoredValue != null) {
                ignoredElements = ignoredValue;
        }
        String malformedValue = props.getProperty("stopIfMalformed");
        if (malformedValue != null) {
                stopIfMalformed = Boolean.parseBoolean(malformedValue);
        }

        println "import properties: "
        println " sort metadata: "+textSortAttribute
        println " edition page tag: "+paginationElement
        println " normalize attributes: "+normalizeMetadata
        println " ignored elements: "+ignoredElements
        println " stop if a XML source is malformed: "+stopIfMalformed
}
130

    
131
// The metadata CSV is looked up in the original source directory, before
// srcDir is redirected to the conversion output below.
File allmetadatasfile = new File(srcDir, "metadata.csv")

// Convert the Factiva sources into the "src" directory of the binary directory.
println "Factiva -> xml/w"
File factivaOutDir = new File(binDir, "src")
def factivaConverter = new Factiva2XmlTxm()
factivaConverter.run(srcDir, factivaOutDir.getAbsolutePath())

File[] convertedFiles = factivaOutDir.listFiles()
if (convertedFiles == null) {
        println "Error: no file to process"
        return
}

// From here on, the converted files are the import sources.
srcDir = factivaOutDir
142

    
143
// Optional XSL step: report progress, then run the stylesheet when one is configured.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(1, "APPLYING XSL")
}

boolean xslConfigured = xsl != null && xslParams != null && xsl.trim().length() > 0
if (xslConfigured) {
        File xslOutDir = new File(binDir, "src")
        // srcDir is only redirected when processImportSources returns a true value.
        // NOTE(review): presumably true means success - confirm against ApplyXsl2.
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutDir, xslParams)) {
                srcDir = xslOutDir
        }
        println ""
}
152

    
153
// Walk the source files: import settings are dropped from the list, every other
// file is checked with ValidateXml and copied into txmDir when the check passes.
List<File> srcfiles = srcDir.listFiles()
Iterator<File> candidates = srcfiles.iterator()
while (candidates.hasNext()) {
        File candidate = candidates.next()
        String candidateName = candidate.getName()

        boolean isImportSetting = candidateName.equals("import.xml") ||
                        candidateName.equals("metadata.csv") ||
                        candidateName.endsWith(".properties")
        if (isImportSetting) {
                // not a text source: remove it from the list of source files
                candidates.remove()
        } else if (ValidateXml.test(candidate)) {
                FileCopy.copy(candidate, new File(txmDir, candidateName))
        } else {
                println "Won't process file "+candidate
        }
}

File[] copiedFiles = txmDir.listFiles()
if (copiedFiles == null) {
        println "No txm file to process"
        return
}
173

    
174
// Load the text metadata from the CSV file, working on a copy placed in binDir.
// metadatas stays null when there is no CSV file.
Metadatas metadatas
println "Trying to read metadata from: "+allmetadatasfile
if (!allmetadatasfile.exists()) {
        println "no metadata file: "+allmetadatasfile
} else {
        File metadataCopy = new File(binDir, "metadata.csv")
        boolean copied = FileCopy.copy(allmetadatasfile, metadataCopy)
        if (!copied) {
                println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile()
                return
        }
        def csvEncoding = Toolbox.getParam(Toolbox.METADATA_ENCODING)
        def csvColumnSeparator = Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR)
        def csvTextSeparator = Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR)
        metadatas = new Metadatas(metadataCopy, csvEncoding, csvColumnSeparator, csvTextSeparator, 1)
}
186

    
187
// IMPORTER step: report progress, honour cancellation, then run the XML importer.
if (MONITOR != null) {
        MONITOR.worked(1, "IMPORTER")
        if (MONITOR.isCanceled()) { return MONITOR.done() }
}
println "-- IMPORTER - Reading source files"

def imp = new importer()
imp.doValidation(true) // change this to not validate xml
imp.doTokenize(true) // change this, to not tokenize xml
imp.setStopIfMalformed(stopIfMalformed)

// The fifth argument is passed as null here.
boolean importDone = imp.run(srcDir, binDir, txmDir, basename, null, lang)
if (!importDone) {
        println "import process stopped"
        return
}
198

    
199
// INJECTING METADATA step: each file of txmDir is rewritten into a temporary
// file that then replaces the original.
if (MONITOR != null) {
        MONITOR.worked(20, "INJECTING METADATA")
        if (MONITOR.isCanceled()) { return MONITOR.done() }
}
println "-- INJECTING METADATA - from csv file: "+allmetadatasfile
if (metadatas != null) {
        println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
        for (File infile : txmDir.listFiles()) {
                print "."
                // the temporary file is created next to the file it will replace
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())
                boolean injected = metadatas.injectMetadatasInXml(infile, outfile, "text", null)
                if (injected) {
                        boolean swapped = infile.delete() && outfile.renameTo(infile)
                        if (!swapped) {
                                println "Warning can't rename file "+outfile+" to "+infile
                        }
                        if (!infile.exists()) {
                                println "Error: could not replace $infile by $outfile"
                                return false
                        }
                } else {
                        // injection failed: drop the temporary file and keep the original
                        outfile.delete()
                }
        }
        println ""
}
219

    
220
// Stop silently when txmDir holds no file to work on.
List<File> files = txmDir.listFiles()
if (!files) {
        return
}

// ANNOTATE step: honour cancellation, report progress, then run the annotation
// only when it was requested for this corpus.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(10, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools"
// true only if annotation was requested and its run reported a true value
boolean annotationSuccess = annotate && new Annotate().run(binDir, txmDir, model+".par")
232

    
233
// COMPILING step: honour cancellation, report progress, then configure and run the compiler.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"

def c = new compiler()
if (debug) {
        c.setDebug()
}
c.setOptions(textSortAttribute, normalizeMetadata)
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang)

boolean compiled = c.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)
if (!compiled) {
        println "import process stopped"
        return
}
246

    
247
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML directory of the corpus is always recreated, even when no edition is built.
File htmlDir = new File(binDir, "HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();

if (build_edition) {
        println "-- EDITION - Building edition"
        if (MONITOR != null) MONITOR.worked(20, "EDITION")

        File outdir = new File(binDir, "/HTML/$corpusname/default/");
        outdir.mkdirs();
        List<File> filelist = txmDir.listFiles();
        Collections.sort(filelist);

        println "Paginating texts: "

        for (File txmFile : filelist) {
                print "."
                // the text name is the file name without its last extension
                String txtname = txmFile.getName();
                int extensionStart = txtname.lastIndexOf(".");
                if (extensionStart > 0) txtname = txtname.substring(0, extensionStart);

                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                Element text = params.addText(corpusElem, txtname, txmFile);

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

                // Register one page per page file; page names are numbered from 1 and
                // paired with the entry of ed.getIdx() at the same position.
                for (int pageIndex = 0 ; pageIndex < ed.getPageFiles().size() ; pageIndex++) {
                        String wordid = ed.getIdx().get(pageIndex);
                        params.addPage(edition, ""+(pageIndex + 1), wordid);
                }
        }
}
285

    
286
// FINALIZING step: save the parameters document as import.xml in binDir, then
// set readyToLoad to true.
MONITOR?.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml")
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile)
readyToLoad = true