Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / factiva / factivaLoader.groovy @ 966

History | View | Annotate | Download (10.1 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 321 mdecorde
// $LastChangedDate: 2012-02-13 11:42:24 +0100 (lun., 13 févr. 2012) $
25 321 mdecorde
// $LastChangedRevision: 2126 $
26 321 mdecorde
// $LastChangedBy: mdecorde $
27 321 mdecorde
//
28 321 mdecorde
package org.txm.importer.factiva;
29 321 mdecorde
30 321 mdecorde
import javax.xml.stream.XMLStreamReader;
31 321 mdecorde
import org.txm.importer.ApplyXsl2;
32 321 mdecorde
import org.txm.importer.Factiva2XmlTxm;
33 321 mdecorde
import org.txm.importer.ValidateXml;
34 321 mdecorde
import org.txm.importer.xml.importer;
35 321 mdecorde
import org.txm.importer.xml.compiler;
36 321 mdecorde
import org.txm.importer.xml.pager;
37 321 mdecorde
import org.txm.objects.*;
38 927 mdecorde
import org.txm.utils.*
39 928 mdecorde
import org.txm.utils.io.*;
40 321 mdecorde
import org.txm.*;
41 927 mdecorde
import org.txm.core.engines.*;
42 927 mdecorde
import org.txm.importer.xmltxm.*;
43 321 mdecorde
import org.txm.utils.i18n.*;
44 321 mdecorde
import org.txm.metadatas.*;
45 321 mdecorde
import javax.xml.stream.*;
46 321 mdecorde
import org.w3c.dom.Element
47 479 mdecorde
import org.txm.utils.xml.DomUtils;
48 321 mdecorde
49 321 mdecorde
// Home directory of the current user; used to locate the development test parameters below.
String userDir = System.getProperty("user.home");
50 321 mdecorde
51 321 mdecorde
// Progress monitor; stays null when the script binding does not provide one.
def MONITOR;
52 321 mdecorde
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
53 321 mdecorde
BaseParameters params;
54 321 mdecorde
// "paramsBinding" and "monitor" are not declared in this script, so they are resolved
// through the script binding. If reading them raises an exception, the script switches
// to a standalone development mode and loads a test import.xml from the user's home.
try {params = paramsBinding;MONITOR=monitor} catch (Exception)
55 321 mdecorde
{        println "DEV MODE";//exception means we debug
56 321 mdecorde
        debug = true;
57 321 mdecorde
        params = new BaseParameters(new File(userDir, "xml/TESTS/factiva/import.xml"))
58 321 mdecorde
        params.load()
59 321 mdecorde
        // Only set the Toolbox parameters when the Toolbox has not been initialized yet;
        // the values below are hard-coded development defaults.
        if (!org.txm.Toolbox.isInitialized()) {
60 321 mdecorde
61 321 mdecorde
                Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
62 321 mdecorde
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
63 321 mdecorde
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
64 321 mdecorde
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
65 321 mdecorde
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
66 321 mdecorde
        }
67 321 mdecorde
}
68 321 mdecorde
// Nothing can be imported without parameters: stop the script here.
if (params == null) { println "no parameters. Aborting"; return; }
69 321 mdecorde
70 321 mdecorde
String corpusname = params.getCorpusName();
71 321 mdecorde
Element corpusElem = params.corpora.get(corpusname);
72 321 mdecorde
String basename = params.name;
73 321 mdecorde
String rootDir = params.rootDir;
74 321 mdecorde
String lang = corpusElem.getAttribute("lang");
75 321 mdecorde
String model = lang
76 321 mdecorde
String encoding = corpusElem.getAttribute("encoding");
77 321 mdecorde
boolean annotate = "true" == corpusElem.getAttribute("annotate");
78 321 mdecorde
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
79 321 mdecorde
def xslParams = params.getXsltParams(corpusElem);
80 321 mdecorde
int wordsPerPage = params.getWordsPerPage("default")
81 321 mdecorde
String page_element = params.getPageElement("default")
82 321 mdecorde
boolean build_edition = params.getDoEdition("default")
83 321 mdecorde
84 321 mdecorde
File srcDir = new File(rootDir);
85 878 sjacqu01
File binDir = new File(Toolbox.getTxmHomePath(),"corpora/"+basename);
86 321 mdecorde
binDir.deleteDir();
87 321 mdecorde
binDir.mkdirs();
88 321 mdecorde
if (!binDir.exists()) {
89 321 mdecorde
        println "Could not create binDir "+binDir
90 321 mdecorde
        return;
91 321 mdecorde
}
92 321 mdecorde
93 321 mdecorde
File txmDir = new File(binDir,"txm/$corpusname");
94 321 mdecorde
txmDir.deleteDir();
95 321 mdecorde
txmDir.mkdirs();
96 321 mdecorde
97 321 mdecorde
File propertyFile = new File(rootDir, "import.properties")//default
98 321 mdecorde
Properties props = new Properties();
99 321 mdecorde
String[] metadatasToKeep;
100 321 mdecorde
101 321 mdecorde
String textSortAttribute = null;
102 321 mdecorde
String paginationElement = null;
103 321 mdecorde
boolean normalizeMetadata = false;
104 321 mdecorde
String ignoredElements = null;
105 321 mdecorde
boolean stopIfMalformed = false;
106 321 mdecorde
107 321 mdecorde
// Read the optional "import.properties" file (UTF-8) and override the defaults declared
// above for each key that is present. A missing or unreadable file is not an error: the
// defaults are simply kept.
println "Trying to read import properties file: "+propertyFile
108 321 mdecorde
if (propertyFile.exists() && propertyFile.canRead()) {
109 321 mdecorde
        InputStreamReader input = new InputStreamReader(new FileInputStream(propertyFile) , "UTF-8");
        // Close the reader even when props.load() throws, so the file handle is not leaked.
        try {
110 321 mdecorde
                props.load(input);
        } finally {
111 321 mdecorde
                input.close();
        }
112 321 mdecorde
        if(props.getProperty("sortmetadata") != null)
113 321 mdecorde
                textSortAttribute = props.get("sortmetadata").toString();
114 321 mdecorde
        if(props.getProperty("editionpage") != null)
115 321 mdecorde
                paginationElement = props.get("editionpage").toString();
116 321 mdecorde
        if(props.getProperty("normalizemetadata") != null)
117 321 mdecorde
                normalizeMetadata = Boolean.parseBoolean(props.get("normalizemetadata").toString());
118 321 mdecorde
        if(props.getProperty("ignoredelements") != null)
119 321 mdecorde
                ignoredElements = props.get("ignoredelements").toString();
120 321 mdecorde
        if(props.getProperty("stopIfMalformed") != null)
121 321 mdecorde
                stopIfMalformed = Boolean.parseBoolean(props.get("stopIfMalformed").toString());
122 321 mdecorde
123 321 mdecorde
        // Echo the effective settings so they appear in the import log.
        println "import properties: "
124 321 mdecorde
        println " sort metadata: "+textSortAttribute
125 321 mdecorde
        println " edition page tag: "+paginationElement
126 321 mdecorde
        println " normalize attributes: "+normalizeMetadata
127 321 mdecorde
        println " ignored elements: "+ignoredElements
128 321 mdecorde
        println " stop if a XML source is malformed: "+stopIfMalformed
129 321 mdecorde
}
130 321 mdecorde
131 965 mdecorde
File allmetadatasfile = Metadatas.findMetadataFile(srcDir);
132 321 mdecorde
133 321 mdecorde
// Apply Factiva2XMLTXM: convert the Factiva sources found in srcDir and write the result
// into "<binDir>/src", which then becomes the source directory for the following steps.
134 321 mdecorde
println "Factiva -> xml/w"
135 321 mdecorde
File factivaOutDir = new File(binDir, "src")
136 321 mdecorde
new Factiva2XmlTxm().run(srcDir, factivaOutDir.getAbsolutePath());
137 321 mdecorde
// Groovy truth: a null result (directory missing or unreadable) and an empty array
// (directory exists but the conversion produced nothing) both mean there is no file to process.
if (!factivaOutDir.listFiles()) {
138 321 mdecorde
        println "Error: no file to process"
139 321 mdecorde
        return;
140 321 mdecorde
}
141 321 mdecorde
srcDir = factivaOutDir;
142 321 mdecorde
143 321 mdecorde
// Apply XSL: optional transformation of the converted sources, run only when a non-blank
// stylesheet path and XSL parameters are configured. Cancellation is checked first.
144 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
145 321 mdecorde
if (MONITOR != null) MONITOR.worked(1, "APPLYING XSL")
146 321 mdecorde
if (xsl != null && xslParams != null && xsl.trim().length() > 0) {
147 321 mdecorde
        // The brace-less "if" used to be followed by a commented-out line, so it silently
        // guarded the srcDir assignment below. The braces make that existing behavior explicit.
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"), xslParams)) {
148 321 mdecorde
                // return; // error during process
149 321 mdecorde
                srcDir = new File(binDir, "src");
        }
150 321 mdecorde
        println ""
151 321 mdecorde
}
152 321 mdecorde
153 321 mdecorde
// copy txm files: walk the source directory, drop the files that are not corpus texts
// from the list, and copy every well-formed XML file into txmDir.
154 321 mdecorde
List<File> srcfiles = srcDir.listFiles();
155 321 mdecorde
for (int i = 0 ; i < srcfiles.size() ; i++) {// check XML format, and copy file into txmDir
156 321 mdecorde
        File f = srcfiles.get(i)
157 966 mdecorde
        // Skip import parameters, metadata files ("metadata." followed by a 3 or 4 character
        // extension) and property files. The backslash is doubled because "\." is not a valid
        // escape sequence in a Groovy double-quoted string; the regex itself needs a literal dot.
        if (f.getName().equals("import.xml") || f.getName().matches("metadata\\.....?") || f.getName().endsWith(".properties")) {
158 321 mdecorde
                // Removing shifts the following entries down by one, so step the index back.
                srcfiles.remove(i);
159 321 mdecorde
                i--;
160 321 mdecorde
                continue;
161 321 mdecorde
        }
162 321 mdecorde
        if (ValidateXml.test(f)) {
163 321 mdecorde
                FileCopy.copy(f, new File(txmDir, f.getName()));
164 321 mdecorde
        } else {
165 321 mdecorde
                println "Won't process file "+f;
166 321 mdecorde
        }
167 321 mdecorde
}
168 321 mdecorde
169 321 mdecorde
if (txmDir.listFiles() == null) {
170 321 mdecorde
        println "No txm file to process"
171 321 mdecorde
        return;
172 321 mdecorde
}
173 321 mdecorde
174 321 mdecorde
//get metadata values from CSV
175 321 mdecorde
Metadatas metadatas; // text metadata
176 321 mdecorde
println "Trying to read metadata from: "+allmetadatasfile
177 321 mdecorde
if (allmetadatasfile.exists()) {
178 966 mdecorde
        File copy = new File(binDir, allmetadatasfile.getName())
179 321 mdecorde
        if (!FileCopy.copy(allmetadatasfile, copy)) {
180 321 mdecorde
                println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
181 321 mdecorde
                return;
182 321 mdecorde
        }
183 788 mdecorde
        metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), Toolbox.getMetadataColumnSeparator(), Toolbox.getMetadataTextSeparator(), 1)
184 321 mdecorde
} else
185 321 mdecorde
        println "no metadata file: "+allmetadatasfile
186 321 mdecorde
187 321 mdecorde
if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
188 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
189 321 mdecorde
println "-- IMPORTER - Reading source files"
190 321 mdecorde
def imp = new importer();
191 321 mdecorde
imp.doValidation(true) // change this to not validate xml
192 321 mdecorde
imp.doTokenize(true) // change this, to not tokenize xml
193 321 mdecorde
imp.setStopIfMalformed(stopIfMalformed);
194 321 mdecorde
if (!imp.run( srcDir, binDir, txmDir, basename, null, lang)) {
195 321 mdecorde
        println "import process stopped";
196 321 mdecorde
        return;
197 321 mdecorde
}
198 321 mdecorde
199 321 mdecorde
// Metadata injection step: report progress, honor cancellation, then rewrite every file
// of txmDir with the metadata read earlier. Skipped entirely when no metadata was loaded.
if (MONITOR != null) MONITOR.worked(20, "INJECTING METADATA")
200 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
201 321 mdecorde
println "-- INJECTING METADATA - from csv file: "+allmetadatasfile
202 321 mdecorde
if (metadatas != null) {
203 321 mdecorde
        println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
204 321 mdecorde
        for (File infile : txmDir.listFiles()) {
205 321 mdecorde
                print "."
206 321 mdecorde
                // The result is written to a temporary file in the same directory as the input,
                // then swapped in place of the input on success.
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
207 321 mdecorde
                // On failure the temporary file is discarded and the original file is kept as is.
                if (!metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
208 321 mdecorde
                        outfile.delete();
209 321 mdecorde
                } else {
210 321 mdecorde
                        // Replace the original: delete it first, then rename the temporary file to its name.
                        if (!(infile.delete() && outfile.renameTo(infile))) println "Warning can't rename file "+outfile+" to "+infile
211 321 mdecorde
                        // If the original was deleted but the rename failed, the text is gone: abort the import.
                        if (!infile.exists()) {
212 321 mdecorde
                                println "Error: could not replace $infile by $outfile"
213 321 mdecorde
                                // NOTE(review): the other abort paths of this script use a bare "return";
                                // confirm whether any caller relies on this "false" result.
                                return false;
214 321 mdecorde
                        }
215 321 mdecorde
                }
216 321 mdecorde
        }
217 321 mdecorde
        println ""
218 321 mdecorde
}
219 321 mdecorde
220 321 mdecorde
List<File> files = txmDir.listFiles()
221 321 mdecorde
if (files == null || files.size() == 0) {
222 321 mdecorde
        return;
223 321 mdecorde
}
224 321 mdecorde
225 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
226 321 mdecorde
if (MONITOR != null) MONITOR.worked(10, "ANNOTATE")
227 321 mdecorde
println "-- ANNOTATE - Running NLP tools"
228 321 mdecorde
boolean annotationSuccess = false;
229 927 mdecorde
if (annotate) {
230 927 mdecorde
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
231 927 mdecorde
        if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
232 927 mdecorde
                annotationSuccess = true;
233 927 mdecorde
        }
234 321 mdecorde
}
235 321 mdecorde
236 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
237 321 mdecorde
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
238 321 mdecorde
println "-- COMPILING - Building Search Engine indexes"
239 321 mdecorde
def c = new compiler();
240 321 mdecorde
if(debug) c.setDebug();
241 321 mdecorde
//c.setCwbPath("~/TXM/cwb/bin");
242 321 mdecorde
c.setOptions(textSortAttribute, normalizeMetadata);
243 321 mdecorde
c.setAnnotationSuccess(annotationSuccess)
244 321 mdecorde
c.setLang(lang);
245 321 mdecorde
if (!c.run(binDir, txmDir, corpusname, null, srcfiles, metadatas)) {
246 321 mdecorde
        println "import process stopped";
247 321 mdecorde
        return;
248 321 mdecorde
}
249 321 mdecorde
250 321 mdecorde
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
251 321 mdecorde
252 321 mdecorde
new File(binDir,"HTML/$corpusname").deleteDir();
253 321 mdecorde
new File(binDir,"HTML/$corpusname").mkdirs();
254 321 mdecorde
if (build_edition) {
255 321 mdecorde
256 321 mdecorde
        println "-- EDITION - Building edition"
257 321 mdecorde
        if (MONITOR != null) MONITOR.worked(20, "EDITION")
258 321 mdecorde
259 321 mdecorde
        File outdir = new File(binDir,"/HTML/$corpusname/default/");
260 321 mdecorde
        outdir.mkdirs();
261 321 mdecorde
        List<File> filelist = txmDir.listFiles();
262 321 mdecorde
        Collections.sort(filelist);
263 321 mdecorde
        def second = 0
264 321 mdecorde
265 321 mdecorde
        println "Paginating texts: "
266 321 mdecorde
267 321 mdecorde
        for (File txmFile : filelist) {
268 321 mdecorde
                print "."
269 321 mdecorde
                String txtname = txmFile.getName();
270 321 mdecorde
                int i = txtname.lastIndexOf(".");
271 321 mdecorde
                if(i > 0) txtname = txtname.substring(0, i);
272 321 mdecorde
273 321 mdecorde
                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
274 321 mdecorde
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
275 321 mdecorde
276 321 mdecorde
                Element text = params.addText(corpusElem, txtname, txmFile);
277 321 mdecorde
278 321 mdecorde
                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
279 321 mdecorde
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
280 321 mdecorde
281 321 mdecorde
                for (i = 0 ; i < ed.getPageFiles().size();) {
282 321 mdecorde
                        File f = ed.getPageFiles().get(i);
283 321 mdecorde
                        String wordid = ed.getIdx().get(i);
284 321 mdecorde
                        params.addPage(edition, ""+(++i), wordid);
285 321 mdecorde
                }
286 321 mdecorde
        }
287 321 mdecorde
}
288 321 mdecorde
289 321 mdecorde
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
290 321 mdecorde
File paramFile = new File(binDir, "import.xml");
291 321 mdecorde
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true;