Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / factiva / factivaLoader.groovy @ 1488

History | View | Annotate | Download (9.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-02-13 11:42:24 +0100 (lun., 13 févr. 2012) $
25
// $LastChangedRevision: 2126 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.factiva;
29

    
30
import javax.xml.stream.XMLStreamReader;
31
import org.txm.importer.ApplyXsl2;
32
import org.txm.scripts.importer.Factiva2XmlTxm;
33
import org.txm.importer.ValidateXml;
34
import org.txm.scripts.importer.xml.importer;
35
import org.txm.scripts.importer.xml.compiler;
36
import org.txm.scripts.importer.xml.pager;
37
import org.txm.objects.*;
38
import org.txm.utils.*
39
import org.txm.utils.io.*;
40
import org.txm.*;
41
import org.txm.core.engines.*;
42
import org.txm.importer.scripts.xmltxm.*;
43
import org.txm.utils.i18n.*;
44
import org.txm.metadatas.*;
45
import javax.xml.stream.*;
46
import org.w3c.dom.Element
47
import org.txm.utils.xml.DomUtils;
48

    
49
// User home directory; not referenced in the visible part of this script.
String userDir = System.getProperty("user.home");

// Values expected in the script binding:
//  - "projectBinding": the Project to import (mandatory)
//  - "monitor": a progress monitor (optional)
def MONITOR;
Project project;

// Look the variables up explicitly instead of catching and discarding the
// exception raised by a missing or wrongly-typed variable. As before, the
// monitor is only picked up once the project has been resolved.
def boundProject = binding.hasVariable("projectBinding") ? binding.getVariable("projectBinding") : null
if (boundProject instanceof Project) project = boundProject
if (project != null && binding.hasVariable("monitor")) MONITOR = binding.getVariable("monitor")

if (project == null) { println "no project set. Aborting"; return; }
57

    
58
// ---- Import parameters, all read from the Project ----
String corpusname = project.getName()
String basename = corpusname            // same value as the corpus name
String rootDir = project.getSrcdir()
String lang = project.getLang()
String model = lang                     // same value as the language
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()

// ---- Parameters of the "default" edition ----
def defaultEdition = project.getEditionDefinition("default")
int wordsPerPage = defaultEdition.getWordsPerPage()
String page_element = defaultEdition.getPageElement()
boolean build_edition = defaultEdition.getBuildEdition()
70

    
71
// Source directory declared by the project
File srcDir = new File(rootDir)

// Output ("bin") directory of the project: abort when it cannot be created
File binDir = project.getProjectDirectory()
binDir.mkdirs()
if (!binDir.exists()) {
        println "Could not create binDir ${binDir}"
        return
}

// Always start from an empty txm/<corpusname> directory
File txmDir = new File(binDir, "txm/" + corpusname)
txmDir.deleteDir()
txmDir.mkdirs()
82

    
83
// Optional per-corpus settings, read from <rootDir>/import.properties
File propertyFile = new File(rootDir, "import.properties") // default
Properties props = new Properties();
String[] metadatasToKeep;

// Defaults used when the file, or one of its keys, is absent
String textSortAttribute = null;
String paginationElement = null;
boolean normalizeMetadata = false;
String ignoredElements = null;
boolean stopIfMalformed = false;

println "Trying to read import properties file: "+propertyFile
if (propertyFile.exists() && propertyFile.canRead()) {
        // withReader closes the underlying stream even when load() throws
        propertyFile.withReader("UTF-8") { reader -> props.load(reader) }

        // A key only overrides its default when it is present in the file
        textSortAttribute = props.getProperty("sortmetadata", textSortAttribute)
        paginationElement = props.getProperty("editionpage", paginationElement)
        ignoredElements = props.getProperty("ignoredelements", ignoredElements)
        if (props.getProperty("normalizemetadata") != null)
                normalizeMetadata = Boolean.parseBoolean(props.getProperty("normalizemetadata"))
        if (props.getProperty("stopIfMalformed") != null)
                stopIfMalformed = Boolean.parseBoolean(props.getProperty("stopIfMalformed"))

        println "import properties: "
        println " sort metadata: "+textSortAttribute
        println " edition page tag: "+paginationElement
        println " normalize attributes: "+normalizeMetadata
        println " ignored elements: "+ignoredElements
        println " stop if a XML source is malformed: "+stopIfMalformed
}
116

    
117
// Look for the metadata file in the original source directory, before
// srcDir is redirected to the conversion output below.
File allMetadataFile = Metadatas.findMetadataFile(srcDir);

// Apply Factiva2XMLTXM, with <binDir>/src given as output location
println "Factiva -> xml/w"
File factivaOutDir = new File(binDir, "src")
new Factiva2XmlTxm().run(srcDir, factivaOutDir.getAbsolutePath());

// listFiles() is null when the directory does not exist and empty when it
// contains nothing: in both cases there is no file to process.
File[] convertedFiles = factivaOutDir.listFiles()
if (convertedFiles == null || convertedFiles.length == 0) {
        println "Error: no file to process"
        return;
}

// From here on, the converted files are the import sources
srcDir = factivaOutDir;
128

    
129
// Apply XSL (only when a front XSL and its parameters are configured)
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(1, "APPLYING XSL")

boolean xslRequested = xsl != null && xslParams != null && xsl.trim().length() > 0
if (xslRequested) {
        File xslOutDir = new File(binDir, "src")
        // NOTE(review): the original brace-less "if" was followed by a
        // commented-out "return", so it has always governed the srcDir
        // assignment below. The braces only make that explicit; whether a
        // false result should abort the import is left to be confirmed.
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutDir, xslParams)) {
                srcDir = new File(binDir, "src")
        }
        println ""
}
138

    
139
// copy txm files: check XML format, and copy file into the txm directory
List<File> srcfiles = srcDir.listFiles()
Iterator<File> srcIterator = srcfiles.iterator()
while (srcIterator.hasNext()) {
        File candidate = srcIterator.next()
        String candidateName = candidate.getName()

        // import.xml, metadata.xxx / metadata.xxxx and *.properties are
        // dropped from the source list
        boolean auxiliary = candidateName.equals("import.xml") ||
                        candidateName.matches("metadata\\.....?") ||
                        candidateName.endsWith(".properties")
        if (auxiliary) {
                srcIterator.remove()
                continue
        }

        // files failing the check stay in srcfiles but are not copied
        if (!ValidateXml.test(candidate)) {
                println "Won't process file "+candidate
        } else {
                FileCopy.copy(candidate, new File(txmDir, candidateName))
        }
}

if (txmDir.listFiles() == null) {
        println "No txm file to process"
        return
}
159

    
160
//get metadata values from CSV
// metadatas stays null when no metadata file is available; later steps test it
Metadatas metadatas = null; // text metadata
println "Trying to read metadata from: "+allMetadataFile
// null guard: the nullability of findMetadataFile()'s result is not visible here
if (allMetadataFile != null && allMetadataFile.exists()) {
        // the metadata is read from a copy stored in the bin directory
        File copy = new File(binDir, allMetadataFile.getName())
        if (!FileCopy.copy(allMetadataFile, copy)) {
                println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile();
                return;
        }
        metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), Toolbox.getMetadataColumnSeparator(), Toolbox.getMetadataTextSeparator(), 1)
} else {
        println "no metadata file: "+allMetadataFile
}
172

    
173
// ---- IMPORTER step ----
MONITOR?.worked(1, "IMPORTER")
if (MONITOR?.isCanceled()) { return MONITOR.done() }
println "-- IMPORTER - Reading source files"

def sourceImporter = new importer()
sourceImporter.doValidation(true) // change this to not validate xml
sourceImporter.doTokenize(true) // change this, to not tokenize xml
sourceImporter.setStopIfMalformed(stopIfMalformed)

// NOTE(review): the fifth argument is null although "ignoredelements" is read
// from import.properties into ignoredElements - confirm this is intended.
boolean imported = sourceImporter.run(srcDir, binDir, txmDir, basename, null, lang)
if (!imported) {
        println "import process stopped"
        return
}
184

    
185
if (MONITOR != null) MONITOR.worked(20, "INJECTING METADATA")
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "-- INJECTING METADATA - from csv file: "+allMetadataFile
if (metadatas != null) {
        println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+new File(binDir,"txm"))
        // listFiles() is a snapshot: the temp files created below are not iterated
        for (File infile : txmDir.listFiles()) {
                print "."
                // the result is written next to the input, then swapped in
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
                if (!metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
                        outfile.delete();
                } else {
                        // Replace infile in a single move: the original is never deleted
                        // before its replacement is in place, and a failed move does not
                        // leave a stray temp .xml file among the texts.
                        try {
                                java.nio.file.Files.move(outfile.toPath(), infile.toPath(),
                                                java.nio.file.StandardCopyOption.REPLACE_EXISTING)
                        } catch (IOException e) {
                                println "Warning can't rename file "+outfile+" to "+infile
                                outfile.delete()
                        }
                        if (!infile.exists()) {
                                println "Error: could not replace $infile by $outfile"
                                return false;
                        }
                }
        }
        println ""
}

// Nothing left to annotate/compile: stop here
List<File> files = txmDir.listFiles()
if (files == null || files.size() == 0) {
        return;
}
210

    
211
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(10, "ANNOTATE")

// annotationSuccess stays false when annotation is disabled, unavailable or fails
boolean annotationSuccess = false;
if (annotate) {
        println "-- ANNOTATE - Running NLP tools"
        // Either lookup may yield null (e.g. engine not installed): skip the
        // annotation instead of failing with a NullPointerException.
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION)?.getEngine("TreeTagger")
        if (engine == null) {
                println "Warning: annotation engine TreeTagger is not available, texts will not be annotated"
        } else if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
                annotationSuccess = true;
        }
}
222

    
223
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
// "debug" is not declared in this script, so it can only come from the
// binding; treat a missing variable as "debug off" instead of failing.
if (binding.hasVariable("debug") && binding.getVariable("debug")) c.setDebug();
//c.setCwbPath("~/TXM/cwb/bin");
c.setOptions(textSortAttribute, normalizeMetadata);
c.setAnnotationSuccess(annotationSuccess)
c.setLang(lang);
if (!c.run(project, binDir, txmDir, corpusname, null, srcfiles, metadatas)) {
        println "import process stopped";
        return;
}
236

    
237
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML/<corpusname> directory is reset even when no edition is built
File htmlDir = new File(binDir,"HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

        println "-- EDITION - Building edition"
        if (MONITOR != null) MONITOR.worked(20, "EDITION")

        File outdir = new File(binDir,"/HTML/$corpusname/default/");
        outdir.mkdirs();
        List<File> filelist = txmDir.listFiles();
        Collections.sort(filelist);

        println "Paginating texts: "

        for (File txmFile : filelist) {
                print "."
                // text name = file name without its last extension
                String txtname = txmFile.getName();
                int dot = txtname.lastIndexOf(".");
                if (dot > 0) txtname = txtname.substring(0, dot);

                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                // the TXM file is declared as both source file and TXM file of the text
                Text t = new Text(project);
                t.setName(txtname);
                t.setSourceFile(txmFile)
                t.setTXMFile(txmFile)

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
                Edition edition = new Edition(t);
                edition.setName("default");
                edition.setIndex(outdir.getAbsolutePath());

                // One edition page per page file, numbered from 1. The word id is
                // taken from the pager index at the same position, or "w_0" when
                // the index is shorter than the page list.
                int pageCount = ed.getPageFiles().size()
                def pageWordIds = ed.getIdx()
                for (int page = 0 ; page < pageCount ; page++) {
                        String wordid = (page < pageWordIds.size()) ? pageWordIds.get(page) : "w_0"
                        edition.addPage(""+(page + 1), wordid);
                }
        }
}
281

    
282
// Last progress tick
MONITOR?.worked(20, "FINALIZING")

// readyToLoad is not declared in this script, so the assignment stores the
// result of project.save() in the script binding.
readyToLoad = project.save()