Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xmltxm / xmltxmLoader.groovy @ 148

History | View | Annotate | Download (8.2 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
// $LastChangedDate: 2016-05-11 08:38:47 +0200 (Wed, 11 May 2016) $
23
// $LastChangedRevision: 3211 $
24
// $LastChangedBy: mdecorde $
25
//
26
package org.txm.importer.xmltxm
27

    
28
import javax.xml.stream.*;
29

    
30
import java.io.BufferedWriter;
31
import java.io.File;
32
import java.io.FileOutputStream;
33
import java.io.OutputStreamWriter;
34
import java.io.PrintStream;
35
import java.net.URL;
36
import java.text.DateFormat;
37
import java.text.SimpleDateFormat;
38
import java.io.File;
39

    
40
import org.txm.importer.*;
41
import org.txm.importer.xmltxm.compiler;
42
import org.txm.importer.xml.pager;
43
import org.txm.objects.*;
44
import org.txm.utils.*;
45
import org.txm.*;
46
import org.txm.scripts.teitxm.*;
47
import org.txm.metadatas.*;
48
import org.txm.utils.i18n.*;
49
import org.w3c.dom.Element;
50

    
51
// Resolve the import parameters.
// `paramsBinding` and `monitor` are not declared in this script, so they are
// looked up in the script binding; when that lookup fails the script falls
// back to a hard-coded developer configuration ("DEV MODE").
String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
try {
	params = paramsBinding;
	MONITOR = monitor
} catch (Exception ignored) {
	// exception means we debug
	println "DEV MODE";
	debug = true

	File devImportFile = new File(userDir, "xml/anapovoas/FOLLOWING TEST OF MATTHIEU/OUTPUT 2/import.xml")
	params = new BaseParameters(devImportFile)
	params.load()

	// the Toolbox is only configured here when nothing initialized it before
	if (!Toolbox.isInitialized()) {
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
	}
}

// nothing can be imported without parameters
if (params == null) {
	println "no parameters. Aborting";
	return;
}
71

    
72
// ---- import settings read from the parameters ----

// corpus identity and location of the sources
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// attributes carried by the corpus element
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// optional front XSL stylesheet and its parameters
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
String page_element = params.getPageElement("default")
boolean build_edition = params.getDoEdition("default")
85

    
86
// Directory holding the source files of the import
File srcDir = new File(rootDir);

// Binary directory of the corpus: any previous content is removed, then the
// directory is created again; the import stops if it still does not exist.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/" + basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}

// Working directory receiving the XML-TXM files of this corpus (reset as well)
File txmDir = new File(binDir, "txm/" + corpusname);
txmDir.deleteDir();
txmDir.mkdirs();

// Optional metadata file expected next to the sources
File allmetadatasfile = new File(srcDir, "metadata.csv");
100

    
101
// Apply XSL
// Optional step: when a stylesheet and its parameters are configured, the
// sources are processed with ApplyXsl2 into binDir/src.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "APPLYING XSL")
if (xsl != null && xslParams != null && xsl.trim().length() > 0) {
	File xslOutputDir = new File(binDir, "src");
	// NOTE: srcDir is switched to the transformed copy only when
	// processImportSources() returns true; otherwise the import goes on with
	// the untouched sources (the former "return on error" is disabled).
	if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir, xslParams)) {
		// return; // error during process
		srcDir = xslOutputDir;
	}
	println ""
}
110

    
111
// copy txm files
// Every visible source file that is not an import side file (import.xml,
// metadata.csv, *.properties) and that passes ValidateXml.test() is copied
// into txmDir; the others are reported and left out.
println "Copying XML-TXM files..."
List<File> srcfiles = srcDir.listFiles();
for (File candidate : srcfiles) {
	String candidateName = candidate.getName();
	boolean sideFile = candidateName.equals("import.xml") || candidateName.equals("metadata.csv") || candidateName.endsWith(".properties");
	if (candidate.isHidden() || sideFile) {
		continue;
	}

	if (!ValidateXml.test(candidate)) {
		println "Won't process file "+candidate;
		continue;
	}
	FileCopy.copy(candidate, new File(txmDir, candidateName));
}

// the remaining steps work on the copies only
def files = txmDir.listFiles();
if (!files) { // null or empty listing
	println "No txm file to process"
	return;
}
129

    
130
////get metadata values from CSV
131
//Metadatas metadatas; // text metadata
132
//if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
133
//if (allmetadatasfile.exists()) {
134
//        println "Trying to read metadata from: "+allmetadatasfile
135
//        File copy = new File(binDir, "metadata.csv")
136
//        if (!FileCopy.copy(allmetadatasfile, copy)) {
137
//                println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
138
//                return;
139
//        }
140
//        metadatas = new Metadatas(copy, Toolbox.getParam(Toolbox.METADATA_ENCODING), Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR), Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR), 1)
141
//} else {
142
//        println "no metadata file: "+allmetadatasfile
143
//}
144
//
145
//if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
146
//if (MONITOR != null) MONITOR.worked(20, "INJECTING METADATA - from csv file: "+allmetadatasfile)
147
//if (metadatas != null) {
148
//        println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+txmDir)
149
//        for (File infile : files) {
150
//                print "."
151
//                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
152
//
153
//                if (!metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
154
//                        outfile.delete();
155
//                } else {
156
//                        if (!(infile.delete() && outfile.renameTo(infile))) println "Warning can't rename file "+outfile+" to "+infile
157
//                        if (!infile.exists()) {
158
//                                println "Error: could not replace $infile by $outfile"
159
//                                return false;
160
//                        }
161
//                }
162
//        }
163
//        println ""
164
//}
165

    
166
//sorting files per metadata "date" with format
// The "date" attribute of /TEI/text is read for every file. As soon as one
// file provides a value other than "N/A", the files are ordered by date
// (pattern dd.MM.yyyy); otherwise they are put in the default File order.
def dates = [:]
def useDates = false
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd.MM.yyyy");
for (File infile : files) { // get dates
	String str = GetAttributeValue.process(infile, "/TEI/text", "date")

	if (str != "N/A") {
		// parse() throws a ParseException when the value does not match the pattern
		dates[infile] = simpleDateFormat.parse(str)
		useDates = true
	}
}

//sort
if (useDates) {
	// On an array, Groovy's sort(Closure) returns a NEW sorted array and leaves
	// the receiver untouched: the result must be assigned back, otherwise the
	// date ordering is silently lost.
	files = files.sort { f -> dates[f] }
} else {
	files = files.sort()
}
185

    
186
// COMPILING step: hand the XML-TXM files over to the compiler of this import
// module; the import stops when the compiler reports a failure.
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"

def indexCompiler = new compiler();
if (debug) {
	indexCompiler.setDebug();
}
indexCompiler.setLang(lang);

boolean compiled = indexCompiler.run(binDir, txmDir, basename, corpusname, Arrays.asList(files))
if (!compiled) {
	println "import process stopped";
	return;
}
196

    
197
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML directory of the corpus is always reset, even when no edition is built.
File htmlDir = new File(binDir,"HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

	if (MONITOR != null) MONITOR.worked(20, "EDITION")
	println "-- EDITION - Building edition"

	File outdir = new File(binDir,"/HTML/$corpusname/default/");
	outdir.mkdirs();

	// One "." is printed per paginated text. The former `second` counter was
	// never incremented, so its separator/line-break branches could not run;
	// it has been removed without changing the output.
	println "Paginating text: "
	for (File srcfile : files) {

		// text name = file name without its last extension
		String txtname = srcfile.getName();
		int dotIndex = txtname.lastIndexOf(".");
		if (dotIndex > 0) txtname = txtname.substring(0, dotIndex);

		List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
		List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

		// declare the text in the import parameters
		Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

		print(".");

		// paginate the text, then declare its "default" edition and pages
		def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element);
		Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

		// pages are numbered from 1; each one is registered with the entry of
		// ed.getIdx() found at the same position
		for (int pageIndex = 0; pageIndex < ed.getPageFiles().size(); pageIndex++) {
			String wordid = ed.getIdx().get(pageIndex);
			params.addPage(edition, "" + (pageIndex + 1), wordid);
		}
	}
}
237
// FINALIZING step: write the completed import parameters into the binary
// directory of the corpus.
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "FINALIZING")
}

File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);

// `readyToLoad` is not declared in this script, so the assignment is stored
// in the script binding.
readyToLoad = true;