Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxm / xmltxmLoader.groovy @ 1000

History | View | Annotate | Download (8.1 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 479 mdecorde
// $LastChangedDate: 2016-05-11 08:38:47 +0200 (mer. 11 mai 2016) $
23 321 mdecorde
// $LastChangedRevision: 3211 $
24 321 mdecorde
// $LastChangedBy: mdecorde $
25 321 mdecorde
//
26 986 mdecorde
package org.txm.scripts.importer.xmltxm
27 321 mdecorde
28 321 mdecorde
import javax.xml.stream.*;
29 321 mdecorde
import java.io.BufferedWriter;
30 321 mdecorde
import java.io.File;
31 321 mdecorde
import java.io.FileOutputStream;
32 321 mdecorde
import java.io.OutputStreamWriter;
33 321 mdecorde
import java.io.PrintStream;
34 321 mdecorde
import java.net.URL;
35 321 mdecorde
import java.text.DateFormat;
36 321 mdecorde
import java.text.SimpleDateFormat;
37 321 mdecorde
38 479 mdecorde
import java.io.File;
39 986 mdecorde
import org.txm.scripts.importer.*;
40 986 mdecorde
import org.txm.scripts.importer.xmltxm.compiler;
41 986 mdecorde
import org.txm.scripts.importer.xml.pager;
42 321 mdecorde
import org.txm.objects.*;
43 927 mdecorde
import org.txm.utils.*
44 928 mdecorde
import org.txm.utils.io.*;
45 321 mdecorde
import org.txm.*;
46 927 mdecorde
import org.txm.core.engines.*;
47 1000 mdecorde
48 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*;
49 321 mdecorde
import org.txm.metadatas.*;
50 321 mdecorde
import org.txm.utils.i18n.*;
51 967 mdecorde
import org.txm.utils.xml.*
52 321 mdecorde
import org.w3c.dom.Element;
53 1000 mdecorde
import org.txm.importer.*
54 321 mdecorde
55 321 mdecorde
// Resolve the import parameters and the progress monitor.
// "paramsBinding" and "monitor" are not declared in this script (presumably
// they come from the script binding); when reading them throws, the script
// switches to a hard-coded developer configuration.
String userDir = System.getProperty("user.home")
boolean debug = false
def MONITOR
BaseParameters params
try {
        params = paramsBinding
        MONITOR = monitor
} catch (Exception e) {
        // an exception here means we are debugging
        println "DEV MODE"
        debug = true

        File devImportFile = new File(userDir, "xml/anapovoas/FOLLOWING TEST OF MATTHIEU/OUTPUT 2/import.xml")
        params = new BaseParameters(devImportFile)
        params.load()

        // give the Toolbox a minimal configuration when nothing initialized it before
        if (!org.txm.Toolbox.isInitialized()) {
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"))
        }
}
if (params == null) {
        println "no parameters. Aborting"
        return
}
73 321 mdecorde
74 321 mdecorde
// Read the corpus description from the import parameters
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// attributes read from the corpus element
String lang = corpusElem.getAttribute("lang")
String model = lang
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// XSL stylesheet path and its parameters
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
String page_element = params.getPageElement("default")
boolean build_edition = params.getDoEdition("default")
87 321 mdecorde
88 321 mdecorde
// Prepare the working directories: the source directory, the binary corpus
// directory (deleted then recreated) and its "txm/<corpusname>" sub-directory.
File srcDir = new File(rootDir)

File binDir = new File(Toolbox.getTxmHomePath(), "corpora/" + basename)
binDir.deleteDir()
binDir.mkdirs()
if (!binDir.exists()) {
        println "Could not create binDir " + binDir
        return
}

File txmDir = new File(binDir, "txm/" + corpusname)
txmDir.deleteDir()
txmDir.mkdirs()

// only referenced by the commented-out metadata injection further down
File allMetadataFile = Metadatas.findMetadataFile(srcDir)
102 321 mdecorde
103 321 mdecorde
// Apply the XSL stylesheet to the source files when one is configured
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(1, "APPLYING XSL")

boolean hasStylesheet = xsl != null && xslParams != null && xsl.trim().length() > 0
if (hasStylesheet) {
        File xslOutputDir = new File(binDir, "src")
        // NOTE(review): the former "return" on error is commented out, so the
        // reassignment of srcDir is the body of this "if": the transformed
        // sources are only used when processImportSources returns true.
        // Confirm this is the intended behavior.
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir, xslParams)) {
                srcDir = new File(binDir, "src")
        }
        println ""
}
112 321 mdecorde
113 321 mdecorde
// Copy the XML-TXM source files that pass ValidateXml.test into txmDir
println "Copying XML-TXM files..."
List<File> srcfiles = srcDir.listFiles()
for (File candidate : srcfiles) {
        String fileName = candidate.getName()

        // hidden files, the import parameters, metadata.* files and properties files are not sources
        boolean notASource = candidate.isHidden() ||
                        fileName.equals("import.xml") ||
                        fileName.matches("metadata\\.....?") ||
                        fileName.endsWith(".properties")
        if (notASource) {
                continue
        }

        if (!ValidateXml.test(candidate)) {
                println "Won't process file " + candidate
                continue
        }
        FileCopy.copy(candidate, new File(txmDir, fileName))
}

// nothing to do when no file reached txmDir (null or empty listing)
def files = txmDir.listFiles()
if (!files) {
        println "No txm file to process"
        return
}
131 321 mdecorde
132 321 mdecorde
////get metadata values from CSV
133 321 mdecorde
//Metadatas metadatas; // text metadata
134 321 mdecorde
//if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
135 1000 mdecorde
//if (allMetadataFile.exists()) {
136 1000 mdecorde
//        println "Trying to read metadata from: "+allMetadataFile
137 321 mdecorde
//        File copy = new File(binDir, "metadata.csv")
138 1000 mdecorde
//        if (!FileCopy.copy(allMetadataFile, copy)) {
139 1000 mdecorde
//                println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile();
140 321 mdecorde
//                return;
141 321 mdecorde
//        }
142 788 mdecorde
//        metadatas = new Metadatas(copy, Toolbox.getMetadataEncoding(), Toolbox.getMetadataColumnSeparator(), Toolbox.getMetadataTextSeparator(), 1)
143 321 mdecorde
//} else {
144 1000 mdecorde
//        println "no metadata file: "+allMetadataFile
145 321 mdecorde
//}
146 321 mdecorde
//
147 321 mdecorde
//if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
148 1000 mdecorde
//if (MONITOR != null) MONITOR.worked(20, "INJECTING METADATA - from csv file: "+allMetadataFile)
149 321 mdecorde
//if (metadatas != null) {
150 321 mdecorde
//        println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+txmDir)
151 321 mdecorde
//        for (File infile : files) {
152 321 mdecorde
//                print "."
153 321 mdecorde
//                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
154 321 mdecorde
//
155 321 mdecorde
//                if (!metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
156 321 mdecorde
//                        outfile.delete();
157 321 mdecorde
//                } else {
158 321 mdecorde
//                        if (!(infile.delete() && outfile.renameTo(infile))) println "Warning can't rename file "+outfile+" to "+infile
159 321 mdecorde
//                        if (!infile.exists()) {
160 321 mdecorde
//                                println "Error: could not replace $infile by $outfile"
161 321 mdecorde
//                                return false;
162 321 mdecorde
//                        }
163 321 mdecorde
//                }
164 321 mdecorde
//        }
165 321 mdecorde
//        println ""
166 321 mdecorde
//}
167 321 mdecorde
168 321 mdecorde
// Sort the files on the "date" attribute read from /TEI/text (format dd.MM.yyyy)
// as soon as one file carries a usable date; otherwise use the natural order of the files.
def dates = [:]
def useDates = false
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd.MM.yyyy")
for (File infile : files) { // get dates
        String str = GetAttributeValue.process(infile, "/TEI/text", "date")

        // "N/A" is the value this script treats as "no date"; a null value is skipped as well
        if (str == null || str == "N/A") {
                continue
        }

        try {
                dates[infile] = simpleDateFormat.parse(str)
                useDates = true
        } catch (java.text.ParseException e) {
                // a malformed date must not abort the whole import: the file is handled as undated
                println "Warning: could not parse date '" + str + "' of file " + infile + ": " + e.getMessage()
        }
}

//sort
if (useDates) {
        files.sort() { f -> dates[f] }
} else {
        files.sort()
}
187 321 mdecorde
188 321 mdecorde
// Run the compiler on the XML-TXM files; stop the import when it reports a failure
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"

def c = new compiler()
if (debug) {
        c.setDebug()
}
c.setLang(lang)

boolean compiled = c.run(binDir, txmDir, basename, corpusname, Arrays.asList(files))
if (!compiled) {
        println "import process stopped"
        return
}

if (MONITOR?.isCanceled()) { return MONITOR.done() }
200 321 mdecorde
201 321 mdecorde
// Recreate the HTML directory of the corpus, then, when the edition is enabled,
// paginate every text with the pager and register the text, its "default"
// edition and its pages in the import parameters.
File htmlDir = new File(binDir, "HTML/$corpusname")
htmlDir.deleteDir()
htmlDir.mkdirs()
if (build_edition) {

        if (MONITOR != null) MONITOR.worked(20, "EDITION")
        println "-- EDITION - Building edition"

        File outdir = new File(binDir, "/HTML/$corpusname/default/")
        outdir.mkdirs()

        // number of texts already reported on the console
        int paginated = 0

        println "Paginating text: "
        for (File srcfile : files) {

                // the text name is the file name without its extension
                String txtname = srcfile.getName()
                int dot = txtname.lastIndexOf(".")
                if (dot > 0) txtname = txtname.substring(0, dot)

                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang)
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang)

                Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile)

                // progress output: "." per text, ", " between texts, line break every 5 texts.
                // The counter is incremented for every text; it used to be incremented only
                // behind a "> 0" test and therefore never left 0.
                if (paginated > 0) {
                        print(", ")
                        if (paginated % 5 == 0) println ""
                }
                paginated++
                print(".")

                def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element)
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html")

                // register each page under its 1-based number with the word id found at the same index
                int pageCount = ed.getPageFiles().size()
                for (int p = 0; p < pageCount; p++) {
                        String wordid = ed.getIdx().get(p)
                        params.addPage(edition, "" + (p + 1), wordid)
                }
        }
}
239 321 mdecorde
// Save the parameters document as import.xml in the binary directory, then
// set "readyToLoad" (not declared in this script, presumably a binding variable).
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "FINALIZING")

File paramFile = new File(binDir, "import.xml")
DomUtils.save(params.root.getOwnerDocument(), paramFile)
readyToLoad = true