Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / frantext / frantextLoader.groovy @ 966

History | View | Annotate | Download (7.7 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
//
7 321 mdecorde
// This file is part of the TXM platform.
8 321 mdecorde
//
9 321 mdecorde
// The TXM platform is free software: you can redistribute it and/or modify
10 321 mdecorde
// it under the terms of the GNU General Public License as published by
11 321 mdecorde
// the Free Software Foundation, either version 3 of the License, or
12 321 mdecorde
// (at your option) any later version.
13 321 mdecorde
//
14 321 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
15 321 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16 321 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 321 mdecorde
// GNU General Public License for more details.
18 321 mdecorde
//
19 321 mdecorde
// You should have received a copy of the GNU General Public License
20 321 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 321 mdecorde
//
25 321 mdecorde
//
26 321 mdecorde
// $LastChangedDate: 2012-02-13 11:42:24 +0100 (lun., 13 févr. 2012) $
27 321 mdecorde
// $LastChangedRevision: 2126 $
28 321 mdecorde
// $LastChangedBy: mdecorde $
29 321 mdecorde
//
30 321 mdecorde
package org.txm.importer.frantext;
31 321 mdecorde
32 927 mdecorde
import javax.xml.stream.XMLStreamReader;
33 927 mdecorde
import org.txm.importer.ApplyXsl2;
34 321 mdecorde
import org.txm.importer.bfm.importer;
35 321 mdecorde
import org.txm.importer.bfm.compiler;
36 321 mdecorde
import org.txm.importer.bfm.pager;
37 321 mdecorde
import org.txm.objects.*;
38 321 mdecorde
import org.txm.*;
39 927 mdecorde
import org.txm.core.engines.*;
40 321 mdecorde
import org.txm.utils.i18n.*;
41 927 mdecorde
import org.txm.importer.xmltxm.*;
42 321 mdecorde
import org.txm.importer.*;
43 321 mdecorde
import org.w3c.dom.Element
44 479 mdecorde
import org.txm.utils.xml.DomUtils;
45 927 mdecorde
import org.txm.utils.*
46 928 mdecorde
import org.txm.utils.io.*;
47 321 mdecorde
48 321 mdecorde
// Bootstrap: the import parameters and the progress monitor are read from the
// script binding ('paramsBinding' and 'monitor'). If reading them raises an
// exception, the script switches to a development setup that loads a
// hard-coded test import.xml from the user's home directory.
String userDir = System.getProperty("user.home");
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
def MONITOR;
BaseParameters params;
try {
        params = paramsBinding;
        MONITOR = monitor;
} catch (Exception ignored) {
        // any exception here is taken to mean that we are debugging
        println "DEV MODE";
        debug = true;

        File devImportFile = new File(userDir, "xml/TESTS/frantext/import.xml");
        params = new BaseParameters(devImportFile);
        params.load();

        boolean toolboxReady = org.txm.Toolbox.isInitialized();
        if (!toolboxReady) {
                // minimal Toolbox configuration for a run outside of the application
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
                File userTxmHome = new File(System.getProperty("user.home"), "TXM");
                Toolbox.setParam(Toolbox.USER_TXM_HOME, userTxmHome);
        }
}

// nothing can be imported without parameters
if (params == null) {
        println "no parameters. Aborting";
        return;
}
68 321 mdecorde
69 321 mdecorde
// Import settings, read from the parameters and from the DOM element that
// describes the corpus being imported.
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// per-corpus attributes
String lang = corpusElem.getAttribute("lang")
String model = lang // the model passed to the annotation step is the corpus language
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = corpusElem.getAttribute("annotate") == "true"

// XSL settings declared in the parameters
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
81 321 mdecorde
82 321 mdecorde
// Working directories: the sources are read from rootDir, the binary corpus is
// rebuilt from scratch under <TXM home>/corpora/<basename>.
File srcDir = new File(rootDir)
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/"+basename)

// wipe any previous result, then recreate the directory
binDir.deleteDir()
binDir.mkdirs()
boolean binDirCreated = binDir.exists()
if (!binDirCreated) {
        println "Could not create binDir "+binDir
        return
}

// directory receiving the files handed to the importer
File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
94 321 mdecorde
95 321 mdecorde
// BFM XPATH PARAMETERS
// Loads the optional "import.properties" file of the source directory into
// 'metadataXPath'. When the file is missing or unreadable the properties stay
// empty and a message is printed.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "READ XPATH PARAMETERS")
File paramDeclarationFile = new File(srcDir, "import.properties");
Properties metadataXPath = new Properties();
if (paramDeclarationFile.exists() && paramDeclarationFile.canRead()) {
        // withReader closes the reader even when Properties.load throws,
        // which the previous manual open/close sequence did not guarantee
        paramDeclarationFile.withReader("UTF-8") { reader ->
                metadataXPath.load(reader);
        }

        if (!metadataXPath.containsKey("titre"))
                println "Warning: parameters property file does not contain the 'titre' metadata"
        if (!metadataXPath.containsKey("forme"))
                println "Warning: parameters property file does not contain the 'forme' metadata in param values. The default value is 'prose'"
}
else {
        println "No '$paramDeclarationFile' file found"
}
112 321 mdecorde
113 321 mdecorde
// Apply XSL
// Transforms the source files with the Frantext -> BFM stylesheet and makes
// the transformation output directory the new source directory.
if (MONITOR != null) MONITOR.worked(5, "APPLYING XSL")
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
// force xsl: the stylesheet of the TXM home is always used, the 'xsl' import
// parameter read earlier is not applied here
File xslFile = new File(Toolbox.getTxmHomePath(), "xsl/txm-filter-teifrantext-teibfm.xsl")
File xslOutputDir = new File(binDir, "xsl")

if (!ApplyXsl2.processImportSources(xslFile, srcDir, xslOutputDir)) {
        // report the stylesheet that was actually applied (the message used to
        // print the unused 'xsl' parameter instead)
        println "Error: failed to apply xsl $xslFile. Aborting import."
        return; // error during process
}
srcDir = xslOutputDir;
123 321 mdecorde
124 321 mdecorde
// copy txm files
// Every source file that passes the XML validation is copied into txmDir.
// Hidden files, "import.xml", "metadata.<3 or 4 characters>" files and
// ".properties" files are skipped.
println "-- VALIDATION - checking XML source files well-formedness"
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
for (File f : srcfiles) { // check XML format, and copy file into binDir
        String fileName = f.getName();
        // the dot of the pattern must be written "\\." : a lone "\." is not a
        // valid escape sequence in a Groovy double-quoted string
        if (f.isHidden() || fileName.equals("import.xml") || fileName.matches("metadata\\.....?") || fileName.endsWith(".properties"))
                continue;
        if (ValidateXml.test(f)) {
                FileCopy.copy(f, new File(txmDir, fileName));
        } else {
                println "Won't process file "+f;
        }
}
if (MONITOR != null) MONITOR.worked(5)

// File.listFiles() returns an empty array (not null) for an empty directory,
// so both cases must be tested to detect that nothing was copied
File[] txmFiles = txmDir.listFiles();
if (txmFiles == null || txmFiles.length == 0) {
        println "No txm file to process"
        return;
}
143 321 mdecorde
144 321 mdecorde
// IMPORTER step: runs the BFM importer on the prepared directories and stops
// the script when it reports a failure.
if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "-- IMPORTER - Reading source files"
def imp = new importer()
boolean imported = imp.run(srcDir, binDir, txmDir, basename, metadataXPath)
if (!imported) {
        println "import process stopped";
        return;
}
152 321 mdecorde
153 321 mdecorde
// ANNOTATE step: when annotation is requested, runs the "TreeTagger"
// annotation engine on txmDir and records whether it succeeded in
// 'annotate_status', which is later handed to the compiler.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools - $model model"
// NOTE(review): the status stays 'true' when annotation is not requested, as
// before - confirm this is what the compiler expects in that case
boolean annotate_status = true;
if (annotate) {
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
        // keep the real outcome: the flag used to be set to 'true' only, so a
        // failed annotation was still reported as done
        annotate_status = engine.processDirectory(txmDir, binDir, ["lang":model])
        if (!annotate_status) {
                println "Warning: TreeTagger annotation failed"
        }
}
163 321 mdecorde
164 321 mdecorde
// COMPILING step: configures the BFM compiler (debug mode, language,
// annotation status) and runs it; the script stops when it reports a failure.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
if (debug) {
        c.setDebug();
}
c.setLang(lang);
c.setAnnotationDone(annotate_status);

boolean compiled = c.run(binDir, txmDir, corpusname, metadataXPath)
if (!compiled) {
        println "import process stopped";
        return;
}
176 321 mdecorde
177 321 mdecorde
// EDITION step: the HTML directory of the corpus is always reset; when the
// edition is enabled, one paginated "default" HTML edition is built per TXM
// file and its texts, edition and pages are declared in the parameters.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

File htmlDir = new File(binDir, "HTML/$corpusname")
htmlDir.deleteDir()
htmlDir.mkdirs()

if (build_edition) {
        println "-- EDITION"
        if (MONITOR != null) MONITOR.worked(25, "EDITION")

        File outdir = new File(binDir, "/HTML/$corpusname/default/")
        outdir.mkdirs()
        // 'files' is left undeclared on purpose: it is a script binding variable
        files = c.getOrderedTxmFiles()
        println("Building editions: "+files.size()+" files" );

        for (File txmFile : files) {
                print "."

                // text name = file name without its last extension
                String fileName = txmFile.getName()
                int dot = fileName.lastIndexOf(".")
                String txtname = (dot > 0) ? fileName.substring(0, dot) : fileName

                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang)
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang)

                Element text = params.addText(corpusElem, txtname, txmFile)

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, corpusname, metadataXPath)
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html")

                // declare the pages: numbered from 1, each with the matching entry of ed.getIdx()
                // ('i' is a script binding variable, as 'files' above)
                for (i = 0; i < ed.getPageFiles().size(); i++) {
                        File f = ed.getPageFiles().get(i)
                        String wordid = ed.getIdx().get(i)
                        params.addPage(edition, ""+(i + 1), wordid)
                }
        }
}
211 321 mdecorde
// FINALIZING step: saves the updated parameters as import.xml in the binary
// directory and sets the 'readyToLoad' binding variable.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")

File paramFile = new File(binDir, "import.xml")
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile)

// left undeclared on purpose: it is written to the script binding
readyToLoad = true