Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bfm / bfmLoader.groovy @ 479

History | View | Annotate | Download (8.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
//
7
// This file is part of the TXM platform.
8
//
9
// The TXM platform is free software: you can redistribute it and/or modify
10
// it under the terms of the GNU General Public License as published by
11
// the Free Software Foundation, either version 3 of the License, or
12
// (at your option) any later version.
13
//
14
// The TXM platform is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
// GNU General Public License for more details.
18
//
19
// You should have received a copy of the GNU General Public License
20
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-03-01 10:10:58 +0100 (mar. 01 mars 2016) $
25
// $LastChangedRevision: 3133 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.bfm;
29

    
30
import javax.xml.stream.XMLStreamReader;
31
import org.txm.importer.ApplyXsl2;
32
import org.txm.importer.bfm.importer;
33
import org.txm.importer.bfm.compiler;
34
import org.txm.importer.bfm.pager;
35
import org.txm.objects.*;
36
import org.txm.*;
37
import org.txm.utils.i18n.*;
38
import org.txm.scripts.teitxm.*;
39
import org.txm.importer.*;
40
import org.w3c.dom.Element
41
import org.txm.utils.xml.DomUtils;
42
import org.txm.utils.*;
43

    
44
// Resolve the import parameters and the progress monitor.
// 'paramsBinding' and 'monitor' are not declared in this script, so they are
// looked up outside of it (presumably in the script binding set by the caller).
// When that lookup fails the script switches to a stand-alone "DEV MODE":
// parameters are read from ~/xml/bfm/import.xml and, if the Toolbox is not
// initialized yet, hard-coded developer settings are installed.
String userDir = System.getProperty("user.home");
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
def MONITOR;
BaseParameters params;
try {
        params = paramsBinding;
        MONITOR = monitor;
} catch (Exception e) { // exception means we debug
        println "DEV MODE";
        debug = true
        params = new BaseParameters(new File(userDir, "xml/bfm/import.xml"))
        params.load()
        if (!org.txm.Toolbox.isInitialized()) {
                //rootDir = userDir+"/xml/TESTS/alceste";  // directory which contains the source file

                Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
                //Toolbox.setParam(Toolbox.INSTALL_DIR,new File("C:\\Program Files\\TXM"));//For Windows
                Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger"));
                //Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File("C:\\Program Files\\treetagger"));//for Windows
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models"));
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
                // Kept commented out like the other Windows alternatives: left active,
                // it overwrote the TREETAGGER_MODELS_PATH value set a few lines above.
                //Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File("C:\\Program Files\\treetagger\\models"));//for Windows
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
        }
}
if (params == null) { println "no parameters. Aborting"; return; }
69

    
70
// Read the import settings from the parameters object.
// The per-corpus values are attributes of the corpus element registered under
// the corpus name.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

String lang = corpusElem.getAttribute("lang");
String model = lang // the model name passed to the annotation step is the language code
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// XSL stylesheet (optional) and its parameters
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
82

    
83
// Source directory, and a freshly emptied binary corpus directory
// <USER_TXM_HOME>/corpora/<basename>.
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) { // creation failed: nothing can be written, give up
        println "Could not create binDir "+binDir
        return;
}

// Directory that receives the files of this corpus: <binDir>/txm/<corpusname>
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
95

    
96
// BFM XPATH PARAMETERS
// Load the optional 'import.properties' file of the source directory (read as
// UTF-8) into metadataXPath. A missing or unreadable file is only reported;
// metadataXPath then stays empty.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "READ XPATH PARAMETERS")
File paramDeclarationFile = new File(srcDir, "import.properties");
Properties metadataXPath = new Properties();
if (paramDeclarationFile.exists() && paramDeclarationFile.canRead()) {
        // withReader closes the reader even when load() throws
        paramDeclarationFile.withReader("UTF-8") { Reader input ->
                metadataXPath.load(input);
        }

        if (!metadataXPath.containsKey("titre"))
                println "Warning: parameters property file does not contain the 'titre' metadata"
        if (!metadataXPath.containsKey("forme"))
                println "Warning: parameters property file does not contain the 'forme' metadata in param values. The default value is 'prose'"
} else {
        println "No '$paramDeclarationFile' file found"
}
113

    
114
// Apply XSL
// Runs only when a non-blank stylesheet path and XSL parameters are set.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "APPLYING XSL")
if (xsl != null && xslParams != null && xsl.trim().length() > 0) {
        // The import switches to <binDir>/src only when processImportSources()
        // returns true; otherwise it goes on with the original source directory.
        // Braces added: the former body of this 'if' ("return; // error during
        // process") had been commented out, which left the assignment below as
        // the implicit body of a brace-less 'if'.
        // NOTE(review): a false result is not reported to the user - confirm
        // that silently continuing with the untransformed sources is intended.
        if (ApplyXsl2.processImportSources(new File(xsl), srcDir, new File(binDir, "src"), xslParams)) {
                srcDir = new File(binDir, "src");
        }
        println ""
}
123

    
124
// copy txm files
// Every visible file of the source directory, except import.xml, metadata.csv
// and *.properties files, is tested with ValidateXml and copied into txmDir
// when the test passes. The import stops when nothing was copied.
println "-- VALIDATION - checking XML source files well-formedness"
List<File> srcfiles = srcDir.listFiles();
for (File f : srcfiles) { // check XML format, and copy file into binDir
        if (f.isHidden() || f.getName().equals("import.xml") || f.getName().equals("metadata.csv") || f.getName().endsWith(".properties"))
                continue;
        if (ValidateXml.test(f)) {
                FileCopy.copy(f, new File(txmDir, f.getName()));
        } else {
                println "Won't process file "+f;
        }
}

// listFiles() returns an empty array (not null) for an existing empty
// directory, so the emptiness must be tested as well as null.
File[] copiedFiles = txmDir.listFiles();
if (copiedFiles == null || copiedFiles.length == 0) {
        println "No txm file to process"
        return;
}
141

    
142
// IMPORTER step: run the BFM importer on the source files; stop the whole
// import when it reports a failure.
MONITOR?.worked(1, "IMPORTER")
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
println "-- IMPORTER - Reading source files"
def imp = new importer()
boolean importDone = imp.run(srcDir, binDir, txmDir, basename, metadataXPath)
if (!importDone) {
        println "import process stopped";
        return;
}
150

    
151
// ANNOTATE step: when annotation is requested, run the NLP tools with the
// "<model>.par" model. A failure does not stop the import; it is recorded in
// annotate_status.
// NOTE(review): annotate_status also stays true when annotation is not
// requested at all - confirm this is what the compiler step expects.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools - $model model"
boolean annotate_status = true;
if (annotate) {
        boolean annotated = new Annotate().run(binDir, txmDir,model+".par")
        if (!annotated) {
                println "Skipping Annotate process";
                annotate_status = false;
        }
}
159

    
160
//if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
161
//if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
162
//println "-- ANNOTATE - Running NLP tools - fr model"
163
//annotate_status = true;
164
//if (annotate && !(new Annotate().run(binDir, txmDir,"fr.par"))) {
165
//        println "Skipping Annotate process";
166
//        annotate_status = false;
167
//}
168

    
169
// COMPILING step: configure the compiler (debug flag, language, annotation
// status) and run it; stop the whole import when it reports a failure.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
if (debug) {
        c.setDebug();
}
//c.setCwbPath("~/TXM/cwb/bin");
c.setLang(lang);
c.setAnnotationDone(annotate_status)
boolean compiled = c.run(binDir, txmDir, corpusname, metadataXPath)
if (!compiled) {
        println "import process stopped";
        return;
}
181

    
182
// EDITION step: recreate <binDir>/HTML/<corpusname> and, when the default
// edition is requested, run the pager on each file returned by the compiler
// and register the resulting text, edition and pages in the parameters.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

File htmlDir = new File(binDir,"HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

        println "-- EDITION"
        if (MONITOR != null) MONITOR.worked(25, "EDITION")
        File outdir = new File(binDir,"/HTML/$corpusname/default/");
        outdir.mkdirs();
        // NOTE(review): 'files' is not declared, so this assignment goes to the
        // script binding; kept as is because a caller may read it - confirm.
        files = c.getOrderedTxmFiles();

        println("Building editions: "+files.size()+" files" );
        for (File txmFile : files) {
                print "."
                // text name = file name without its last extension
                String txtname = txmFile.getName();
                int idx = txtname.lastIndexOf(".");
                if (idx > 0) txtname = txtname.substring(0, idx);
                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);//["'","(","[","{","«"];

                Element text = params.addText(corpusElem, txtname, txmFile);

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, corpusname, metadataXPath);
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

                // One page entry per page file, numbered from 1, paired with the
                // entry of ed.getIdx() at the same position.
                // The counter is now a declared local (it used to leak into the
                // script binding) and is incremented in the loop header.
                int pageCount = ed.getPageFiles().size();
                for (int i = 0 ; i < pageCount ; i++) {
                        String wordid = ed.getIdx().get(i);
                        params.addPage(edition, ""+(i + 1), wordid);
                }
        }
}
215
// FINALIZING step: save the parameters document as <binDir>/import.xml and
// flag the import as ready to load.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// 'readyToLoad' is not declared in this script: the assignment goes to the
// script binding (presumably read by the caller - confirm).
readyToLoad = true;