Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bfm / bfmLoader.groovy @ 966

History | View | Annotate | Download (7.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
//
7
// This file is part of the TXM platform.
8
//
9
// The TXM platform is free software: you can redistribute it and/or modify
10
// it under the terms of the GNU General Public License as published by
11
// the Free Software Foundation, either version 3 of the License, or
12
// (at your option) any later version.
13
//
14
// The TXM platform is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
// GNU General Public License for more details.
18
//
19
// You should have received a copy of the GNU General Public License
20
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-03-01 10:10:58 +0100 (mar. 01 mars 2016) $
25
// $LastChangedRevision: 3133 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.bfm;
29

    
30
import javax.xml.stream.XMLStreamReader;
31
import org.txm.importer.ApplyXsl2;
32
import org.txm.importer.bfm.importer;
33
import org.txm.importer.bfm.compiler;
34
import org.txm.importer.bfm.pager;
35
import org.txm.objects.*;
36
import org.txm.*;
37
import org.txm.core.engines.*;
38
import org.txm.utils.i18n.*;
39
import org.txm.importer.xmltxm.*;
40
import org.txm.importer.*;
41
import org.w3c.dom.Element
42
import org.txm.utils.xml.DomUtils;
43
import org.txm.utils.*
44
import org.txm.utils.io.*;
45

    
46
// Resolve the import parameters and the progress monitor.
// 'paramsBinding' and 'monitor' are looked up as script variables; when the lookup
// throws, the script falls back to a standalone "DEV MODE" configuration.
String userDir = System.getProperty("user.home")
boolean debug = org.txm.utils.logger.Log.isPrintingErrors()
def MONITOR
BaseParameters params
try {
        params = paramsBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // an exception here means the script is being debugged
        println "DEV MODE"
        debug = true
        params = new BaseParameters(new File(userDir, "xml/bfm/import.xml"))
        params.load()
        if (!org.txm.Toolbox.isInitialized()) {
                //rootDir = userDir+"/xml/TESTS/alceste";  // directory which contains the source file

                // Hard-coded development settings
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
                //Toolbox.setParam(Toolbox.INSTALL_DIR,new File("C:\\Program Files\\TXM"));//For Windows
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
                // for Windows
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File("C:\\Program Files\\treetagger\\models"))
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"))
        }
}

// Nothing can be imported without parameters
if (params == null) {
        println "no parameters. Aborting"
        return
}
68

    
69
// Corpus-level settings read from the import parameters
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// Attributes declared on the corpus element
String lang = corpusElem.getAttribute("lang")
String model = lang // the annotation model name is the corpus language value
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// XSL stylesheet path and its parameters
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// Settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
81

    
82
// Source directory given by the parameters
File srcDir = new File(rootDir)

// Start from an empty binary directory: corpora/<basename> under the TXM home
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/"+basename)
binDir.deleteDir()
binDir.mkdirs()
boolean binDirReady = binDir.exists()
if (!binDirReady) {
        println "Could not create binDir "+binDir
        return
}

// Directory receiving the files of this corpus, emptied and recreated as well
File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
94

    
95
// BFM XPATH PARAMETERS
// Loads the optional "import.properties" file of the source directory into 'metadataXPath'.
// The properties stay empty when the file is missing or unreadable.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "READ XPATH PARAMETERS")
File paramDeclarationFile = new File(srcDir, "import.properties")
Properties metadataXPath = new Properties()
if (paramDeclarationFile.exists() && paramDeclarationFile.canRead()) {
        // withReader closes the reader even when load() throws, so the file handle cannot leak
        paramDeclarationFile.withReader("UTF-8") { reader ->
                metadataXPath.load(reader)
        }

        // Only warn about the missing entries, the import goes on
        if (!metadataXPath.containsKey("titre")) {
                println "Warning: parameters property file does not contain the 'titre' metadata"
        }
        if (!metadataXPath.containsKey("forme")) {
                println "Warning: parameters property file does not contain the 'forme' metadata in param values. The default value is 'prose'"
        }
} else {
        println "No '$paramDeclarationFile' file found"
}
112

    
113
// Apply XSL
// Runs only when a non-blank stylesheet path and its parameters are both available.
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(1, "APPLYING XSL")
}
boolean xslRequested = xsl != null && xslParams != null && xsl.trim().length() > 0
if (xslRequested) {
        File xslOutputDir = new File(binDir, "src")
        boolean transformed = ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutputDir, xslParams)
        // When the call reports success the transformed files become the new sources;
        // otherwise the import silently goes on with the original source directory.
        if (transformed) {
                srcDir = new File(binDir, "src")
        }
        println ""
}
122

    
123
// copy txm files
// Every source file that passes ValidateXml.test is copied into txmDir.
// Hidden files, "import.xml", "metadata.<3 or 4 characters>" and "*.properties" files are skipped.
println "-- VALIDATION - checking XML source files well-formedness"
List<File> srcfiles = srcDir.listFiles()
for (File f : srcfiles) { // check XML format, and copy file into binDir
        String fileName = f.getName()
        // the dot after "metadata" must be escaped with a double backslash inside a double-quoted string
        boolean skipped = f.isHidden() ||
                        fileName.equals("import.xml") ||
                        fileName.matches("metadata\\.....?") ||
                        fileName.endsWith(".properties")
        if (skipped) {
                continue
        }
        if (ValidateXml.test(f)) {
                FileCopy.copy(f, new File(txmDir, fileName))
        } else {
                println "Won't process file "+f
        }
}

// listFiles() yields null only when txmDir is not a readable directory; an existing but
// empty directory yields an empty array, so both cases mean there is nothing to import.
File[] txmFiles = txmDir.listFiles()
if (txmFiles == null || txmFiles.length == 0) {
        println "No txm file to process"
        return
}
140

    
141
// IMPORTER step: progress is reported first, then cancellation is checked
if (MONITOR != null) {
        MONITOR.worked(1, "IMPORTER")
        if (MONITOR.isCanceled()) { return MONITOR.done() }
}
println "-- IMPORTER - Reading source files"
def imp = new importer()
boolean imported = imp.run(srcDir, binDir, txmDir, basename, metadataXPath)
if (!imported) {
        // the importer reported a failure: stop the whole import
        println "import process stopped"
        return
}
149

    
150
// ANNOTATE step: runs the "TreeTagger" annotation engine on txmDir when annotation is requested
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(20, "ANNOTATE")
}
println "-- ANNOTATE - Running NLP tools - $model model"
// NOTE(review): the status starts at true and is only ever set to true, so it stays true
// when annotation is skipped or fails - confirm whether 'false' was intended as the initial value.
boolean annotate_status = true
if (annotate) {
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
        boolean annotated = engine.processDirectory(txmDir, binDir, ["lang":model])
        if (annotated) {
                annotate_status = true
        }
}
160

    
161
// COMPILING step: configures the compiler and runs it on the files of txmDir
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler()
if (debug) {
        c.setDebug()
}
//c.setCwbPath("~/TXM/cwb/bin");
c.setLang(lang)
c.setAnnotationDone(annotate_status)
boolean compiled = c.run(binDir, txmDir, corpusname, metadataXPath)
if (!compiled) {
        // the compiler reported a failure: stop the whole import
        println "import process stopped"
        return
}
173

    
174
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// EDITION step: the HTML directory of the corpus is always emptied and recreated;
// the pages of the "default" edition are only built when 'build_edition' is set.
File htmlDir = new File(binDir, "HTML/$corpusname")
htmlDir.deleteDir()
htmlDir.mkdirs()
if (build_edition) {

        println "-- EDITION"
        if (MONITOR != null) MONITOR.worked(25, "EDITION")
        File outdir = new File(binDir,"/HTML/$corpusname/default/")
        outdir.mkdirs()
        // NOTE(review): 'files' has no declaration, so it is stored in the script binding;
        // left unchanged in case the caller reads it - confirm and declare it locally otherwise.
        files = c.getOrderedTxmFiles()

        println("Building editions: "+files.size()+" files" )
        for (File txmFile : files) {
                print "."
                // text name = file name without its last extension
                String txtname = txmFile.getName()
                int idx = txtname.lastIndexOf(".")
                if (idx > 0) txtname = txtname.substring(0, idx)
                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang)
                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang) //["'","(","[","{","«"];

                Element text = params.addText(corpusElem, txtname, txmFile)

                def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, corpusname, metadataXPath)
                Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html")

                // Register one page per page file; pages are numbered from 1.
                // The counter is a declared local so that it no longer leaks into the script binding.
                for (int pageIndex = 0 ; pageIndex < ed.getPageFiles().size() ; pageIndex++) {
                        String wordid = ed.getIdx().get(pageIndex)
                        params.addPage(edition, ""+(pageIndex + 1), wordid)
                }
        }
}
207
// FINALIZING step: writes the parameters document to "import.xml" in the binary directory
if (MONITOR != null) {
        if (MONITOR.isCanceled()) { return MONITOR.done() }
        MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml")
DomUtils.save(params.root.getOwnerDocument(), paramFile)
// 'readyToLoad' is undeclared, hence stored in the script binding
// (presumably read by the caller to know the import succeeded - confirm)
readyToLoad = true