Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / frantext / frantextLoader.groovy @ 1000

History | View | Annotate | Download (7.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
//
7
// This file is part of the TXM platform.
8
//
9
// The TXM platform is free software: you can redistribute it and/or modify
10
// it under the terms of the GNU General Public License as published by
11
// the Free Software Foundation, either version 3 of the License, or
12
// (at your option) any later version.
13
//
14
// The TXM platform is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
// GNU General Public License for more details.
18
//
19
// You should have received a copy of the GNU General Public License
20
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
21
//
22
//
23
//
24
//
25
//
26
// $LastChangedDate: 2012-02-13 11:42:24 +0100 (lun., 13 févr. 2012) $
27
// $LastChangedRevision: 2126 $
28
// $LastChangedBy: mdecorde $
29
//
30
package org.txm.scripts.importer.frantext;
31

    
32
import javax.xml.stream.XMLStreamReader;
33
import org.txm.importer.ApplyXsl2;
34
import org.txm.scripts.importer.bfm.importer;
35
import org.txm.scripts.importer.bfm.compiler;
36
import org.txm.scripts.importer.bfm.pager;
37
import org.txm.objects.*;
38
import org.txm.*;
39
import org.txm.core.engines.*;
40
import org.txm.utils.i18n.*;
41
import org.txm.importer.scripts.xmltxm.*;
42
import org.txm.scripts.importer.*;
43
import org.w3c.dom.Element
44
import org.txm.utils.xml.DomUtils;
45
import org.txm.utils.*
46
import org.txm.utils.io.*;
47

    
48
// ---------------------------------------------------------------------------
// Resolve the import parameters and the progress monitor.
// Both are read from the script binding ("paramsBinding" and "monitor").
// If either lookup throws, the script switches to a developer mode that
// loads a test import.xml located under the user's home directory.
// ---------------------------------------------------------------------------
String userDir = System.getProperty("user.home");
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
def MONITOR;
BaseParameters params;
try {
    params = paramsBinding;
    MONITOR = monitor;
} catch (Exception ignored) {
    // an exception here means the script is being debugged
    println "DEV MODE";
    debug = true
    File devParamFile = new File(userDir, "xml/TESTS/frantext/import.xml");
    params = new BaseParameters(devParamFile)
    params.load()

    boolean toolboxReady = org.txm.Toolbox.isInitialized();
    if (!toolboxReady) {
        // the Toolbox is not initialized: set the parameters this script relies on
        Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
    }
}

// nothing can be done without parameters
if (params == null) {
    println "no parameters. Aborting";
    return;
}
68

    
69
// ---------------------------------------------------------------------------
// Corpus settings, all read from the import parameters.
// ---------------------------------------------------------------------------
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// attributes carried by the corpus element
String lang = corpusElem.getAttribute("lang");
String model = lang // the model name passed to the annotation step is the corpus language
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// XSL declared for the corpus
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
81

    
82
// ---------------------------------------------------------------------------
// Working directories.
// binDir (<TXM home>/corpora/<basename>) and txmDir (binDir/txm/<corpusname>)
// are both deleted and recreated, so the import starts from empty directories.
// ---------------------------------------------------------------------------
File srcDir = new File(rootDir);

File binDir = new File(Toolbox.getTxmHomePath(), "corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
boolean binDirReady = binDir.exists();
if (!binDirReady) {
    // the import cannot continue without its output directory
    println "Could not create binDir "+binDir
    return;
}

File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
94

    
95
// ---------------------------------------------------------------------------
// BFM XPATH PARAMETERS
// Load the "import.properties" file of the source directory into
// metadataXPath, reading it as UTF-8. When the file is missing or unreadable
// a message is printed and the import continues with empty properties.
// ---------------------------------------------------------------------------
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "READ XPATH PARAMETERS")
File paramDeclarationFile = new File(srcDir, "import.properties");
Properties metadataXPath = new Properties();
if (paramDeclarationFile.exists() && paramDeclarationFile.canRead()) {
    // withReader closes the reader even when load() throws
    paramDeclarationFile.withReader("UTF-8") { reader ->
        metadataXPath.load(reader);
    }

    // warn about the entries the file is expected to declare
    if (!metadataXPath.containsKey("titre")) {
        println "Warning: parameters property file does not contain the 'titre' metadata"
    }
    if (!metadataXPath.containsKey("forme")) {
        println "Warning: parameters property file does not contain the 'forme' metadata in param values. The default value is 'prose'"
    }
} else {
    println "No '$paramDeclarationFile' file found"
}
112

    
113
// ---------------------------------------------------------------------------
// Apply XSL
// The stylesheet "xsl/txm-filter-teifrantext-teibfm.xsl" of the TXM home is
// always used (forced), whatever stylesheet the import parameters declare.
// On success the "xsl" sub-directory of binDir becomes the source directory
// of the following steps.
// ---------------------------------------------------------------------------
if (MONITOR != null) MONITOR.worked(5, "APPLYING XSL")
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
File xslFile = new File(Toolbox.getTxmHomePath(), "xsl/txm-filter-teifrantext-teibfm.xsl") // force xsl
File xslOutDir = new File(binDir, "xsl");

if (!ApplyXsl2.processImportSources(xslFile, srcDir, xslOutDir)) {
    // report the stylesheet that was actually applied, not the one declared in the parameters
    println "Error: failed to apply xsl $xslFile. Aborting import."
    return; // error during process
}
srcDir = xslOutDir;
123

    
124
// ---------------------------------------------------------------------------
// VALIDATION
// Every file of the source directory that passes ValidateXml.test() is copied
// into txmDir. Hidden files, "import.xml", metadata files and ".properties"
// files are skipped. The import stops when txmDir ends up with no file.
// ---------------------------------------------------------------------------
println "-- VALIDATION - checking XML source files well-formedness"
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
List<File> srcfiles = srcDir.listFiles();
for (File f : srcfiles) { // check XML format, and copy file into txmDir
    if (f.isHidden() || f.getName().equals("import.xml") || f.getName().matches("metadata\\.....?") || f.getName().endsWith(".properties")) {
        continue;
    }
    if (ValidateXml.test(f)) {
        FileCopy.copy(f, new File(txmDir, f.getName()));
    } else {
        println "Won't process file "+f;
    }
}
if (MONITOR != null) MONITOR.worked(5)

// File.listFiles() returns an empty array (not null) for an empty directory,
// so both cases must be tested to detect that nothing was copied
File[] txmFiles = txmDir.listFiles();
if (txmFiles == null || txmFiles.length == 0) {
    println "No txm file to process"
    return;
}
143

    
144
// ---------------------------------------------------------------------------
// IMPORTER
// Runs the BFM importer on the source files; the import stops when it fails.
// ---------------------------------------------------------------------------
if (MONITOR != null) {
    MONITOR.worked(1, "IMPORTER")
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
}
println "-- IMPORTER - Reading source files"
boolean imported = new importer().run(srcDir, binDir, txmDir, basename, metadataXPath);
if (!imported) {
    println "import process stopped";
    return;
}
152

    
153
// ---------------------------------------------------------------------------
// ANNOTATE
// When annotation is enabled for the corpus, the "TreeTagger" annotation
// engine processes txmDir. annotate_status is true only if that processing
// actually ran and reported success; it is later given to the compiler.
// ---------------------------------------------------------------------------
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools - $model model"
// starts at false: a disabled or failed annotation must not be reported as done
boolean annotate_status = false;
if (annotate) {
    def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
    if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
        annotate_status = true;
    }
}
163

    
164
// ---------------------------------------------------------------------------
// COMPILING
// Configures the BFM compiler (debug flag, language, annotation status) and
// runs it; the import stops when the compiler fails.
// ---------------------------------------------------------------------------
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler();
if (debug) {
    c.setDebug();
}
c.setLang(lang);
c.setAnnotationDone(annotate_status)
boolean compiled = c.run(binDir, txmDir, corpusname, metadataXPath);
if (!compiled) {
    println "import process stopped";
    return;
}
176

    
177
// ---------------------------------------------------------------------------
// EDITION
// The HTML directory of the corpus is always recreated empty. When the
// "default" edition is requested, one pager is built per file returned by the
// compiler, and the resulting texts, editions and pages are registered in the
// import parameters.
// ---------------------------------------------------------------------------
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

File htmlDir = new File(binDir, "HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

    println "-- EDITION"
    if (MONITOR != null) MONITOR.worked(25, "EDITION")

    File outdir = new File(binDir,"/HTML/$corpusname/default/");
    outdir.mkdirs();
    // declared locally so that it does not leak into the script binding
    def files = c.getOrderedTxmFiles();
    println("Building editions: "+files.size()+" files" );

    for (File txmFile : files) {
        print "."
        // text name = file name without its last extension
        String txtname = txmFile.getName();
        int idx = txtname.lastIndexOf(".");
        if (idx > 0) txtname = txtname.substring(0, idx);
        List<String> noSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> noSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        Element text = params.addText(corpusElem, txtname, txmFile);

        def ed = new pager(txmFile, outdir, txtname, noSpaceBefore, noSpaceAfter, wordsPerPage, corpusname, metadataXPath);
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // register each page under a 1-based page number, with the matching entry of ed.getIdx()
        int pageCount = ed.getPageFiles().size();
        for (int page = 0; page < pageCount; page++) {
            String wordid = ed.getIdx().get(page);
            params.addPage(edition, ""+(page + 1), wordid);
        }
    }
}
211
// ---------------------------------------------------------------------------
// FINALIZING
// Saves the updated import parameters as "import.xml" in binDir.
// ---------------------------------------------------------------------------
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// deliberately undeclared: the flag is stored in the script binding
// NOTE(review): presumably read by the caller once the script ends - confirm
readyToLoad = true;