Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / tmx / tmxLoader.groovy @ 1000

History | View | Annotate | Download (6.9 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2015-08-31 13:45:22 +0200 (lun. 31 août 2015) $
// $LastChangedRevision: 3028 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.importer.tmx;
29
import org.txm.searchengine.cqp.corpus.MainCorpus;
30

    
31
import java.io.File;
32
import org.txm.scripts.importer.tmx.importer;
33
import org.txm.scripts.importer.tmx.compiler;
34
import org.txm.scripts.importer.tmx.pager;
35
import org.txm.objects.*;
36
import org.txm.utils.*
37
import org.txm.utils.io.*;
38
import org.txm.*;
39
import org.txm.core.engines.*;
40
import org.txm.importer.scripts.xmltxm.*;
41
import org.txm.utils.i18n.*;
42
import org.w3c.dom.Element
43
import org.txm.utils.xml.DomUtils;
44

    
// ---- Parameter bootstrap ---------------------------------------------------
// Resolves the import parameters and the progress monitor. `paramsBinding` and
// `monitor` are not declared in this script; when either lookup throws, the
// script falls back to a standalone "DEV MODE" configuration.
String userDir = System.getProperty("user.home")
boolean debug = org.txm.utils.logger.Log.isPrintingErrors()
def MONITOR
BaseParameters params

try {
    params = paramsBinding
    MONITOR = monitor
} catch (Exception ignored) {
    // An exception here means we are debugging outside of the regular launcher.
    println "DEV MODE"
    debug = true

    // Load the parameters from the developer's home directory.
    File devParamFile = new File(userDir, "xml/tmx/import.xml")
    params = new BaseParameters(devParamFile)
    params.load()

    // Give the Toolbox a minimal configuration if nothing initialized it yet.
    if (!org.txm.Toolbox.isInitialized()) {
        File installDir = new File("/usr/lib/TXM")
        File treeTaggerModels = new File(userDir, "treetagger/models")
        File txmHome = new File(userDir, "TXM")

        Toolbox.setParam(Toolbox.INSTALL_DIR, installDir)
        Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, treeTaggerModels)
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
        Toolbox.setParam(Toolbox.USER_TXM_HOME, txmHome)
    }
}

// Nothing can be imported without parameters.
if (params == null) {
    println "no parameters. Aborting"
    return
}
65

    
// ---- Import settings -------------------------------------------------------
// Values read from the parameters object and from the corpus element that is
// registered under the corpus name.
String corpusname = params.getCorpusName()
Element corpusElem = params.corpora.get(corpusname)
String basename = params.name
String rootDir = params.rootDir

// Attributes carried by the corpus element itself.
String lang = corpusElem.getAttribute("lang")
String model = lang // the corpus language is passed to the annotation engine as its "lang" option
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// XSLT settings attached to the corpus element.
Element xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// Settings of the "default" edition.
int wordsPerPage = params.getWordsPerPage("default")
String page_element = params.getPageElement("default")
boolean build_edition = params.getDoEdition("default")
79

    
// ---- Working directories ---------------------------------------------------
// Source directory, plus a freshly recreated binary directory
// (<TXM home>/corpora/<basename>) and its "txm" sub-directory.
File srcDir = new File(rootDir)

File binDir = new File(Toolbox.getTxmHomePath(), "corpora/" + basename)
binDir.deleteDir()
binDir.mkdirs()
if (!binDir.exists()) {
    // mkdirs() failed: nothing can be written, give up.
    println "Could not create binDir " + binDir
    return
}

File txmDir = new File(binDir, "txm")
txmDir.deleteDir()
txmDir.mkdirs()

// Both maps are handed to the importer and reused by the later steps.
HashMap<String, String> textLangs = new LinkedHashMap<String, String>()
def langGroups = new LinkedHashMap()
94

    
// ---- Step 1: importer ------------------------------------------------------
// Runs the TMX importer over the source directory and keeps the corpus ids it
// reports for the compiler step.
println "-- IMPORTER - Reading source files"
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(20, "IMPORTER - Reading source files")
}

def tmxImporter = new importer()
tmxImporter.run(srcDir, binDir, txmDir, textLangs, langGroups)
def corpusIDS = tmxImporter.getCorpusIDS()
// ---- Step 2: annotation ----------------------------------------------------
// The banner and the progress tick are emitted whether or not annotation is
// enabled; the engine itself only runs when `annotate` is set.
println "-- ANNOTATE - Running NLP tools"
MONITOR?.worked(20, "ANNOTATE - Running NLP tools")

if (annotate) {
    def treeTagger = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
    def taggerOptions = ["lang": model]
    if (treeTagger.processDirectory(txmDir, binDir, taggerOptions)) {
        // Assigned without a declaration, so it lands in the script binding
        // (presumably read back by the caller - verify).
        annotationSuccess = true
    }
}

println "langs : " + textLangs
println "texts : " + langGroups
111

    
// ---- Step 3: compiler ------------------------------------------------------
// Configures the TMX compiler with the data gathered by the importer and runs
// it; the script stops when the compiler reports a failure.
println "-- COMPILING - Building Search Engine indexes"
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(20, "COMPILING - Building Search Engine indexes")
}

def tmxCompiler = new compiler()
if (debug) {
    tmxCompiler.setDebug()
}
tmxCompiler.setAlignment("seg", "id")
tmxCompiler.setLangs(textLangs)
tmxCompiler.setCorpusIDS(corpusIDS)
tmxCompiler.setLangGroups(langGroups)

boolean compiled = tmxCompiler.run(binDir, txmDir, basename)
if (!compiled) {
    println "Compiler failed"
    return
}
125

    
// ---- Step 4: edition -------------------------------------------------------
// Recreates <binDir>/HTML and, when editions are enabled, pages every file
// found in each sub-directory of txmDir into
// <binDir>/HTML/<sub-directory name>/default/, registering the texts, editions
// and pages in the import parameters under a clone of the corpus element.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML directory is reset even when no edition is built.
File htmlDir = new File(binDir, "HTML")
htmlDir.deleteDir()
htmlDir.mkdirs()

if (build_edition) {
    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition")

    // One list per text name, shared by all corpus directories and handed to
    // every pager created for that text name.
    def splitTUsPerText = [:]

    for (File txmCorpusDir : txmDir.listFiles()) {
        def txmFiles = txmCorpusDir.listFiles()
        // File.listFiles() returns null for a non-directory entry or on an I/O
        // error; skip such entries instead of failing on null.sort().
        if (txmFiles == null) continue
        txmFiles.sort() // same order

        File outdir = new File(binDir, "/HTML/" + txmCorpusDir.getName() + "/default/")
        outdir.mkdirs()

        // Each corpus directory gets its own copy of the corpus element, named
        // after the directory.
        Element paraCorpusElem = corpusElem.cloneNode(true)
        paraCorpusElem.setAttribute("name", txmCorpusDir.getName())

        for (File srcfile : txmFiles) {
            print "."

            // Text name = file name up to its last underscore; the name is kept
            // whole when there is no underscore or when it is the first character.
            String fileName = srcfile.getName()
            int underscore = fileName.lastIndexOf("_")
            String txtname = (underscore > 0) ? fileName.substring(0, underscore) : fileName

            def splitTUs = splitTUsPerText[txtname]
            if (splitTUs == null) {
                splitTUs = []
                splitTUsPerText[txtname] = splitTUs
            }

            // The language is looked up by full file name; "fr" is the fallback.
            String textLang = textLangs.get(fileName)
            if (textLang == null) textLang = "fr"
            List<String> noSpaceBefore = LangFormater.getNoSpaceBefore(textLang)
            List<String> noSpaceAfter = LangFormater.getNoSpaceAfter(textLang)

            Element text = params.addText(paraCorpusElem, txtname, srcfile)

            def ed = new pager(srcfile, outdir, txtname, noSpaceBefore, noSpaceAfter, wordsPerPage, basename, page_element, splitTUs)
            Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html")

            // Register one page per produced page file; pages are numbered from 1
            // and paired with the entry at the same position of the pager's index.
            for (int pageIndex = 0; pageIndex < ed.getPageFiles().size(); pageIndex++) {
                String wordid = ed.getIdx().get(pageIndex)
                params.addPage(edition, "" + (pageIndex + 1), wordid)
            }
        }

        params.corporaElement.appendChild(paraCorpusElem)

        // Copy the optional stylesheets next to the generated pages.
        File cssfile = new File(Toolbox.getTxmHomePath(), "css/tmx.css")
        if (cssfile.exists()) {
            FileCopy.copy(cssfile, new File(outdir, "tmx.css"))
        }
        File stylefile = new File(binDir, "style.css")
        if (stylefile.exists()) {
            FileCopy.copy(stylefile, new File(outdir, "style.css"))
        }
        println ""
    }
}
// ---- Step 5: finalization --------------------------------------------------
// Drops the original corpus element, then writes the parameters document to
// <binDir>/import.xml.
// NOTE(review): the element is removed even when no edition was built, in which
// case no per-directory clone was appended beforehand - confirm this is intended.
params.corporaElement.removeChild(corpusElem)

if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done() }
    MONITOR.worked(20, "FINALIZING")
}

File paramFile = new File(binDir, "import.xml")
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile)

// Assigned without a declaration, so it lands in the script binding
// (presumably the signal the caller waits for - verify).
readyToLoad = true