Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / hyperbase / hyperbaseLoader.groovy @ 1000

History | View | Annotate | Download (5.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2015-06-03 15:04:53 +0200 (mer. 03 juin 2015) $
25
// $LastChangedRevision: 2984 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.hyperbase;
29

    
30

    
31
import org.txm.scripts.importer.hyperbase.importer;
32
import org.txm.scripts.importer.hyperbase.compiler;
33
import org.txm.scripts.importer.xml.pager;
34
import org.txm.objects.*;
35
import org.txm.utils.*;
36
import org.txm.*;
37
import org.txm.core.engines.*;
38
import org.txm.importer.scripts.xmltxm.*;
39
import org.txm.utils.i18n.*;
40
import org.w3c.dom.Element
41
import org.txm.utils.xml.DomUtils;
42

    
43
// Resolve the import parameters and the progress monitor.
// The script first tries to read `paramsBinding` and `monitor`; if reading either
// one throws, it falls back to a standalone "DEV MODE" configuration.
String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
	params = paramsBinding;
	MONITOR = monitor;
} catch (Exception ignored) {
	// An exception here means the script is being debugged on its own.
	println "DEV MODE";
	debug = true;
	params = new BaseParameters(new File(userDir, "xml/TESTS/hyperbase/import.xml"));
	params.load();
	if (!org.txm.Toolbox.isInitialized()) {
		// Minimal Toolbox settings used only for the standalone run.
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
	}
}
if (params == null) {
	println "no parameters. Aborting";
	return;
}
63

    
64
// Read the corpus settings from the import parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
if (corpusElem == null) {
	// Every attribute below is read from this element, so stop with a message
	// instead of failing on a null dereference.
	println "no corpus element for '" + corpusname + "'. Aborting";
	return;
}
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
// The annotation model is selected by the corpus language.
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
// `xsl` and `xslParams` are not used further down in this script; the safe
// navigation keeps a missing XSLT element from aborting the import.
String xsl = params.getXsltElement(corpusElem)?.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
76

    
77
// Prepare the working directories: the source directory, a freshly recreated
// binary corpus directory, and the "txm/<corpusname>" directory inside it.
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/" + basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}

File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// Same check as for binDir: the following steps write into this directory.
if (!txmDir.exists()) {
	println "Could not create txmDir "+txmDir
	return;
}
89

    
90
// IMPORTER step: report progress, honour cancellation, then run the importer
// on the source directory. A falsy result stops the whole import.
MONITOR?.worked(1, "IMPORTER")
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
println "-- IMPORTER - Reading source files"

def imported = new importer().run(srcDir, binDir, txmDir, encoding, basename, lang)
if (!imported) {
	println "import process stopped";
	return;
}
98

    
99
// ANNOTATE step: the progress message is printed in every case, but the
// "TreeTagger" engine is only run when the corpus asks for annotation.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "ANNOTATE")
println "-- ANNOTATE - Running NLP tools"
boolean annotationSuccess = false;
if (annotate) {
	def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
	// Success is whatever the engine reports for the txm directory.
	annotationSuccess = engine.processDirectory(txmDir, binDir, [lang: model]) ? true : false;
}
109

    
110
// COMPILING step: configure the compiler with the annotation outcome, the debug
// flag and the language, then run it. A falsy result stops the whole import.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
def corpusCompiler = new compiler();
corpusCompiler.setAnnotationSuccess(annotationSuccess);
if (debug) {
	corpusCompiler.setDebug();
}
corpusCompiler.setLang(lang);
def compiled = corpusCompiler.run(binDir, txmDir, corpusname)
if (!compiled) {
	println "import process stopped";
	return;
}
121

    
122
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML directory of the corpus is reset whether or not an edition is built.
File htmlDir = new File(binDir, "HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

	println "-- EDITION - Building edition"
	if (MONITOR != null) MONITOR.worked(20, "EDITION")

	File outdir = new File(binDir, "/HTML/$corpusname/default/");
	outdir.mkdirs();
	// File.listFiles() returns null when the directory cannot be listed;
	// fall back to an empty list so sorting and iterating stay safe.
	File[] found = txmDir.listFiles();
	List<File> filelist = (found == null) ? new ArrayList<File>() : Arrays.asList(found);
	Collections.sort(filelist);

	println "Paginating texts: "

	for (File srcfile : filelist) {
		print "."
		// The text name is the file name without its last extension.
		String txtname = srcfile.getName();
		int dot = txtname.lastIndexOf(".");
		if (dot > 0) txtname = txtname.substring(0, dot);

		List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
		List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

		Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

		def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, "pb");
		Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

		// Register one page per page file, numbered from 1, together with the
		// entry found at the same index in the pager's idx list.
		for (int page = 0; page < ed.getPageFiles().size(); page++) {
			String wordid = ed.getIdx().get(page);
			params.addPage(edition, "" + (page + 1), wordid);
		}
	}
}
160
// FINALIZING step: save the parameters document as "import.xml" in the binary
// directory, then set the `readyToLoad` flag.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
def paramsDocument = params.root.getOwnerDocument();
DomUtils.save(paramsDocument, paramFile);
// `readyToLoad` is not declared in this script, so the assignment goes to the
// script binding; presumably the caller reads it (to be confirmed).
readyToLoad = true;