Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / discours / discoursLoader.groovy @ 479

History | View | Annotate | Download (5.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2014-11-20 11:30:43 +0100 (jeu. 20 nov. 2014) $
25
// $LastChangedRevision: 2904 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.discours;
29

    
30
import org.txm.importer.discours.compiler;
31
import org.txm.importer.xml.pager;
32
import org.txm.objects.*;
33
import org.txm.utils.*;
34
import org.txm.*;
35
import org.txm.utils.i18n.*;
36
import org.w3c.dom.Element
37
import org.txm.utils.xml.DomUtils;
38

    
39
// Home directory of the current user; base path for the dev-mode defaults below.
String userDir = System.getProperty("user.home");

// Progress monitor; stays null when the script is not given one.
def MONITOR = null;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
	// Neither name is declared in this script, so both are looked up at runtime;
	// a failing lookup raises an exception and switches to dev mode.
	params = paramsBinding;
	MONITOR = monitor;
} catch (Exception ignored) {
	// An exception here means the script is being debugged outside its usual launcher.
	println "DEV MODE";
	debug = true;
	params = new BaseParameters(new File(userDir, "xml/discours/import.xml"));
	params.load();
	if (!org.txm.Toolbox.isInitialized()) {
		// Minimal Toolbox configuration so the import can run stand-alone.
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
	}
}
if (params == null) {
	println "no parameters. Aborting";
	return;
}
61

    
62
// Read the import settings of the target corpus from the parameters object.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
if (corpusElem == null) {
	// Every attribute below is read from this element; abort cleanly instead of
	// failing with a NullPointerException on the first getAttribute() call.
	println "no corpus element found for '"+corpusname+"'. Aborting";
	return;
}
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang // not referenced again in this script
String encoding = corpusElem.getAttribute("encoding");
// true only when the "annotate" attribute is exactly "true"; not referenced again in this script
boolean annotate = "true" == corpusElem.getAttribute("annotate");
// XSL settings are read here but not referenced again in this script
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);
// Pagination settings of the "default" edition
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
74

    
75
// Source directory given by the import parameters.
File srcDir = new File(rootDir);

// Binary corpus directory: wiped and recreated on every run.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}

// Directory of the intermediate files handed to the importer and the compiler.
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
if (!txmDir.exists()) {
	// Same guard as for binDir: later steps list and write files in this directory.
	println "Could not create txmDir "+txmDir
	return;
}
87

    
88
// Step 1: run the importer on the source directory.
MONITOR?.worked(1, "IMPORTER")
if (MONITOR?.isCanceled()) {
	return MONITOR.done();
}
println "-- IMPORTER - Reading source files"
def sourceImporter = new importer();
boolean imported = sourceImporter.run(srcDir, binDir, txmDir, encoding, basename);
if (!imported) {
	println "import process stopped";
	return;
}
96

    
97
// Step 2: no NLP annotation is performed by this loader; only the notice is printed.
println "-- ANNOTATE - Running NLP tools"
println "No annotation to do"

// Step 3: run the compiler on the files found in txmDir.
if (MONITOR?.isCanceled()) {
	return MONITOR.done();
}
MONITOR?.worked(25, "COMPILING")
println("-- COMPILING - Building Search Engine indexes")
List<File> txmFiles = txmDir.listFiles();
def corpusCompiler = new compiler()
if (debug) {
	corpusCompiler.setDebug();
}
// no annotation
corpusCompiler.setLang(lang);
boolean compiled = corpusCompiler.run(txmFiles, binDir, txmDir, corpusname);
if (!compiled) {
	println "import process stopped";
	return;
}
112

    
113
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// Always start from an empty HTML directory for this corpus, even when no edition is built.
File htmlDir = new File(binDir,"HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

	println "-- EDITION - Building edition"
	if (MONITOR != null) MONITOR.worked(20, "EDITION")

	File outdir = new File(binDir,"/HTML/$corpusname/default/");
	outdir.mkdirs();
	// Process the files of txmDir in sorted order.
	List<File> filelist = txmDir.listFiles();
	Collections.sort(filelist);

	// These lists depend only on the corpus language, so fetch them once for all texts.
	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

	println "Paginating texts: "

	for (File srcfile : filelist) {
		print "."
		// Text name = file name without its last extension (a leading dot is kept).
		String txtname = srcfile.getName();
		int extPos = txtname.lastIndexOf(".");
		if (extPos > 0) txtname = txtname.substring(0, extPos);

		Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

		def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, "pb");
		Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

		// Register one page per generated page file; page names are 1-based,
		// while the pager's index list is 0-based.
		int pageCount = ed.getPageFiles().size();
		def pageIdx = ed.getIdx();
		for (int page = 0 ; page < pageCount ; page++) {
			String wordid = pageIdx.get(page);
			params.addPage(edition, ""+(page + 1), wordid);
		}
	}
}
150
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
151
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
152
File paramFile = new File(binDir, "import.xml");
153
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true;