Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / limsi / limsiLoader.groovy @ 1000

History | View | Annotate | Download (7.4 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
22 321 mdecorde
//
23 321 mdecorde
// This file is part of the TXM platform.
24 321 mdecorde
//
25 321 mdecorde
// The TXM platform is free software: you can redistribute it and/or modify
26 321 mdecorde
// it under the terms of the GNU General Public License as published by
27 321 mdecorde
// the Free Software Foundation, either version 3 of the License, or
28 321 mdecorde
// (at your option) any later version.
29 321 mdecorde
//
30 321 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
31 321 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32 321 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 321 mdecorde
// GNU General Public License for more details.
34 321 mdecorde
//
35 321 mdecorde
// You should have received a copy of the GNU General Public License
36 321 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37 321 mdecorde
//
38 321 mdecorde
//
39 321 mdecorde
//
40 321 mdecorde
// $LastChangedDate:$
41 321 mdecorde
// $LastChangedRevision:$
42 321 mdecorde
// $LastChangedBy:$
43 321 mdecorde
//
44 986 mdecorde
package org.txm.scripts.importer.limsi;
45 321 mdecorde
46 321 mdecorde
import org.eclipse.swt.widgets.*;
47 986 mdecorde
import org.txm.scripts.importer.transcriber.compiler.*;
48 986 mdecorde
import org.txm.scripts.importer.transcriber.pager.*;
49 986 mdecorde
import org.txm.scripts.importer.transcriber.*;
50 321 mdecorde
51 321 mdecorde
import java.io.File;
52 321 mdecorde
53 1000 mdecorde
import org.txm.importer.scripts.xmltxm.*;
54 321 mdecorde
import org.txm.stat.utils.ConsoleProgressBar
55 927 mdecorde
import org.txm.*;
56 927 mdecorde
import org.txm.core.engines.*;
57 321 mdecorde
import org.txm.objects.*;
58 321 mdecorde
import org.txm.utils.i18n.*;
59 321 mdecorde
import org.txm.utils.*;
60 986 mdecorde
import org.txm.scripts.importer.*;
61 479 mdecorde
import org.txm.utils.io.FileCopy;
62 321 mdecorde
import org.w3c.dom.Element
63 479 mdecorde
import org.txm.utils.xml.DomUtils;
64 499 mdecorde
import org.txm.rcp.commands.*
65 321 mdecorde
66 321 mdecorde
67 321 mdecorde
// PARAMETERS
// Script-level settings, followed by resolution of the import parameters
// ("params") and of the progress monitor ("MONITOR").

// If true, the transcription of the speakers (en1 and enq2) defined in the
// metadata file will be ignored.
boolean removeInterviewer = false
boolean includeComments = false
boolean ignoreTranscriberMetadata = false
int csvHeaderNumber = 1
int maxlines = 200

String userDir = System.getProperty("user.home")

def MONITOR
boolean debug = org.txm.utils.logger.Log.isPrintingErrors()
BaseParameters params

// "paramsBinding" and "monitor" are not declared in this script (presumably
// they are supplied by the caller through the script binding - confirm).
// Any exception raised while reading them switches the script to "DEV MODE".
try {
        params = paramsBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // an exception here means we are debugging the script standalone
        println "DEV MODE"
        debug = true

        // load the parameters from a fixed file under the user home directory
        params = new BaseParameters(new File(userDir, "xml/limsi/files/import.xml"))
        println "loading "+params.paramFile
        params.load()

        // give the Toolbox its settings when it has not been initialized yet
        if (!org.txm.Toolbox.isInitialized()) {
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"))
        }
}

// nothing can be imported without parameters
if (params == null) {
        println "no parameters. Aborting"
        return
}
95 321 mdecorde
96 321 mdecorde
// Read the corpus settings from the import parameters and prepare the
// output directories of the binary corpus.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
// NOTE(review): assumes params.corpora.get() returns null for an unknown
// corpus name - confirm. Without this guard the next getAttribute() call
// would fail with a bare NullPointerException.
if (corpusElem == null) {
        println "No corpus element found for "+corpusname+". Aborting"
        return;
}
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang // later used to build the "<model>.par" name given to AnnotateCQP
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// source directory of the transcriptions, and directory of the binary corpus
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getTxmHomePath(),"corpora/"+basename);
// existing content of binDir is kept: binDir.deleteDir() is intentionally not called
binDir.mkdirs();
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return;
}

// directory receiving the files written by the importer step
File txmDir = new File(binDir,"txm/$corpusname");
// existing content of txmDir is kept: txmDir.deleteDir() is intentionally not called
txmDir.mkdirs();
// same verification as for binDir: do not continue without an output directory
if (!txmDir.exists()) {
        println "Could not create txmDir "+txmDir
        return;
}
119 321 mdecorde
120 321 mdecorde
// Import pipeline: select the source files, run the importer, optionally
// annotate, compile the search engine indexes, then register every produced
// text in the import parameters. Any failure aborts the script so that the
// finalization code below never saves or loads a partially built corpus.
try {
        // select only xml files
        String ext = ".xml";
        ArrayList<File> limsiFiles = srcDir.listFiles(); // null when srcDir cannot be listed
        if (limsiFiles == null) {
                println ("No files in "+srcDir.getAbsolutePath())
                return false;
        }

        // remove the parameter file, non XML files, unreadable files and hidden files
        limsiFiles.removeAll { File f ->
                f.getName().equals("import.xml") || !f.getName().endsWith(ext) || !f.canRead() || f.isHidden()
        }

        if (limsiFiles.isEmpty()) {
                println ("No transcriptions in "+srcDir.getAbsolutePath())
                return false;
        }
        println "Number of transcriptions: "+limsiFiles.size();

        if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
        println "-- IMPORTER"
        def imp = new importer(limsiFiles, binDir, txmDir) // put result in the txm folder of binDir
        if (!imp.run()) {
                println "Failed to prepare files - Aborting";
                return;
        }
        if (MONITOR != null) MONITOR.worked(20)

        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
        if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
        println "-- ANNOTATE CQP- Running NLP tools"
        // the annotation is only attempted when requested by the corpus parameters
        boolean annotationSuccess = annotate && new AnnotateCQP().run(binDir, txmDir, model+".par");

        if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
        if (MONITOR != null) MONITOR.worked(25, "COMPILING")
        println "--COMPILING - Building Search Engine indexes"

        def comp = new compiler()
        if (debug) comp.setDebug();
        comp.setAnnotationSuccess(annotationSuccess)
        if (!comp.run(binDir, txmDir, corpusname)) {
                println "Failed to compile files";
                return;
        }

        // create HTML directory but don't build an edition
        File htmlDir = new File(binDir,"HTML/$corpusname");
        htmlDir.mkdirs();

        // the registered files come from "annotations" when the annotation succeeded, from txmDir otherwise
        File registeredDir = annotationSuccess ? new File(binDir, "annotations") : txmDir;
        def filelist = registeredDir.listFiles();
        if (filelist == null) { // listFiles() returns null when the directory is missing or unreadable
                println "No files to register in "+registeredDir.getAbsolutePath()+" - Aborting";
                return;
        }
        filelist.sort();

        println "Registering texts: "+filelist.size()
        ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
        for (File txmFile : filelist) {
                cpb.tick()

                // text name = file name without its last extension
                String txtname = txmFile.getName();
                int dot = txtname.lastIndexOf(".");
                if (dot > 0) txtname = txtname.substring(0, dot);

                params.addText(corpusElem, txtname, txmFile);
        }
}
catch (Exception e) {
        org.txm.utils.logger.Log.printStackTrace(e);
        // abort like the other failure paths instead of finalizing a broken import
        println "Import failed - Aborting";
        return;
}
201 321 mdecorde
202 321 mdecorde
// Finalization: save the import parameters next to the binary corpus, load
// the corpus, restart the engines and refresh the user interface.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// "readyToLoad" is not declared in this script: it is assigned as an undeclared
// script variable (presumably read back by the caller - confirm)
readyToLoad = true;


// load corpus
println "Loading corpus..."
try {
        LoadBinaryCorpus.loadBase(binDir)
} catch (Exception e777) {
        // LoadBinaryCorpus does not exist if TXM version is < 0.7.7: fall back to AddBases
        try {
                AddBases.loadBase(binDir, MONITOR)
        } catch (Exception e) {
                // both loaders failed: report it instead of silently ignoring the failure
                org.txm.utils.logger.Log.printStackTrace(e);
        }
}

Toolbox.restartWorkspace();
Toolbox.restartSearchEngine();

// the views are reloaded through Display.syncExec()
Display.getDefault().syncExec(new Runnable() {
        @Override
        public void run() {
                println "Reloading corpora view..."
                RestartTXM.reloadViews();
                println "import done."
        }
});