root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / limsi / limsiLoader.groovy @ 1000
History | View | Annotate | Download (7.4 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | |
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | // This file is part of the TXM platform.
|
24 | 321 | mdecorde | //
|
25 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it and/or modify
|
26 | 321 | mdecorde | // it under the terms of the GNU General Public License as published by
|
27 | 321 | mdecorde | // the Free Software Foundation, either version 3 of the License, or
|
28 | 321 | mdecorde | // (at your option) any later version.
|
29 | 321 | mdecorde | //
|
30 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be useful,
|
31 | 321 | mdecorde | // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 | 321 | mdecorde | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 | 321 | mdecorde | // GNU General Public License for more details.
|
34 | 321 | mdecorde | //
|
35 | 321 | mdecorde | // You should have received a copy of the GNU General Public License
|
36 | 321 | mdecorde | // along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 | 321 | mdecorde | //
|
38 | 321 | mdecorde | //
|
39 | 321 | mdecorde | //
|
40 | 321 | mdecorde | // $LastChangedDate:$
|
41 | 321 | mdecorde | // $LastChangedRevision:$
|
42 | 321 | mdecorde | // $LastChangedBy:$
|
43 | 321 | mdecorde | //
|
44 | 986 | mdecorde | package org.txm.scripts.importer.limsi;
|
45 | 321 | mdecorde | |
46 | 321 | mdecorde | import org.eclipse.swt.widgets.*; |
47 | 986 | mdecorde | import org.txm.scripts.importer.transcriber.compiler.*; |
48 | 986 | mdecorde | import org.txm.scripts.importer.transcriber.pager.*; |
49 | 986 | mdecorde | import org.txm.scripts.importer.transcriber.*; |
50 | 321 | mdecorde | |
51 | 321 | mdecorde | import java.io.File; |
52 | 321 | mdecorde | |
53 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.*; |
54 | 321 | mdecorde | import org.txm.stat.utils.ConsoleProgressBar |
55 | 927 | mdecorde | import org.txm.*; |
56 | 927 | mdecorde | import org.txm.core.engines.*; |
57 | 321 | mdecorde | import org.txm.objects.*; |
58 | 321 | mdecorde | import org.txm.utils.i18n.*; |
59 | 321 | mdecorde | import org.txm.utils.*; |
60 | 986 | mdecorde | import org.txm.scripts.importer.*; |
61 | 479 | mdecorde | import org.txm.utils.io.FileCopy; |
62 | 321 | mdecorde | import org.w3c.dom.Element |
63 | 479 | mdecorde | import org.txm.utils.xml.DomUtils; |
64 | 499 | mdecorde | import org.txm.rcp.commands.* |
65 | 321 | mdecorde | |
66 | 321 | mdecorde | |
// PARAMETERS
// Import options declared for this loader. None of them is read in the part
// of the script visible here.
boolean removeInterviewer = false; // if true the transcription of speakers (en1 and enq2) defined in metadatas file will be ignored
boolean includeComments = false;
boolean ignoreTranscriberMetadata = false;
int csvHeaderNumber = 1;
int maxlines = 200;

String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
	// "paramsBinding" and "monitor" are not declared in this script; they are
	// expected to be provided through the script binding by the caller.
	params = paramsBinding;
	MONITOR = monitor
}
catch (Exception bindingMissing) {
	// An exception here means the script is run without its binding: we debug.
	println "DEV MODE";
	debug = true

	// Fall back to a parameter file located under the user's home directory.
	params = new BaseParameters(new File(userDir, "xml/limsi/files/import.xml"))
	println "loading "+params.paramFile
	params.load()

	// Give the Toolbox a minimal configuration when nothing initialized it yet.
	if (!org.txm.Toolbox.isInitialized()) {
		Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
	}
}

// Nothing can be imported without parameters.
if (params == null) {
	println "no parameters. Aborting";
	return;
}
95 | 321 | mdecorde | |
// Corpus description taken from the import parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// Per-corpus attributes stored on the corpus element.
String lang = corpusElem.getAttribute("lang");
String model = lang // the model name is the language code
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Source directory, and binary corpus directory under the TXM home.
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getTxmHomePath(),"corpora/"+basename);
// NOTE: an existing binDir is kept as it is (binDir.deleteDir() is disabled).
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}

// Directory receiving the XML-TXM files of this corpus.
// NOTE: an existing txmDir is kept as it is (txmDir.deleteDir() is disabled).
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.mkdirs();
119 | 321 | mdecorde | |
// Import pipeline: select the source files, run the importer, optionally
// annotate, compile the indexes, then register every produced text in the
// import parameters. Any failure aborts the script before the corpus is
// saved and loaded.
try {
	// select only xml files
	String ext = ".xml";
	File[] srcFiles = srcDir.listFiles(); // null when srcDir cannot be listed
	if (srcFiles == null) {
		println ("No files in "+srcDir.getAbsolutePath())
		return false;
	}

	// Keep readable, non-hidden XML files, and never the "import.xml" parameter file.
	ArrayList<File> limsiFiles = new ArrayList<File>();
	for (File f : srcFiles) {
		boolean isParamFile = f.getName().equals("import.xml");
		if (!isParamFile && f.getName().endsWith(ext) && f.canRead() && !f.isHidden()) {
			limsiFiles.add(f);
		}
	}

	if (limsiFiles.isEmpty()) {
		println ("No transcriptions in "+srcDir.getAbsolutePath())
		return false;
	}
	println "Number of transcriptions: "+limsiFiles.size();

	if (MONITOR != null) MONITOR.worked(1, "IMPORTER")
	if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
	println "-- IMPORTER"
	def imp = new importer(limsiFiles, binDir, txmDir) //put result in the txm folder of binDir
	if (!imp.run()) {
		println "Failed to prepare files - Aborting";
		return;
	}
	if (MONITOR != null) MONITOR.worked(20)

	if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
	if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")
	println "-- ANNOTATE CQP- Running NLP tools"
	// The annotation only runs when requested; its result decides below which
	// directory holds the texts to register.
	boolean annotationSuccess = annotate && new AnnotateCQP().run(binDir, txmDir, model+".par");

	if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
	if (MONITOR != null) MONITOR.worked(25, "COMPILING")
	println "--COMPILING - Building Search Engine indexes"

	def comp = new compiler()
	if (debug) comp.setDebug();
	comp.setAnnotationSuccess(annotationSuccess)
	if (!comp.run(binDir, txmDir, corpusname)) {
		println "Failed to compile files";
		return;
	}

	// create HTML directory but don't build an edition
	File htmlDir = new File(binDir,"HTML/$corpusname");
	htmlDir.mkdirs();

	// Texts are registered from the "annotations" directory when the
	// annotation succeeded, from the txm directory otherwise.
	File textsDir = annotationSuccess ? new File(binDir, "annotations") : txmDir;
	File[] filelist = textsDir.listFiles();
	if (filelist == null) {
		// listFiles() returns null for a missing or unreadable directory
		println "Could not list the files of "+textsDir.getAbsolutePath()
		return false;
	}
	Arrays.sort(filelist); // register the texts in file order

	println "Registering texts: "+filelist.size()
	ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
	for (File txmFile : filelist) {
		cpb.tick()

		// The text name is the file name without its last extension.
		String txtname = txmFile.getName();
		int i = txtname.lastIndexOf(".");
		if (i > 0) txtname = txtname.substring(0, i);

		params.addText(corpusElem, txtname, txmFile);
	}
}
catch (Exception e) {
	org.txm.utils.logger.Log.printStackTrace(e);
	// Stop here, like the other failure paths: an incomplete import must not
	// be saved and loaded.
	println "Import failed - Aborting";
	return false;
}
201 | 321 | mdecorde | |
// Finalization: save the import parameters next to the binary corpus, load
// the corpus in TXM, restart the engines and refresh the corpora view.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// "readyToLoad" is not declared in this script, so the assignment goes to the
// script binding. NOTE(review): presumably read back by the caller - confirm.
readyToLoad = true;

// load corpus
println "Loading corpus..."
try {
	LoadBinaryCorpus.loadBase(binDir)
} catch (Exception e777) {
	// LoadBinaryCorpus does not exist if TXM version is < 0.7.7
	try {
		AddBases.loadBase(binDir, MONITOR)
	} catch (Exception e) {
		// Both loaders failed: report the two causes instead of hiding them.
		// The script still continues, as it did before.
		println "Failed to load corpus from "+binDir
		org.txm.utils.logger.Log.printStackTrace(e777);
		org.txm.utils.logger.Log.printStackTrace(e);
	}
}

Toolbox.restartWorkspace();
Toolbox.restartSearchEngine();

// The views are reloaded synchronously through the SWT Display.
Display.getDefault().syncExec(new Runnable() {
	@Override
	public void run() {
		println "Reloading corpora view..."
		RestartTXM.reloadViews();
		println "import done."
	}
});