Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / txt / txtLoader.groovy @ 966

History | View | Annotate | Download (7.4 kB)

1 479 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 479 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 479 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 479 mdecorde
// Sophia Antipolis, University of Paris 3.
5 479 mdecorde
//
6 479 mdecorde
// The TXM platform is free software: you can redistribute it
7 479 mdecorde
// and/or modify it under the terms of the GNU General Public
8 479 mdecorde
// License as published by the Free Software Foundation,
9 479 mdecorde
// either version 2 of the License, or (at your option) any
10 479 mdecorde
// later version.
11 479 mdecorde
//
12 479 mdecorde
// The TXM platform is distributed in the hope that it will be
13 479 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 479 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 479 mdecorde
// PURPOSE. See the GNU General Public License for more
16 479 mdecorde
// details.
17 479 mdecorde
//
18 479 mdecorde
// You should have received a copy of the GNU General
19 479 mdecorde
// Public License along with the TXM platform. If not, see
20 479 mdecorde
// http://www.gnu.org/licenses.
21 479 mdecorde
//
22 479 mdecorde
//
23 479 mdecorde
//
24 479 mdecorde
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25 479 mdecorde
// $LastChangedRevision: 3426 $
26 479 mdecorde
// $LastChangedBy: mdecorde $
27 479 mdecorde
//
28 479 mdecorde
package org.txm.importer.txt;
29 479 mdecorde
30 479 mdecorde
import org.txm.importer.txt.importer;
31 479 mdecorde
import org.txm.importer.txt.compiler;
32 479 mdecorde
import org.txm.importer.xml.pager;
33 479 mdecorde
import org.txm.objects.*;
34 927 mdecorde
import org.txm.importer.xmltxm.*;
35 479 mdecorde
import org.txm.*;
36 927 mdecorde
import org.txm.core.engines.*;
37 479 mdecorde
import org.txm.utils.i18n.*;
38 479 mdecorde
import org.txm.metadatas.*;
39 479 mdecorde
import org.txm.utils.io.FileCopy;
40 479 mdecorde
import org.w3c.dom.Element
41 479 mdecorde
import org.txm.utils.xml.DomUtils;
42 479 mdecorde
43 479 mdecorde
// ---- Bootstrap: resolve the import parameters and the progress monitor ----
String userDir = System.getProperty("user.home");
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
def MONITOR;
BaseParameters params;
try {
	// paramsBinding and monitor are not declared in this script; presumably they are
	// supplied by the caller through the script binding. Any exception raised while
	// reading them switches the script to the developer fallback below.
	params = paramsBinding;
	MONITOR = monitor;
} catch (Exception e) {
	// exception means we debug
	println "DEV MODE";
	debug = true
	File devParamsFile = new File(userDir, "TXM/clipboard/clipboard2/import.xml")
	params = new BaseParameters(devParamsFile)
	params.load()
	if (!org.txm.Toolbox.isInitialized()) {
		// NOTE(review): developer-machine defaults (Windows paths, one user's home);
		// only applied when the Toolbox has not been initialized yet.
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("C:/Program Files/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File("C:/Users/mdecorde/TXM/treetagger"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
	}
}
// Nothing can be imported without a parameters object.
if (params == null) {
	println "No parameters file. Aborting";
	return;
}
62 479 mdecorde
63 479 mdecorde
// ---- Import settings read from the parameters object ----
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// Attributes stored on the corpus element.
String lang = corpusElem.getAttribute("lang");
String model = lang // passed as the "lang" option of the annotation engine
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// XSLT settings attached to the corpus element.
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Edition settings registered under the "default" name.
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
75 479 mdecorde
76 479 mdecorde
// ---- Working directories ----
File srcDir = new File(rootDir);

// The binary corpus directory is wiped and recreated on every run.
File binDir = new File(Toolbox.getTxmHomePath(), "corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir ${binDir}"
	return;
}

// Fresh directory receiving the XML-TXM files of this corpus.
File txmDir = new File(binDir, "txm/"+corpusname);
txmDir.deleteDir();
txmDir.mkdirs();
88 479 mdecorde
89 479 mdecorde
// Load the text metadata table, if the source directory provides one.
Metadatas metadatas; // text metadata; stays unset when no metadata file exists
File allmetadatasfile = Metadatas.findMetadataFile(srcDir);

if (!allmetadatasfile.exists()) {
	println "No metadata file: "+allmetadatasfile
} else {
	println "Trying to read metadata from: "+allmetadatasfile
	// The table is read from a copy placed in the binary corpus directory.
	File metadataCopy = new File(binDir, allmetadatasfile.getName())
	boolean copied = FileCopy.copy(allmetadatasfile, metadataCopy)
	if (!copied) {
		println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
		return;
	}
	// Encoding and separators come from the Toolbox metadata settings.
	metadatas = new Metadatas(
		metadataCopy,
		Toolbox.getMetadataEncoding(),
		Toolbox.getMetadataColumnSeparator(),
		Toolbox.getMetadataTextSeparator(),
		1)
}
106 479 mdecorde
107 479 mdecorde
// ---- IMPORTER step: convert the source text files ----
def extensions = ["txt", "TXT"]
println "-- IMPORTER - Reading source files with extension "+extensions
if (MONITOR != null) {
	// Stop here when the user canceled the job, otherwise report progress.
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "IMPORTER - Reading source files with extension "+extensions)
}
boolean imported = new importer().run(srcDir, binDir, txmDir, encoding, extensions, basename, lang)
if (!imported) {
	println "Import process stopped";
	return;
}
115 479 mdecorde
116 479 mdecorde
// ---- METADATA step: inject the metadata values into each text file ----
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "INJECTING METADATA - from csv file: "+allmetadatasfile)
}
// metadatas is only set when a metadata file was found and loaded.
if (metadatas != null) {
	println "-- INJECTING METADATA - from csv file: "+allmetadatasfile
	println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+txmDir)
	for (File infile : txmDir.listFiles()) {
		print "."
		// The result is first written to a temporary file next to the input.
		File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());

		boolean injected = metadatas.injectMetadatasInXml(infile, outfile, "text", null)
		if (!injected) {
			// Injection reported failure: discard the temporary file, the input is not replaced.
			outfile.delete();
			continue
		}

		// Replace the input file by the temporary file.
		boolean swapped = infile.delete() && outfile.renameTo(infile)
		if (!swapped) println "Warning can't rename file "+outfile+" to "+infile
		if (!infile.exists()) {
			// The input is gone and was not replaced: abort the whole script.
			println "Error: could not replace $infile by $outfile"
			return false;
		}
	}
	println ""
}
137 479 mdecorde
138 479 mdecorde
// ---- ANNOTATE step: run the TreeTagger annotation engine on the XML-TXM files ----
println "-- ANNOTATE - Running NLP tools"
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE - Running NLP tools")
// The flag must start as false: it is only raised when annotation was requested AND
// the engine reports success. It was previously initialised to true, so a skipped or
// failed annotation was still handed to the compiler as a success.
boolean annotationSuccess = false;
if (annotate) {
	def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
	// The engine receives the corpus language as its "lang" option.
	if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
		annotationSuccess = true;
	}
}
148 479 mdecorde
149 479 mdecorde
// ---- COMPILING step: build the search engine indexes ----
println "-- COMPILING - Building Search Engine indexes"
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "COMPILING - Building Search Engine indexes")
}
def indexCompiler = new compiler(null, "", corpusname, "default");
//indexCompiler.setCwbPath(userDir+"/TXM/cwb/bin/")// for developers
if (metadatas != null) {
	// Structural attributes described by the metadata table, when one was loaded.
	indexCompiler.setMetadataAttributes(metadatas.getSattributes())
}
indexCompiler.setLang(lang);
indexCompiler.setAnnotationSuccess(annotationSuccess)
if (debug) {
	indexCompiler.setDebug();
}
boolean compiled = indexCompiler.run(binDir, txmDir, corpusname)
if (!compiled) {
	println "Import process stopped";
	return;
}
163 479 mdecorde
//println "basename :"+basename;
164 479 mdecorde
// ---- EDITION step: build the paginated "default" HTML edition of each text ----
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
// The HTML directory of the corpus is always reset, even when no edition is built.
File htmlDir = new File(binDir, "HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
if (build_edition) {

	println "-- EDITION - Building edition"
	if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition")

	File outdir = new File(binDir, "HTML/$corpusname/default/");
	outdir.mkdirs();
	// Process the text files in sorted order.
	List<File> filelist = txmDir.listFiles();
	Collections.sort(filelist);

	for (File srcfile : filelist) {
		print "."
		// Text name = file name without its last extension.
		String txtname = srcfile.getName();
		int dot = txtname.lastIndexOf(".");
		if (dot > 0) txtname = txtname.substring(0, dot);

		List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
		List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

		// Register the text in the import parameters.
		Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

		// Paginate the text into outdir, then declare the edition in the parameters.
		def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, null);
		Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

		// Declare each page under a 1-based page number, with the word id found
		// at the same position in the pager's index.
		for (int page = 0; page < ed.getPageFiles().size(); page++) {
			String wordid = ed.getIdx().get(page);
			params.addPage(edition, ""+(page + 1), wordid);
		}
	}
}
200 479 mdecorde
// ---- FINALIZING step: save the updated import parameters ----
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "FINALIZING")
}
// The parameters document is written as import.xml in the binary corpus directory.
File paramFile = new File(binDir, "import.xml");
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile);
// readyToLoad is not declared in this script, so the assignment goes to the script
// binding; presumably the caller reads it to know the corpus can be loaded.
readyToLoad = true;