Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / txt / txtLoader.groovy @ 479

History | View | Annotate | Download (7.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.txt;
29

    
30
import org.txm.importer.txt.importer;
31
import org.txm.importer.txt.compiler;
32
import org.txm.importer.xml.pager;
33
import org.txm.objects.*;
34
import org.txm.scripts.teitxm.*;
35
import org.txm.*;
36
import org.txm.utils.i18n.*;
37
import org.txm.metadatas.*;
38
import org.txm.utils.io.FileCopy;
39
import org.w3c.dom.Element
40
import org.txm.utils.xml.DomUtils;
41

    
42
// ---------------------------------------------------------------------------
// Script bootstrap: obtain the import parameters and the progress monitor.
// ---------------------------------------------------------------------------
String userDir = System.getProperty("user.home");
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
def MONITOR;
BaseParameters params;
try {
    // 'paramsBinding' and 'monitor' are not declared in this script; they are
    // presumably provided by the caller through the script binding. Reading
    // them throws when they are absent.
    params = paramsBinding;
    MONITOR = monitor;
} catch (Exception ignored) {
    // Any failure above is treated as "running standalone for debugging".
    println "DEV MODE";
    debug = true;

    // Developer defaults: load parameters from a fixed file under the user home.
    File devImportFile = new File(userDir, "TXM/clipboard/clipboard2/import.xml");
    params = new BaseParameters(devImportFile);
    params.load();

    // Configure the Toolbox by hand only if nothing initialized it before.
    boolean toolboxReady = org.txm.Toolbox.isInitialized();
    if (!toolboxReady) {
        Toolbox.setParam(Toolbox.INSTALL_DIR, new File("C:/Program Files/TXM"));
        Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File("C:/Program Files/TreeTagger"));
        Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File("C:/Users/mdecorde/TXM/treetagger"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
    }
}

// Nothing can be imported without parameters.
if (params == null) {
    println "No parameters file. Aborting";
    return;
}
62

    
63
// ---------------------------------------------------------------------------
// Import settings, read from the parameters and from the corpus DOM element.
// ---------------------------------------------------------------------------
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// Attributes stored on the corpus element.
String lang = corpusElem.getAttribute("lang");
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// The annotation model is named after the language; the annotation step
// further down appends ".par" to it.
String model = lang;

// XSL stylesheet settings attached to the corpus element.
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl");
def xslParams = params.getXsltParams(corpusElem);

// Edition settings of the "default" edition.
int wordsPerPage = params.getWordsPerPage("default");
boolean build_edition = params.getDoEdition("default");
75

    
76
// ---------------------------------------------------------------------------
// Working directories.
//   srcDir : directory holding the source files (params.rootDir)
//   binDir : <USER_TXM_HOME>/corpora/<basename>, recreated from scratch
//   txmDir : <binDir>/txm/<corpusname>, recreated from scratch
// ---------------------------------------------------------------------------
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME), "corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return;
}

File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// Same guard as for binDir: every following step reads from or writes to
// txmDir, so stop here instead of failing later with a less helpful message.
if (!txmDir.exists()) {
    println "Could not create txmDir "+txmDir
    return;
}
88

    
89
// ---------------------------------------------------------------------------
// Text metadata: read from "metadata.csv" in the source directory, if present.
// 'metadatas' stays null when there is no such file.
// ---------------------------------------------------------------------------
Metadatas metadatas;
File allmetadatasfile = new File(srcDir, "metadata.csv");

if (!allmetadatasfile.exists()) {
    println "No metadata file: "+allmetadatasfile
} else {
    println "Trying to read metadata from: "+allmetadatasfile

    // Work on a copy stored in the binary corpus directory.
    File copy = new File(binDir, "metadata.csv")
    boolean copied = FileCopy.copy(allmetadatasfile, copy)
    if (!copied) {
        println "Error: could not create a copy of metadata file "+allmetadatasfile.getAbsoluteFile();
        return;
    }

    // CSV format settings come from the Toolbox parameters.
    def csvEncoding = Toolbox.getParam(Toolbox.METADATA_ENCODING)
    def columnSeparator = Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR)
    def textSeparator = Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR)
    metadatas = new Metadatas(copy, csvEncoding, columnSeparator, textSeparator, 1)
}
104

    
105
// ---------------------------------------------------------------------------
// IMPORTER step: run the txt importer over the source files.
// ---------------------------------------------------------------------------
def suffixes = ["txt", "TXT"]
println "-- IMPORTER - Reading source files with extension "+suffixes
if (MONITOR != null) {
    // Stop the whole script if the user canceled the job.
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "IMPORTER - Reading source files with extension "+suffixes)
}

boolean imported = new importer().run(srcDir, binDir, txmDir,encoding, suffixes, basename, lang)
if (!imported) {
    println "Import process stopped";
    return;
}
113

    
114
// ---------------------------------------------------------------------------
// INJECTING METADATA step: for every file of txmDir, write a version with the
// CSV metadata injected into a temporary file, then put it in place of the
// original. Skipped entirely when no metadata was loaded.
// ---------------------------------------------------------------------------
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "INJECTING METADATA - from csv file: "+allmetadatasfile)
if (metadatas != null) {
    println "-- INJECTING METADATA - from csv file: "+allmetadatasfile
    println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+txmDir)
    for (File infile : txmDir.listFiles()) {
        print "."
        // The temporary file is created next to the original, i.e. inside txmDir.
        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());

        if (!metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
            // Injection failed: keep the original untouched, drop the temp file.
            outfile.delete();
        } else {
            boolean replaced = infile.delete() && outfile.renameTo(infile)
            if (!replaced) {
                println "Warning can't rename file "+outfile+" to "+infile
                // If the original could not be deleted it is still usable, but the
                // temp file would stay in txmDir, where the edition step lists every
                // file and would process it as an extra text. Remove it.
                // When the original is already gone the temp file is the only
                // remaining copy, so it is deliberately kept.
                if (infile.exists()) outfile.delete()
            }
            if (!infile.exists()) {
                println "Error: could not replace $infile by $outfile"
                return false;
            }
        }
    }
    println ""
}
135

    
136
// ---------------------------------------------------------------------------
// ANNOTATE step: run the annotation tools when the corpus asks for it.
// 'annotationSuccess' is false only if annotation was requested and failed.
// ---------------------------------------------------------------------------
println "-- ANNOTATE - Running NLP tools"
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "ANNOTATE - Running NLP tools")
}

boolean annotationSuccess = true;
if (annotate) {
    String modelFile = model+".par"
    if (!new Annotate().run(binDir, txmDir, modelFile)) {
        annotationSuccess = false;
    }
}
143

    
144
// ---------------------------------------------------------------------------
// COMPILING step: configure and run the compiler, then reset the HTML
// output directory of the corpus.
// ---------------------------------------------------------------------------
println "-- COMPILING - Building Search Engine indexes"
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "COMPILING - Building Search Engine indexes")
}

def c = new compiler(null, "", corpusname, "default");
//c.setCwbPath(userDir+"/TXM/cwb/bin/")// for developers
if (metadatas != null) {
    // Metadata columns are handed to the compiler only when a CSV was loaded.
    c.setMetadataAttributes(metadatas.getSattributes())
}
c.setLang(lang);
c.setAnnotationSuccess(annotationSuccess)
if (debug) {
    c.setDebug();
}

boolean compiled = c.run(binDir, txmDir, corpusname)
if (!compiled) {
    println "Import process stopped";
    return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// Start from an empty HTML/<corpusname> directory.
File htmlDir = new File(binDir,"HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
162
// ---------------------------------------------------------------------------
// EDITION step: for every file of txmDir (in sorted order) build the paged
// HTML edition and register the text, its "default" edition and its pages
// in the import parameters.
// ---------------------------------------------------------------------------
if (build_edition) {

    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition")

    File outdir = new File(binDir, "HTML/$corpusname/default/");
    outdir.mkdirs();

    // File.listFiles() returns null when the directory cannot be listed;
    // fall back to an empty list instead of failing in Collections.sort().
    File[] txmFiles = txmDir.listFiles();
    if (txmFiles == null) {
        println "Warning: could not list files of "+txmDir
    }
    List<File> filelist = (txmFiles == null) ? new ArrayList<File>() : new ArrayList<File>(Arrays.asList(txmFiles));
    Collections.sort(filelist);

    for (File srcfile : filelist) {
        print "."
        // Text name = file name without its last extension.
        String txtname = srcfile.getName();
        int dot = txtname.lastIndexOf(".");
        if (dot > 0) txtname = txtname.substring(0, dot);

        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

        def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, null);
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // Register one page per produced page file. Pages are numbered from 1
        // and paired with the entry at the same position in ed.getIdx().
        def pageFiles = ed.getPageFiles();
        def wordIds = ed.getIdx();
        for (int page = 0; page < pageFiles.size(); page++) {
            String wordid = wordIds.get(page);
            params.addPage(edition, ""+(page + 1), wordid);
        }
    }
}
195
// ---------------------------------------------------------------------------
// FINALIZING step: save the updated parameters next to the binary corpus.
// ---------------------------------------------------------------------------
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FINALIZING")
}

File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);

// 'readyToLoad' is assigned without being declared, so it is stored in the
// script binding; presumably the caller reads it to know the import succeeded.
readyToLoad = true;