Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / txt / txtLoader.groovy @ 2554

History | View | Annotate | Download (6.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.txt;
29

    
30
import org.txm.scripts.importer.txt.importer;
31
import org.txm.scripts.importer.txt.compiler;
32
import org.txm.scripts.importer.xml.pager;
33
import org.txm.objects.*;
34
import org.txm.importer.scripts.xmltxm.*;
35
import org.txm.*;
36
import org.txm.objects.*;
37
import org.txm.core.engines.*;
38
import org.txm.utils.i18n.*;
39
import org.txm.metadatas.*;
40
import org.txm.utils.io.FileCopy;
41
import org.w3c.dom.Element
42
import org.txm.utils.xml.DomUtils;
43
import org.txm.importer.*
44

    
45
// User home directory; within this script it is only mentioned by the
// commented-out developer CWB path of the compile step.
String userDir = System.getProperty("user.home")

def MONITOR
Project project

// projectBinding and monitor are not declared in this script: they have to be
// provided from outside (presumably through the script binding - TODO confirm).
// When one of them cannot be resolved, the variables not yet assigned stay null.
try {
        project = projectBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // deliberately ignored: a missing project is reported right below
}
if (project == null) {
        println "no project set. Aborting"
        return
}
53

    
54
// Import parameters, all read from the project
String corpusname = project.getName()
String basename = corpusname // the corpus name is also used as base name
String rootDir = project.getSrcdir()
String lang = project.getLang()
String model = lang // the language code is what gets passed as "lang" option to the annotation engine
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()

// Settings of the "default" edition
def defaultEditionDefinition = project.getEditionDefinition("default")
int wordsPerPage = defaultEditionDefinition.getWordsPerPage()
boolean build_edition = defaultEditionDefinition.getBuildEdition()
65

    
66
// Source directory and binary (project) directory
File srcDir = new File(rootDir)
File binDir = project.getProjectDirectory()
binDir.mkdirs()
// without a binary directory nothing can be written: give up
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return
}

// Always start from an empty "txm/<corpus name>" directory
File txmDir = new File(binDir, "txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
77

    
78
// Get the text metadata values from the CSV file of the source directory
Metadatas metadatas; // stays null when no metadata file exists
File allMetadataFile = Metadatas.findMetadataFile(srcDir)

if (!allMetadataFile.exists()) {
        println "No metadata file: "+allMetadataFile
} else {
        println "Trying to read metadata from: "+allMetadataFile
        // the metadata are read from a copy stored in the binary directory
        File copy = new File(binDir, allMetadataFile.getName())
        if (!FileCopy.copy(allMetadataFile, copy)) {
                println "Error: could not create a copy of metadata file "+allMetadataFile.getAbsoluteFile()
                return
        }
        metadatas = new Metadatas(
                copy,
                Toolbox.getMetadataEncoding(),
                Toolbox.getMetadataColumnSeparator(),
                Toolbox.getMetadataTextSeparator(),
                1)
}
95

    
96
// IMPORTER step: run the txt importer on the source files having one of these extensions
def suffixes = ["txt", "TXT"]
println "-- IMPORTER - Reading source files with extension "+suffixes
// stop here when the user canceled the job
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "IMPORTER - Reading source files with extension "+suffixes)
if (!new importer().run(srcDir, binDir, txmDir, encoding, suffixes, basename, lang)) {
        println "Import process stopped"
        return
}
104

    
105
// INJECTING METADATA step: only does something when a metadata file was read
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "INJECTING METADATA - from csv file: "+allMetadataFile)
if (metadatas != null) {
        println "-- INJECTING METADATA - from csv file: "+allMetadataFile
        println("Injecting metadata: "+metadatas.getHeadersList()+" in texts of directory "+txmDir)
        for (File infile : txmDir.listFiles()) {
                print "."
                // the result is written to a temporary file created next to the text file
                File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())

                if (!metadatas.injectMetadatasInXml(infile, outfile, "text", null)) {
                        // nothing injected: drop the temporary file, the text file is not touched here
                        outfile.delete()
                        continue
                }

                // swap the temporary file in place of the text file
                boolean replaced = infile.delete() && outfile.renameTo(infile)
                if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
                if (!infile.exists()) {
                        println "Error: could not replace $infile by $outfile"
                        return false
                }
        }
        println ""
}
126

    
127

    
128
// ANNOTATE step: run the TreeTagger annotation engine on the files of txmDir
// when the project asks for annotation.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE - Running NLP tools")
// Must start at false: it only becomes true when annotation was requested AND
// the engine reports success. It is handed to the compiler afterwards.
boolean annotationSuccess = false;
if (annotate) {
        println "-- ANNOTATE - Running NLP tools"
        def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
        if (engine == null) {
                // no engine to call: continue the import without annotation instead of failing
                println "Warning: no TreeTagger annotation engine available, the texts are not annotated"
        } else if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
                annotationSuccess = true;
        }
}
138

    
139
// COMPILING step: build the search engine indexes with the txt compiler
println "-- COMPILING - Building Search Engine indexes"
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "COMPILING - Building Search Engine indexes")
def c = new compiler(null, "", corpusname, "default")
//c.setCwbPath(userDir+"/TXM/cwb/bin/")// for developers
if (metadatas != null) {
        c.setMetadataAttributes(metadatas.getSattributes())
}
c.setLang(lang)
c.setAnnotationSuccess(annotationSuccess)
// debug is not declared in this script; it has to be provided from outside
if (debug) {
        c.setDebug()
}
if (!c.run(project)) {
        println "Import process stopped"
        return
}
//println "basename :"+basename;
if (MONITOR?.isCanceled()) { return MONITOR.done() }

// Start from an empty HTML directory for this corpus
File htmlDir = new File(binDir, "HTML/$corpusname")
htmlDir.deleteDir()
htmlDir.mkdirs()
157
// EDITION step: for each file of txmDir (in sorted order) declare a Text in the
// project, run the pager on it and register the resulting pages in a "default" Edition.
if (build_edition) {

        println "-- EDITION - Building edition"
        if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition")

        File outdir = new File(binDir, "HTML/$corpusname/default/");
        outdir.mkdirs();
        // File.listFiles() returns null when the directory cannot be listed:
        // fall back to an empty list instead of failing in Collections.sort()
        List<File> filelist = (txmDir.listFiles() ?: []) as List<File>;
        Collections.sort(filelist);

        // these only depend on the language, not on the text: fetch them once
        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

        ConsoleProgressBar cpb = new ConsoleProgressBar(filelist.size());
        for (File srcfile : filelist) {
                cpb.tick()
                // text name = file name without its last extension
                String txtname = srcfile.getName();
                int dot = txtname.lastIndexOf(".");
                if (dot > 0) txtname = txtname.substring(0, dot);

                Text t = new Text(project);
                t.setName(txtname);
                t.setSourceFile(srcfile)
                t.setTXMFile(srcfile)

                def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, null);
                Edition edition = new Edition(t);
                edition.setName("default");
                edition.setIndex(outdir.getAbsolutePath());

                // pages are named "1", "2", ...; each one is registered with the matching
                // entry of ed.getIdx(), or "w_0" when there is no entry for that page
                def pageFiles = ed.getPageFiles();
                def pageWordIds = ed.getIdx();
                for (int page = 0 ; page < pageFiles.size() ; page++) {
                        String wordid = (page < pageWordIds.size()) ? pageWordIds.get(page) : "w_0";
                        edition.addPage(""+(page + 1), wordid);
                }
        }
        cpb.done()
}
196

    
197
// FINALIZING step
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "FINALIZING")

// readyToLoad is not declared in this script: the result of project.save()
// is therefore stored in a variable provided from outside the script's locals
readyToLoad = project.save()