Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / tmx / tmxLoader.groovy @ 3051

History | View | Annotate | Download (6.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2015-08-31 13:45:22 +0200 (lun. 31 août 2015) $
25
// $LastChangedRevision: 3028 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.tmx;
29
import org.txm.searchengine.cqp.corpus.MainCorpus;
30

    
31
import java.io.File;
32
import org.txm.scripts.importer.tmx.importer;
33
import org.txm.scripts.importer.tmx.compiler;
34
import org.txm.scripts.importer.tmx.pager;
35
import org.txm.objects.*;
36
import org.txm.utils.*
37
import org.txm.utils.io.*;
38
import org.txm.*;
39
import org.txm.core.engines.*;
40
import org.txm.importer.scripts.xmltxm.*;
41
import org.txm.utils.i18n.*;
42
import org.w3c.dom.Element
43
import org.txm.utils.xml.DomUtils;
44

    
45
// Script-level state.
// 'projectBinding' and 'monitor' are not declared anywhere in this script, so
// they are resolved from the script binding supplied by whoever runs the script.
String userDir = System.getProperty("user.home")
def MONITOR
Project project

try {
	project = projectBinding
	MONITOR = monitor
} catch (Exception ignored) {
	// Deliberately best effort: when a binding variable is missing the
	// corresponding local simply keeps its null value. A missing project is
	// reported just below; a missing monitor is tolerated by the null checks
	// performed before each use of MONITOR.
}

if (project == null) {
	println "no project set. Aborting"
	return
}
52

    
53
// ---- Parameters read from the project ----
// NOTE(review): corpusname, lang, model, encoding, xsl and xslParams are not
// referenced again in the visible part of this script (corpusname and lang only
// feed basename and model).
String corpusname = project.getName()
String basename = corpusname
String rootDir = project.getSrcdir()
String lang = project.getLang()
String model = lang
String encoding = project.getEncoding()
boolean annotate = project.getAnnotate()
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()

// ---- Settings of the "default" edition ----
int wordsPerPage = project.getEditionDefinition("default").getWordsPerPage()
String page_element = project.getEditionDefinition("default").getPageElement()
boolean build_edition = project.getEditionDefinition("default").getBuildEdition()

// ---- Working directories ----
File srcDir = new File(rootDir)
File binDir = project.getProjectDirectory()
binDir.mkdirs()
if (!binDir.exists()) {
	// mkdirs() also returns false when the directory already exists, hence the
	// explicit existence test instead of checking its return value.
	println("Could not create binDir " + binDir)
	return
}

// The "txm" directory is wiped and recreated on every run.
File txmDir = new File(binDir, "txm")
txmDir.deleteDir()
txmDir.mkdirs()

// Both maps are passed to the importer and reused by the later steps.
HashMap<String, String> textLangs = [:]
def langGroups = [:]
79

    
80
// ---- Step 1: read the source files ----
println "-- IMPORTER - Reading source files"
if (MONITOR?.isCanceled()) {
	// Cancellation requested: close the monitor and leave the script.
	return MONITOR.done()
}
MONITOR?.worked(20, "IMPORTER - Reading source files")

// textLangs and langGroups are handed to the importer as empty maps and are
// read back by the following steps.
def tmxImporter = new importer()
tmxImporter.run(srcDir, binDir, txmDir, textLangs, langGroups, project)
def corpusIDS = tmxImporter.getCorpusIDS()
86

    
87
// ---- Step 2 (optional): annotate the texts with TreeTagger ----
// Progress is reported whether or not annotation is enabled.
MONITOR?.worked(20, "ANNOTATE - Running NLP tools")
if (annotate) {
	println "-- ANNOTATE - Running NLP tools"
	def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")

	// Same keys as textLangs, with every language value lowercased.
	def mapForTreeTagger = [:]
	textLangs.each { textKey, textLang ->
		mapForTreeTagger[textKey] = textLang.toLowerCase()
	}
	println "TreeTagger models to use per text: $mapForTreeTagger"

	if (engine.processDirectory(txmDir, binDir, ["langs":mapForTreeTagger])) {
		// 'annotationSuccess' is not declared in this script, so the value is
		// stored in the script binding.
		annotationSuccess = true
		if (project.getCleanAfterBuild()) {
			// Remove the intermediate directories of the annotation step.
			["treetagger", "ptreetagger", "annotations"].each { dirName ->
				new File(binDir, dirName).deleteDir()
			}
		}
	}
}
103
//println "langs : "+textLangs
104
//println "texts : "+langGroups
105

    
106
// ---- Step 3: build the search engine indexes ----
println "-- COMPILING - Building Search Engine indexes"
if (MONITOR?.isCanceled()) {
	return MONITOR.done()
}
MONITOR?.worked(20, "COMPILING - Building Search Engine indexes")

def corpusCompiler = new compiler()
// 'debug' is not declared in this script; it is read from the script binding.
if (debug) {
	corpusCompiler.setDebug()
}
corpusCompiler.setAlignment("seg", "id")
corpusCompiler.setLangs(textLangs)
corpusCompiler.setCorpusIDS(corpusIDS)
corpusCompiler.setLangGroups(langGroups)

boolean compiled = corpusCompiler.run(project, binDir, txmDir, basename)
if (!compiled) {
	println "Compiler failed"
	return
}
119

    
120
// ---- Step 4 (optional): build the "default" HTML edition ----
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// The HTML output directory is reset on every run, even when no edition is built.
File htmlDir = new File(binDir, "HTML")
htmlDir.deleteDir()
htmlDir.mkdirs()

if (build_edition) {

	println "-- EDITION - Building edition"
	if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition")

	// One list per text name, created on first use and handed to every pager
	// built for a file with that name (across all corpus directories).
	def splitTUsPerText = [:]

	// Each child of the "txm" directory is processed as one corpus directory.
	for (File txmCorpusDir : txmDir.listFiles()) {
		def txmFiles = txmCorpusDir.listFiles()
		txmFiles.sort() // process the files in a stable order

		File outdir = new File(binDir, "/HTML/"+txmCorpusDir.getName()+"/default/")
		outdir.mkdirs()

		for (File txmFile : txmFiles) {
			print "."
			String txtname = FileUtils.stripExtension(txmFile)

			def splitTUs = splitTUsPerText[txtname]
			if (splitTUs == null) {
				splitTUs = []
				splitTUsPerText[txtname] = splitTUs
			}

			// Language of the text, lowercased. The null test must come BEFORE
			// toLowerCase(): a text without a registered language falls back
			// to "fr" instead of raising a NullPointerException.
			String l = textLangs.get(txmFile.getName())
			l = (l == null) ? "fr" : l.toLowerCase()
			List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(l)
			List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(l)

			Text t = new Text(project)
			t.setName(txtname)
			t.setSourceFile(txmFile)
			t.setTXMFile(txmFile)

			def textPager = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, page_element, splitTUs)
			Edition edition = new Edition(t)
			edition.setName("default")
			edition.saveParameter("corpus", txmCorpusDir.getName())
			edition.setIndex(outdir.getAbsolutePath())

			// Register one page per generated page file, numbered from 1.
			// The word id comes from the pager index at the same position;
			// "w_0" is used when the index has no entry for that page.
			def pageFiles = textPager.getPageFiles()
			def pageWordIds = textPager.getIdx()
			for (int pageIndex = 0; pageIndex < pageFiles.size(); pageIndex++) {
				String wordid = "w_0"
				if (pageIndex < pageWordIds.size()) wordid = pageWordIds.get(pageIndex)
				edition.addPage("" + (pageIndex + 1), wordid)
			}
		}

		// Copy the stylesheets next to the pages when they are available.
		File cssfile = new File(Toolbox.getTxmHomePath(), "css/tmx.css")
		if (cssfile.exists()) {
			FileCopy.copy(cssfile, new File(outdir, "tmx.css"))
		}
		File stylefile = new File(binDir, "style.css")
		if (stylefile.exists()) {
			FileCopy.copy(stylefile, new File(outdir, "style.css"))
		}
		println ""
	}
}
190

    
191
// ---- Step 5: finalize ----
if (MONITOR?.isCanceled()) {
	return MONITOR.done()
}
MONITOR?.worked(20, "FINALIZING")

// 'readyToLoad' is not declared in this script, so the result of saving the
// project is stored in the script binding.
readyToLoad = project.save()