Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xmltxmpara / xmltxmparaLoader.groovy @ 967

History | View | Annotate | Download (7.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
// $LastChangedDate: 2011-06-03 12:37:57 +0200 (Fri, 03 Jun 2011) $
23
// $LastChangedRevision: 1867 $
24
// $LastChangedBy: mdecorde $ 
25
//
26
package org.txm.importer.xmltxmpara;
27

    
28
import org.txm.importer.xmltxmpara.compiler;
29
import org.txm.importer.xml.pager_old;
30
import org.txm.objects.*;
31
import org.txm.utils.*
32
import org.txm.utils.io.*;
33
import org.txm.*;
34
import org.txm.core.engines.*;
35
import org.txm.importer.xmltxm.*;
36
import org.txm.utils.i18n.*;
37
import org.txm.importer.*;
38
import org.txm.importer.cwb.CwbAlign;
39
import org.txm.importer.cwb.PatchCwbRegistry;
40
import org.w3c.dom.Element
41

    
42
// Script-level state shared by the import steps below.
String userDir = System.getProperty("user.home")
String rootDir, lang, encoding, model, basename, xsl

// "rootDirBinding" is not declared in this script; presumably it is supplied
// through the script binding by the caller -- confirm.
try {
    rootDir = rootDirBinding
}
catch (Exception) {
    // Reading the variable failed: the script is treated as a debugging run.
    println "DEV MODE"
    if (!org.txm.Toolbox.isInitialized()) {
        // Hard-coded developer defaults, applied only when the Toolbox has not been initialised yet.
        rootDir = userDir + "/xml/qgraal"
        Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"))
    }
}
62

    
63
// Source directory of the import, and its output directory
// <TXM home>/corpora/<source directory name>, which is wiped and recreated.
File srcDir = new File(rootDir)
String corpusSubPath = "corpora/" + srcDir.getName()
File binDir = new File(Toolbox.getTxmHomePath(), corpusSubPath)
binDir.deleteDir()
binDir.mkdirs()

// Stop the import when the output directory could not be created.
boolean binDirReady = binDir.exists()
if (!binDirReady) {
    println "Could not create binDir "+binDir
    return
}
71

    
72
// The import parameters live in "import.xml" inside the source directory;
// a copy is placed in the output directory and loaded from there.
File srcParamFile = new File(rootDir, "import.xml")
File paramFile = new File(binDir, "import.xml")

// Nothing can be imported without the parameter file.
if (!srcParamFile.exists()) {
    println "Stop import: can't find params file: "+srcParamFile
    return
}

FileCopy.copy(srcParamFile, paramFile)

// Check that the copy actually produced the target file.
if (!paramFile.exists()) {
    println "Stop import: failed to copy param file: "+srcParamFile+" to "+paramFile
    return
}

BaseParameters params = new BaseParameters(paramFile)
params.load()
// The base name is taken from the loaded parameters.
basename = params.name
86

    
87
// Working directory that receives the validated source files; recreated on every run.
File paraDir = new File(binDir, "paracorpus/")
paraDir.deleteDir()
paraDir.mkdir()

// Apply XSL
// NOTE(review): `xsl` is never assigned in the visible part of this script,
// so this branch is currently skipped.
if (xsl != null && xsl.trim().length() > 0) {
    // After the transformation the sources are read from <binDir>/src (see the
    // reassignment of srcDir below), so that directory is used as the output.
    File xslOutDir = new File(binDir, "src")
    // The original call wrapped all three arguments in a single File constructor
    // (`new File(xsl, new File(rootDir), srcDir)`); java.io.File has no such
    // constructor, so the call could not succeed.
    // NOTE(review): argument order (stylesheet, input dir, output dir) is assumed --
    // confirm against ApplyXsl2.processImportSources.
    if (!ApplyXsl2.processImportSources(new File(xsl), srcDir, xslOutDir)) {
        return // error during process
    }
    srcDir = xslOutDir
}
97

    
98
// Copy txm files: every source file that passes XML validation is copied into paraDir.
// File.listFiles() returns null when the path is not a readable directory;
// fall back to an empty list so the loop is simply skipped in that case.
List<File> srcfiles = srcDir.listFiles() ?: []
for (File f : srcfiles) {
    String fileName = f.getName()
    // Skip the import parameters, the metadata file and property files.
    if (fileName.equals("import.xml") || fileName.matches("metadata\\.....?") || fileName.endsWith(".properties")) {
        continue
    }
    if (ValidateXml.test(f)) {
        FileCopy.copy(f, new File(paraDir, fileName))
    } else {
        println "Won't process file "+f
    }
}

// listFiles() yields an empty array (not null) for an existing but empty directory,
// so both cases must be checked to detect that nothing was copied.
File[] copiedFiles = paraDir.listFiles()
if (copiedFiles == null || copiedFiles.length == 0) {
    println "No txm file to process"
    return
}
114

    
115
// For each corpus declared in the parameters: read its language and encoding,
// split <paraDir>/<corpus>.xml into files under <binDir>/txm/<corpus>, and
// register every resulting file in the parameters.
println "Getting lang of each corpus and splitting corpus: "+params.corpora.keySet()
for (String corpusname : params.corpora.keySet()) {
    // Look the corpus element up once; it is reused for the attributes and for addText().
    def corpusElem = params.corpora.get(corpusname)

    lang = corpusElem.getAttribute("lang")
    encoding = corpusElem.getAttribute("encoding")
    model = lang

    print "."
    println "split in"
    // Output directory of the split, rebuilt from scratch.
    File txmDir = new File(binDir, "txm/$corpusname")
    txmDir.deleteDir()
    txmDir.mkdirs()

    println "split corpus $corpusname by TEI@id"
    File f = new File(paraDir, corpusname+".xml")
    if (!f.exists()) {
        // Previously this was only logged and the missing file was still handed
        // to SplitBy; abort here with the same result as a failed split.
        println "source file is missing: "+f
        return false
    }

    def splitter = new SplitBy(f)
    if (!splitter.process(txmDir, "TEI", "id")) {
        println "failed to split corpus "+corpusname
        return false
    }
    for (File textfile : splitter.getFiles()) {
        params.addText(corpusElem, basename, textfile)
    }
}
147

    
148
// Run the compiler once per corpus; the import stops at the first failure.
// NOTE(review): `base` and `corpusfiles` are not declared in the visible part of
// this script -- presumably provided elsewhere (e.g. the script binding); confirm.
println "-- COMPILING - Building Search Engine indexes"
for (String corpusname : base.getCorpora().keySet()) {
    println "Build corpus "+corpusname
    def corpusCompiler = new compiler()
    corpusCompiler.setDebug()
    //corpusCompiler.setCwbPath("~/TXM/cwb/bin");
    boolean compiled = corpusCompiler.run(corpusfiles.get(corpusname), binDir, corpusname, basename)
    if (!compiled) {
        println "Compiler failed"
        return
    }
}
159

    
160
// For every declared alignment, run cwb-align on each ordered pair of distinct
// target corpora, patch the registry entry, then encode the alignment.
// NOTE(review): `parameters` is not declared in the visible part of this script --
// presumably provided elsewhere; confirm.
println "-- ALIGN - aligning CQP corpus with each others"
for (String alignement : parameters.getAlignements()) {
    println "aligning..."+alignement
    String alignStructure = parameters.getStructure(alignement)
    String alignAttribute = "align"

    String regpath = new File(binDir, "registry").getAbsolutePath()

    if (!CwbAlign.isExecutableAvailable()) {
        println ("Error: CWB executables not well set.")
        return false
    }

    println "aligned corpora: "+parameters.getTargets(alignement)
    for (String corpusName : parameters.getTargets(alignement)) {
        for (String targetName : parameters.getTargets(alignement)) {
            // A corpus is never aligned with itself.
            if (corpusName == targetName) {
                continue
            }
            try {
                println "build align index: "+corpusName+" with "+targetName
                // Equivalent command line:
                // ./cwb-align -V seg_id -r ~/TXM/corpora/tmxtest/registry/ tmxen tmxfr seg
                CwbAlign aligner = new CwbAlign()
                aligner.setV(alignStructure+"_"+alignAttribute)
                aligner.setr(regpath)
                aligner.cwbalign(corpusName, targetName, alignStructure)

                // Patch the registry; the alignment is encoded only when the patch succeeds.
                println " patch reg : "+new File(regpath, corpusName.toLowerCase())
                File registryEntry = new File(regpath, corpusName.toLowerCase())
                boolean patched = PatchCwbRegistry.patchAlignment(registryEntry, targetName.toLowerCase())
                if (patched) {
                    // Equivalent command line:
                    // ./cwb-align-encode -D -r ~/TXM/corpora/tmxtest/registry/ -v out.align
                    CwbAlign encoder = new CwbAlign()
                    encoder.setD()
                    encoder.setv()
                    encoder.setr(regpath)
                    encoder.cwbalignencode("out.align")
                }
            } catch (IOException e) {
                org.txm.utils.logger.Log.printStackTrace(e)
                return
            }
        }
    }
}
205

    
206
// Rebuild <binDir>/HTML (with its "default" sub-directory) and paginate every text.
println "-- EDITION - Building edition"
new File(binDir, "HTML").deleteDir()
new File(binDir, "HTML").mkdir()
new File(binDir, "HTML/default").mkdir()
List<File> filelist = new File(binDir, "txm").listFiles()
// Number of texts already printed; drives the ", " separators and line breaks of the progress output.
def second = 0

println "Paginating text: "
// NOTE(review): `base` is not declared in the visible part of this script -- confirm where it comes from.
for (String textname : base.getTextsID()) {
    Text text = base.getText(textname)
    File srcfile = text.getSource()
    String srcName = srcfile.getName()
    // The last four characters of the file name are dropped
    // (presumably a ".xml" extension -- confirm) and replaced with ".html".
    String stem = srcName.substring(0, srcName.length() - 4)
    File resultfile = new File(binDir, "HTML/"+stem+".html")
    lang = text.getAttribute("lang").toString()
    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang)
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang)

    // Progress output: names separated by ", ", with a line break after every fifth name.
    if (second != 0) {
        print(", ")
        if ((second % 5) == 0) {
            println ""
        }
    }
    print(srcName)
    second++

    // NOTE(review): 500 and "pb" are passed through as-is; presumably the page size
    // and the page-break tag -- confirm against pager_old.
    def ed = new pager_old(srcfile, resultfile, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb")

    // Register the "default" HTML edition, then each produced page with its index entry.
    Edition editionweb = text.addEdition("default", "html", resultfile)
    def pageFiles = ed.getPageFiles()
    def pageIndexes = ed.getIdx()
    pageFiles.eachWithIndex { File pageFile, int i ->
        String idx = pageIndexes.get(i)
        editionweb.addPage(pageFile, idx)
    }
}

// Finally print the parameters document.
DomUtils.print(params.root.getOwnerDocument())