Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xmltxmpara / xmltxmparaLoader.groovy @ 966

History | View | Annotate | Download (7.4 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
// $LastChangedDate: 2011-06-03 12:37:57 +0200 (Fri, 03 Jun 2011) $
23 321 mdecorde
// $LastChangedRevision: 1867 $
24 321 mdecorde
// $LastChangedBy: mdecorde $
25 321 mdecorde
//
26 321 mdecorde
package org.txm.importer.xmltxmpara;
27 321 mdecorde
28 321 mdecorde
import org.txm.importer.xmltxmpara.compiler;
29 321 mdecorde
import org.txm.importer.xml.pager_old;
30 321 mdecorde
import org.txm.objects.*;
31 927 mdecorde
import org.txm.utils.*
32 928 mdecorde
import org.txm.utils.io.*;
33 321 mdecorde
import org.txm.*;
34 927 mdecorde
import org.txm.core.engines.*;
35 927 mdecorde
import org.txm.importer.xmltxm.*;
36 321 mdecorde
import org.txm.utils.i18n.*;
37 321 mdecorde
import org.txm.importer.*;
38 321 mdecorde
import org.txm.importer.cwb.CwbAlign;
39 321 mdecorde
import org.txm.importer.cwb.PatchCwbRegistry;
40 321 mdecorde
import org.w3c.dom.Element
41 321 mdecorde
// ---- Import configuration ----------------------------------------------
// Script-level settings shared by the steps below. In the visible script
// `xsl` is declared but never assigned.
String userDir = System.getProperty("user.home");
String rootDir;
String lang;
String encoding;
String model;
String basename;
String xsl;

// The source directory is read from the "rootDirBinding" variable. If reading
// it fails, the script assumes it is being run by a developer and falls back
// to hard-coded settings.
try {
    rootDir = rootDirBinding;
} catch (Exception ignored) {
    println "DEV MODE";//exception means we debug
    // Always set the fallback source directory. It used to be assigned only
    // when the Toolbox was not yet initialized, which left rootDir null
    // (and the later `new File(rootDir)` failing) when it already was.
    rootDir = userDir+"/xml/qgraal";
    if (!org.txm.Toolbox.isInitialized()) {
        Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir,"TXM"));
    }
}

// ---- Workspace preparation ---------------------------------------------
// The binary corpus directory is <TXM home>/corpora/<source directory name>.
// It is deleted and recreated on every run.
File srcDir = new File(rootDir);
File binDir = new File(Toolbox.getTxmHomePath(),"corpora/"+srcDir.getName());
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return;
}

// The import parameters file is copied into binDir and loaded from the copy.
File srcParamFile = new File(rootDir, "import.xml");
File paramFile = new File(binDir, "import.xml")
if (!srcParamFile.exists()) {
    println "Stop import: can't find params file: "+srcParamFile
    return;
}
FileCopy.copy(srcParamFile, paramFile);
if (!paramFile.exists()) {
    println "Stop import: failed to copy param file: "+srcParamFile+" to "+paramFile
    return;
}
BaseParameters params = new BaseParameters(paramFile);
params.load();
basename = params.name;

// Staging directory that receives the validated source files.
File paraDir = new File(binDir, "paracorpus/");
paraDir.deleteDir();
paraDir.mkdir();
// Same creation check as for binDir, so a failed mkdir stops the import here
// instead of surfacing later as failed copies.
if (!paraDir.exists()) {
    println "Could not create paraDir "+paraDir
    return;
}

// Apply XSL
// Optional pre-processing of the sources with a stylesheet.
// NOTE(review): `xsl` is never assigned in the visible script, so this branch
// only runs if it is set somewhere else.
if (xsl != null && xsl.trim().length() > 0) {
    // Fixed a misplaced parenthesis: the original passed one argument,
    // `new File(xsl, new File(rootDir), srcDir)`, which needs a three-argument
    // File constructor that does not exist. The call now passes three arguments.
    // NOTE(review): the roles are assumed to be (stylesheet, input directory,
    // output directory). Since srcDir is re-pointed to <binDir>/src just
    // below, confirm the output directory against ApplyXsl2.processImportSources.
    if (!ApplyXsl2.processImportSources(new File(xsl), new File(rootDir), srcDir)) {
        return; // error during process
    }
    srcDir = new File(binDir, "src");
}

//copy txm files
// Every file of srcDir that passes ValidateXml.test is copied into paraDir.
// The import parameters file, metadata.* files and *.properties files are
// skipped.
List<File> srcfiles = srcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
    String fileName = f.getName();
    // The backslash is doubled: "\." is not a valid escape in a Groovy
    // double-quoted string. The pattern is "metadata." followed by 3 or 4
    // characters.
    if (fileName.equals("import.xml") || fileName.matches("metadata\\.....?") || fileName.endsWith(".properties")) {
        continue;
    }
    if (ValidateXml.test(f)) {
        FileCopy.copy(f, new File(paraDir, fileName));
    } else {
        println "Won't process file "+f;
    }
}

// File.listFiles() returns an empty array, not null, for an existing empty
// directory, so the emptiness test must look at the length as well.
File[] stagedFiles = paraDir.listFiles();
if (stagedFiles == null || stagedFiles.length == 0) {
    println "No txm file to process"
    return;
}

//contains the text files associated to a corpus
// For each corpus declared in the parameters, <paraDir>/<corpus>.xml is split
// into <binDir>/txm/<corpus> with SplitBy on element "TEI" and attribute "id".
// Each resulting file is registered in the parameters as a text.
println "Getting lang of each corpus and splitting corpus: "+params.corpora.keySet()
for (String corpusname : params.corpora.keySet()) {
    def corpusElem = params.corpora.get(corpusname)

    lang = corpusElem.getAttribute("lang");
    encoding = corpusElem.getAttribute("encoding");
    model = lang

    print "."
    println "split in"
    File txmDir = new File(binDir,"txm/$corpusname");
    txmDir.deleteDir();
    txmDir.mkdirs();

    println "split corpus $corpusname by TEI@id"
    File f = new File(paraDir, corpusname+".xml")
    if (!f.exists()) {
        println "source file is missing: "+f;
        // Stop here instead of handing a nonexistent file to SplitBy.
        // This returns the same value as the split-failure branch below.
        return false;
    }

    def splitter = new SplitBy(f);
    if (!splitter.process(txmDir, "TEI", "id")) {
        println "failed to split corpus "+corpusname;
        return false;
    }
    for (File textfile : splitter.getFiles()) {
        params.addText(corpusElem, basename, textfile)
    }
}

// Runs the compiler once per corpus and stops the import at the first failure.
// NOTE(review): `base` and `corpusfiles` are not defined anywhere in the
// visible script. They must come from the script binding or be replaced
// (possibly by `params`) for this step to run; confirm before relying on it.
println "-- COMPILING - Building Search Engine indexes"
for (String corpusname : base.getCorpora().keySet()) {
    println "Build corpus "+corpusname
    def c = new compiler();
    c.setDebug();
    //c.setCwbPath("~/TXM/cwb/bin");
    if (!c.run(corpusfiles.get(corpusname), binDir, corpusname, basename)) {
        println "Compiler failed"
        return;
    }
}

// For every declared alignment, each ordered pair of distinct target corpora
// is aligned with CwbAlign, the registry entry of the first corpus is patched,
// and, when the patch succeeds, the alignment file is encoded.
// NOTE(review): `parameters` is not defined anywhere in the visible script;
// confirm where it comes from.
println "-- ALIGN - aligning CQP corpus with each others"
for (String alignement : parameters.getAlignements()) {
    println "aligning..."+alignement
    String alignStructure = parameters.getStructure(alignement);
    String alignAttribute = "align";

    String regpath = new File(binDir,"registry").getAbsolutePath();

    if (!(CwbAlign.isExecutableAvailable())) {
        println ("Error: CWB executables not well set.")
        return false;
    }

    println "aligned corpora: "+parameters.getTargets(alignement);
    for (String corpusName : parameters.getTargets(alignement)) {
        for (String targetName : parameters.getTargets(alignement)) {
            // A corpus is never aligned with itself.
            if (corpusName == targetName) {
                continue;
            }
            try {
                println "build align index: "+corpusName+" with "+targetName
                //./cwb-align -V seg_id -r ~/TXM/corpora/tmxtest/registry/ tmxen tmxfr seg
                CwbAlign tt = new CwbAlign();
                tt.setV(alignStructure+"_"+alignAttribute);
                tt.setr(regpath);
                tt.cwbalign(corpusName, targetName, alignStructure);

                //patch registry
                File registryFile = new File(regpath, corpusName.toLowerCase());
                println " patch reg : "+registryFile
                if (PatchCwbRegistry.patchAlignment(registryFile, targetName.toLowerCase())) {
                    // ./cwb-align-encode -D -r ~/TXM/corpora/tmxtest/registry/ -v out.align
                    tt = new CwbAlign();
                    tt.setD();
                    tt.setv();
                    tt.setr(regpath);
                    tt.cwbalignencode("out.align");
                }
            } catch (IOException e) {
                org.txm.utils.logger.Log.printStackTrace(e);
                return;
            }
        }
    }
}

// The HTML output directories are recreated, then each text is paginated with
// pager_old and every produced page is registered on the text's "default"
// edition. The parameters document is printed at the end.
// NOTE(review): `base` is not defined anywhere in the visible script; confirm
// where it comes from.
println "-- EDITION - Building edition"
new File(binDir,"HTML").deleteDir();
new File(binDir,"HTML").mkdir();
new File(binDir,"HTML/default").mkdir();
def count = 0 // number of texts already listed on the console

println "Paginating text: "
for (String textname : base.getTextsID()) {
    Text text = base.getText(textname);
    File srcfile = text.getSource();

    // Strip the extension at the last dot instead of chopping 4 characters.
    // The result is the same for ".xml" names, but a short name no longer
    // throws and an extension of another length is no longer mangled.
    String srcName = srcfile.getName();
    int dot = srcName.lastIndexOf(".");
    String stem = (dot > 0) ? srcName.substring(0, dot) : srcName;
    File resultfile = new File(binDir,"HTML/"+stem+".html");

    lang = text.getAttribute("lang").toString();
    List<String> noSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> noSpaceAfter = LangFormater.getNoSpaceAfter(lang);

    // Console progress: comma-separated file names, five per line.
    if (count > 0) { print(", ") }
    if (count > 0 && (count % 5) == 0) println ""
    print(srcName);
    count++

    def ed = new pager_old(srcfile,resultfile, noSpaceBefore, noSpaceAfter, 500, basename, "pb");

    Edition editionweb = text.addEdition("default", "html", resultfile);
    for (int i = 0 ; i < ed.getPageFiles().size() ; i++) {
        File f = ed.getPageFiles().get(i);
        String idx = ed.getIdx().get(i);
        editionweb.addPage(f,idx);
    }
}

DomUtils.print(params.root.getOwnerDocument())