Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / src / org / txm / importer / srcmf / tigersearchLoader.groovy @ 458

History | View | Annotate | Download (7.8 kB)

1 458 mdecorde
package org.txm.importer.srcmf
2 458 mdecorde
3 458 mdecorde
// srcdir
4 458 mdecorde
// txml/master.xml
5 458 mdecorde
// txml/*.xml
6 458 mdecorde
// txm/*.xml
7 458 mdecorde
8 458 mdecorde
// 1 import xml-txm
9 458 mdecorde
import ims.tiger.gui.tigerregistry.TIGERRegistry;
10 458 mdecorde
import ims.tiger.index.writer.IndexBuilderErrorHandler;
11 458 mdecorde
import ims.tiger.index.writer.SimpleErrorHandler;
12 458 mdecorde
import ims.tiger.index.writer.XMLIndexing;
13 458 mdecorde
import ims.tiger.system.*;
14 458 mdecorde
15 458 mdecorde
import javax.xml.stream.*;
16 458 mdecorde
17 458 mdecorde
import java.io.BufferedWriter;
18 458 mdecorde
import java.io.File;
19 458 mdecorde
import java.io.FileOutputStream;
20 458 mdecorde
import java.io.IOException;
21 458 mdecorde
import java.io.OutputStreamWriter;
22 458 mdecorde
import java.io.PrintStream;
23 458 mdecorde
import java.net.URL;
24 458 mdecorde
import java.util.HashMap;
25 458 mdecorde
import java.util.logging.FileHandler
26 458 mdecorde
import java.io.File;
27 458 mdecorde
28 458 mdecorde
import org.apache.log4j.BasicConfigurator;
29 458 mdecorde
import org.txm.export.ts.*;
30 458 mdecorde
import org.txm.importer.ApplyXsl2;
31 458 mdecorde
import org.txm.utils.xml.DomUtils;
32 458 mdecorde
import org.txm.importer.ValidateXml;
33 458 mdecorde
import org.txm.importer.XPathResult;
34 458 mdecorde
import org.txm.importer.xmltxm.compiler;
35 458 mdecorde
import org.txm.importer.xml.pager;
36 458 mdecorde
import org.txm.objects.*;
37 458 mdecorde
import org.txm.utils.*;
38 458 mdecorde
import org.txm.*;
39 458 mdecorde
import org.txm.scripts.teitxm.*;
40 458 mdecorde
import org.txm.metadatas.*;
41 458 mdecorde
import org.txm.utils.i18n.*;
42 458 mdecorde
import org.w3c.dom.Element
43 458 mdecorde
import org.xml.sax.SAXException;
44 458 mdecorde
45 458 mdecorde
import javax.xml.parsers.DocumentBuilderFactory
46 458 mdecorde
47 458 mdecorde
// ---- Bootstrap: obtain the import parameters and the progress monitor ------
// The script first tries to read `paramsBinding` and `monitor`. If either
// lookup throws, it switches to a developer configuration loaded from
// <user.home>/xml/slethgier/import.xml.
String userDir = System.getProperty("user.home")
boolean debug = false
def MONITOR
BaseParameters params
try {
        params = paramsBinding
        MONITOR = monitor
} catch (Exception ignored) {
        // An exception here means we are debugging outside the normal launcher.
        println "DEV MODE"
        debug = true
        params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
        params.load()
        if (!org.txm.Toolbox.isInitialized()) {
                // Minimal Toolbox settings for a standalone run.
                Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
                Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"))
                Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"))
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"))
        }
}
if (params == null) {
        println "no parameters. Aborting"
        return
}
67 458 mdecorde
68 458 mdecorde
// ---- Corpus settings read from the import parameters -----------------------
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// ---- Source layout under rootDir -------------------------------------------
File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

// ---- Output (binary corpus) directory --------------------------------------
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);

// Check the sources BEFORE touching binDir: deleting the previous binary
// corpus and only then discovering that a source directory is missing would
// destroy existing data for an import that cannot succeed.
if (!txmSrcDir.exists()) {
        println "Error: txm src dir does not exists: "+txmSrcDir
        return;
}
if (!tigerSrcDir.exists()) {
        println "Error: tiger src dir does not exists: "+tigerSrcDir
        return;
}

// Start from an empty binary directory.
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
        println "Could not create binDir "+binDir
        return;
}
99 458 mdecorde
// ---- Copy the XML-TXM sources into <binDir>/txm/<corpusname> ---------------
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

// Check XML format, and copy each accepted file into txmDir.
List<File> srcfiles = txmSrcDir.listFiles();
for (File entry : srcfiles) {
        String entryName = entry.getName()
        // Parameter, metadata and properties files are not corpus texts.
        if (entryName.equals("import.xml") || entryName.equals("metadata.csv") || entryName.endsWith(".properties"))
                continue;
        if (ValidateXml.test(entry)) {
                FileCopy.copy(entry, new File(txmDir, entryName));
        } else {
                println "Won't process file "+entry;
        }
}

// listFiles() returns an EMPTY array (not null) for an empty directory, so
// both cases must be tested to detect that nothing was copied.
File[] copied = txmDir.listFiles()
if (copied == null || copied.length == 0) {
        println "No txm file to process"
        return;
}

// Sort so that the texts are processed in a stable order.
List<File> filelist = copied;
Collections.sort(filelist);
120 458 mdecorde
121 458 mdecorde
// ---- COMPILING step --------------------------------------------------------
// Stop here if a monitor is present and reports cancellation; otherwise
// report progress on it (when present).
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"

def indexCompiler = new compiler()
if (debug) {
        indexCompiler.setDebug()
}
indexCompiler.setLang(lang)
//indexCompiler.setSortMetadata(sortMetadata)

// A falsy result from run() aborts the whole import.
if (!indexCompiler.run(binDir, txmDir, basename, corpusname, filelist)) {
        println "import process stopped"
        return
}
134 458 mdecorde
135 458 mdecorde
// ---- EDITION step: paginate every text into HTML pages ---------------------
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"

// Recreate the HTML directory of the corpus, then the "default" edition dir.
File htmlDir = new File(binDir,"HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// Number of file names already printed on the progress line.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
        // Text name = file name without its last extension.
        String txtname = srcfile.getName();
        int dot = txtname.lastIndexOf(".");
        if (dot > 0) txtname = txtname.substring(0, dot);

        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
        Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

        // Progress output: names separated by ", ", with a line break after
        // every 5 names. The counter is incremented for EVERY file; the
        // previous code only incremented it when it was already > 0, so it
        // stayed at 0 and no separator was ever printed.
        if (second > 0) {
                print(", ")
                if (second % 5 == 0) println ""
        }
        second++
        print(srcfile.getName());

        def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // Register each produced page under a 1-based page number, together
        // with the matching entry of ed.getIdx().
        def pageFiles = ed.getPageFiles()
        for (int p = 0; p < pageFiles.size(); p++) {
                String wordid = ed.getIdx().get(p);
                params.addPage(edition, ""+(p + 1), wordid);
        }
}
164 458 mdecorde
165 458 mdecorde
// ---- FINALIZING step -------------------------------------------------------
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FINALIZING")

// Save the parameters document as import.xml inside the binary directory.
File paramFile = new File(binDir, "import.xml")
DomUtils.save(params.root.getOwnerDocument(), paramFile)

// No declaration of readyToLoad is visible in this script, so this
// assignment targets the script binding rather than a local variable.
readyToLoad = true
169 458 mdecorde
170 458 mdecorde
171 458 mdecorde
// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");

// Select the (single expected) text file of the XML-TXM source directory.
// listFiles() gives no ordering guarantee and the directory may also hold
// import.xml, metadata.csv or *.properties files, so taking element [0]
// blindly could select a non-text file. Apply the same exclusions as the
// copy step and sort for a deterministic choice.
List<File> txmCandidates = new ArrayList<File>()
for (File candidate : txmSrcDir.listFiles()) {
        String candidateName = candidate.getName()
        if (candidateName.equals("import.xml") || candidateName.equals("metadata.csv") || candidateName.endsWith(".properties"))
                continue;
        txmCandidates.add(candidate)
}
if (txmCandidates.isEmpty()) {
        println "Error: no text file found in "+txmSrcDir
        return;
}
Collections.sort(txmCandidates)
File txmSrcFile = txmCandidates.get(0); // only one text file

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
if (Toolbox.isInitialized()) {
        // get idx from CQP
        // INCOMPLETE: this path is not implemented, so the import stops here.
        println "Error: the loader is not ready to be launched from RCP"
        return;
}
PunctInjectImport injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()
197 458 mdecorde
198 458 mdecorde
// Create the tigersearch.logprop file (used to launch TIGERSearch).
// The content is a log4j properties file written in UTF-8; "\${user.home}" is
// escaped so that it is written literally instead of being interpolated.
File logprop = new File(tigerDir, "tigersearch.logprop")
logprop.withWriter("UTF-8") { sink ->
        String logConfig = """# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"""
        sink.write(logConfig)
}
214 458 mdecorde
215 458 mdecorde
// Build the TIGERSearch index of the corpus from the generated TIGER-XML file.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();
String uri = tigerXmlAll.getAbsolutePath();
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
try {
        IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath());
        XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false);
        indexing.startIndexing();
}
catch (IOException e) { System.out.println("IO: "+e.getMessage()); }
catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); }
finally {
        // The intermediate TIGER-XML file is removed whatever the outcome,
        // including when an exception other than the two above propagates.
        tigerXmlAll.delete()
}