Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / importer / srcmf / srcmfLoader.groovy @ 966

History | View | Annotate | Download (7.8 kB)

1 911 mdecorde
package org.txm.importer.srcmf
2 911 mdecorde
3 911 mdecorde
// srcdir
4 911 mdecorde
// txml/master.xml
5 911 mdecorde
// txml/*.xml
6 911 mdecorde
// txm/*.xml
7 911 mdecorde
8 911 mdecorde
// 1 import xml-txm
9 911 mdecorde
import ims.tiger.gui.tigerregistry.TIGERRegistry;
10 911 mdecorde
import ims.tiger.index.writer.IndexBuilderErrorHandler;
11 911 mdecorde
import ims.tiger.index.writer.SimpleErrorHandler;
12 911 mdecorde
import ims.tiger.index.writer.XMLIndexing;
13 911 mdecorde
import ims.tiger.system.*;
14 911 mdecorde
15 911 mdecorde
import javax.xml.stream.*;
16 911 mdecorde
17 911 mdecorde
import java.io.BufferedWriter;
18 911 mdecorde
import java.io.File;
19 911 mdecorde
import java.io.FileOutputStream;
20 911 mdecorde
import java.io.IOException;
21 911 mdecorde
import java.io.OutputStreamWriter;
22 911 mdecorde
import java.io.PrintStream;
23 911 mdecorde
import java.net.URL;
24 911 mdecorde
import java.util.HashMap;
25 911 mdecorde
import java.util.logging.FileHandler
26 911 mdecorde
import java.io.File;
27 911 mdecorde
28 911 mdecorde
import org.apache.log4j.BasicConfigurator;
29 911 mdecorde
import org.txm.export.ts.*;
30 911 mdecorde
import org.txm.importer.ApplyXsl2;
31 911 mdecorde
import org.txm.utils.xml.DomUtils;
32 911 mdecorde
import org.txm.importer.ValidateXml;
33 911 mdecorde
import org.txm.importer.XPathResult;
34 911 mdecorde
import org.txm.importer.xmltxm.compiler;
35 911 mdecorde
import org.txm.importer.xml.pager;
36 911 mdecorde
import org.txm.objects.*;
37 927 mdecorde
import org.txm.utils.*
38 928 mdecorde
import org.txm.utils.io.*;
39 911 mdecorde
import org.txm.*;
40 927 mdecorde
import org.txm.importer.xmltxm.*;
41 911 mdecorde
import org.txm.metadatas.*;
42 911 mdecorde
import org.txm.utils.i18n.*;
43 911 mdecorde
import org.w3c.dom.Element
44 911 mdecorde
import org.xml.sax.SAXException;
45 911 mdecorde
46 911 mdecorde
import javax.xml.parsers.DocumentBuilderFactory
47 911 mdecorde
48 911 mdecorde
// Resolve the import parameters and the progress monitor.
// Both are read from the script binding (`paramsBinding`, `monitor`); any
// exception raised while reading them switches the script to a standalone
// developer configuration.
String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
try {
    params = paramsBinding
    MONITOR = monitor
} catch (Exception bindingLookupFailed) {
    // exception means we debug
    println "DEV MODE";
    debug = true

    File devImportFile = new File(userDir, "xml/slethgier/import.xml")
    params = new BaseParameters(devImportFile)
    params.load()

    // Only configure the Toolbox when nothing else has initialized it.
    if (!org.txm.Toolbox.isInitialized()) {
        File installDir = new File("/usr/lib/TXM")
        File treeTaggerDir = new File(userDir, "treetagger")
        File treeTaggerModelsDir = new File(userDir, "treetagger/models")
        File txmHomeDir = new File(System.getProperty("user.home"), "TXM")

        Toolbox.setParam(Toolbox.INSTALL_DIR, installDir);
        Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, treeTaggerDir);
        Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, treeTaggerModelsDir);
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, txmHomeDir);
    }
}
if (params == null) {
    println "no parameters. Aborting"
    return
}
68 911 mdecorde
69 911 mdecorde
// Corpus-level settings read from the import parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;

// Attributes carried by the corpus element.
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// XSL stylesheet settings attached to the corpus element.
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Source layout under rootDir:
//   xml-txm/              XML-TXM text files
//   tiger-xml/            TIGER-XML files
//   tiger-xml/header.xml  TIGER-XML header
File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");
83 911 mdecorde
84 911 mdecorde
// Prepare the binary corpus directory: <TXM home>/corpora/<basename>.
File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename);

// Validate the source directories BEFORE wiping binDir: aborting on a missing
// source must not destroy a previously built binary corpus.
if (!txmSrcDir.exists()) {
    println "Error: txm src dir does not exists: "+txmSrcDir
    return;
}
if (!tigerSrcDir.exists()) {
    println "Error: tiger src dir does not exists: "+tigerSrcDir
    return;
}

// Start from an empty binary directory.
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return;
}

// Directory receiving the validated XML-TXM files of this corpus.
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
103 911 mdecorde
// copy txm files
// Each entry of txmSrcDir is checked with ValidateXml and, when valid, copied
// into txmDir. Import parameters, metadata files and property files are skipped.
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
    String name = f.getName()
    // The dot must be escaped as "\\." : a lone "\." is not a legal escape in a
    // double-quoted Groovy string. The pattern matches "metadata." followed by
    // three or four characters.
    if (name.equals("import.xml") || name.matches("metadata\\.....?") || name.endsWith(".properties"))
        continue;
    if (ValidateXml.test(f)) {
        FileCopy.copy(f, new File(txmDir, name));
    } else {
        println "Won't process file "+f;
    }
}

// listFiles() yields an EMPTY result (not null) for an existing directory with
// no entries, so emptiness has to be tested as well as null.
List<File> filelist = txmDir.listFiles();
if (filelist == null || filelist.isEmpty()) {
    println "No txm file to process"
    return;
}
Collections.sort(filelist);
121 911 mdecorde
122 911 mdecorde
// COMPILING phase: run the XML-TXM compiler over the copied files.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"

def txmCompiler = new compiler();
if (debug) {
    txmCompiler.setDebug();
}
txmCompiler.setLang(lang);
//txmCompiler.setSortMetadata(sortMetadata)

// A falsy result from run() stops the whole import.
boolean compiled = txmCompiler.run(binDir, txmDir, basename, corpusname, filelist)
if (!compiled) {
    println "import process stopped";
    return;
}
135 911 mdecorde
136 911 mdecorde
// EDITION phase: paginate every text into <binDir>/HTML/<corpus>/default and
// register the texts, editions and pages in the import parameters.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// Number of file names already printed; drives the ", " separator and the
// line wrap of the progress output.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
    // Text name = file name without its last extension.
    String txtname = srcfile.getName();
    int i = txtname.lastIndexOf(".");
    if (i > 0) txtname = txtname.substring(0, i);

    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
    Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

    // Progress output: names separated by ", ", five names per line.
    // The counter is incremented unconditionally; incrementing it only inside
    // the `second > 0` test left it stuck at 0 forever.
    if (second > 0) {
        print(", ")
        if (second % 5 == 0) println ""
    }
    second++
    print(srcfile.getName());

    def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
    Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

    // Register one page per generated page file; pages are numbered from 1 and
    // page p+1 is associated with the p-th entry of the pager's index.
    int pageCount = ed.getPageFiles().size()
    for (int p = 0 ; p < pageCount ; p++) {
        String wordid = ed.getIdx().get(p);
        params.addPage(edition, ""+(p + 1), wordid);
    }
}
165 911 mdecorde
166 911 mdecorde
// FINALIZING phase: write the updated import parameters next to the binary corpus.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) { return MONITOR.done(); }
    MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml");
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile);
// `readyToLoad` is not declared in this script, so the assignment goes to the
// script binding. NOTE(review): presumably read by the caller - confirm.
readyToLoad = true;
170 911 mdecorde
171 911 mdecorde
172 911 mdecorde
// TMR Modifications start here
// Build the TIGER-XML of the corpus, then inject punctuation into it.
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");

// only one text file
// The text is taken from the validated, sorted `filelist` rather than from
// txmSrcDir.listFiles()[0]: a raw directory listing has no guaranteed order and
// may start with import.xml, a metadata file or a .properties file, which the
// copy step deliberately skips. Files in `filelist` keep their source names.
if (filelist == null || filelist.isEmpty()) {
    println "No txm file to process"
    return;
}
File txmSrcFile = new File(txmSrcDir, filelist.get(0).getName());

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
if (Toolbox.isInitialized()) {
    // get idx from CQP
    // INCOMPLETE
    // def injector = new PunctInjectImport(tmpFile, words)
    println "Error: the loader is not ready to be launched from RCP"
    return;
}
// Toolbox not initialized: inject from the XML-TXM source file.
PunctInjectImport injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()
198 911 mdecorde
199 911 mdecorde
// Create the tigersearch.logprop file (used to launch TIGERSearch)
// The text is written as UTF-8; `\$` keeps `${user.home}` literal in the file
// instead of letting Groovy interpolate it.
File logprop = new File(tigerDir, "tigersearch.logprop");
String logpropText = """# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"""
logprop.withWriter("UTF-8") { out ->
    out.write(logpropText)
}
215 911 mdecorde
216 911 mdecorde
// Run TigerRegistry
// Index the generated TIGER-XML into <tigerDir>/<corpusname>, then delete the
// intermediate TIGER-XML file. Indexing errors are reported on standard output
// and do not stop the script.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();
String uri = tigerXmlAll.getAbsolutePath();
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
try {
    String tigerBinPath = tigerBinDir.getAbsolutePath()
    IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinPath);
    XMLIndexing indexing = new XMLIndexing(corpusname, uri, tigerBinPath, handler, false);
    indexing.startIndexing();
} catch (IOException ioFailure) {
    System.out.println("IO: "+ioFailure.getMessage());
} catch (SAXException saxFailure) {
    System.out.println("SAX: "+saxFailure.getMessage());
}

tigerXmlAll.delete()