root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / scripts / importer / srcmf / srcmfLoader.groovy @ 1002
History | View | Annotate | Download (7.8 kB)
1 |
package org.txm.scripts.importer.srcmf
|
---|---|
2 |
|
3 |
// srcdir
|
4 |
// txml/master.xml
|
5 |
// txml/*.xml
|
6 |
// txm/*.xml
|
7 |
|
8 |
// 1 import xml-txm
|
9 |
import ims.tiger.gui.tigerregistry.TIGERRegistry; |
10 |
import ims.tiger.index.writer.IndexBuilderErrorHandler; |
11 |
import ims.tiger.index.writer.SimpleErrorHandler; |
12 |
import ims.tiger.index.writer.XMLIndexing; |
13 |
import ims.tiger.system.*; |
14 |
|
15 |
import javax.xml.stream.*; |
16 |
|
17 |
import java.io.BufferedWriter; |
18 |
import java.io.File; |
19 |
import java.io.FileOutputStream; |
20 |
import java.io.IOException; |
21 |
import java.io.OutputStreamWriter; |
22 |
import java.io.PrintStream; |
23 |
import java.net.URL; |
24 |
import java.util.HashMap; |
25 |
import java.util.logging.FileHandler |
26 |
import java.io.File; |
27 |
|
28 |
import org.apache.log4j.BasicConfigurator; |
29 |
import org.txm.export.ts.*; |
30 |
import org.txm.importer.ApplyXsl2; |
31 |
import org.txm.utils.xml.DomUtils; |
32 |
import org.txm.importer.ValidateXml; |
33 |
import org.txm.importer.XPathResult; |
34 |
import org.txm.importer.xmltxm.compiler; |
35 |
import org.txm.importer.xml.pager; |
36 |
import org.txm.objects.*; |
37 |
import org.txm.utils.* |
38 |
import org.txm.utils.io.*; |
39 |
import org.txm.*; |
40 |
import org.txm.importer.xmltxm.*; |
41 |
import org.txm.metadatas.*; |
42 |
import org.txm.utils.i18n.*; |
43 |
import org.w3c.dom.Element |
44 |
import org.xml.sax.SAXException; |
45 |
|
46 |
import javax.xml.parsers.DocumentBuilderFactory |
47 |
|
48 |
// Home directory of the current user; every development-mode default below is derived from it.
String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
try {
	// Neither name is declared in this script, so both are looked up in the script binding;
	// a missing one raises an exception, which switches the script to development mode.
	params = paramsBinding;
	MONITOR = monitor
} catch (Exception bindingFailure) {
	// exception means we debug
	println "DEV MODE";
	debug = true
	params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
	params.load()
	if (!org.txm.Toolbox.isInitialized()) {
		// Development defaults, applied in declaration order.
		def devDefaults = [
			(Toolbox.INSTALL_DIR)            : new File("/usr/lib/TXM"),
			(Toolbox.TREETAGGER_INSTALL_PATH): new File(userDir, "treetagger"),
			(Toolbox.TREETAGGER_MODELS_PATH) : new File(userDir, "treetagger/models"),
			(Toolbox.METADATA_ENCODING)      : "UTF-8",
			(Toolbox.METADATA_COLSEPARATOR)  : ",",
			(Toolbox.METADATA_TXTSEPARATOR)  : "\"",
			(Toolbox.USER_TXM_HOME)          : new File(userDir, "TXM")
		]
		devDefaults.each { key, value -> Toolbox.setParam(key, value) }
	}
}
// Nothing can be imported without parameters.
if (params == null) {
	println "no parameters. Aborting";
	return;
}
68 |
|
69 |
// Identity of the corpus being imported and the root of its source tree.
String corpusname = params.getCorpusName();
String basename = params.name;
String rootDir = params.rootDir;

// DOM element carrying this corpus' import settings as attributes.
Element corpusElem = params.corpora.get(corpusname);
String lang = corpusElem.getAttribute("lang");
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));
// The model name is simply the corpus language.
String model = lang

// XSL stylesheet and its parameters, as declared for this corpus.
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);
|
79 |
|
80 |
// Source layout under rootDir: XML-TXM texts, TIGER-XML files and the TIGER header.
File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

// Destination of the binary corpus inside the TXM home directory.
File binDir = new File(Toolbox.getTxmHomePath(),"corpora/"+basename);

// Validate the sources BEFORE touching binDir: deleting the previous binary corpus
// first would destroy it even when the import cannot proceed.
if (!txmSrcDir.exists()) {
	println "Error: txm src dir does not exists: "+txmSrcDir
	return;
}
if (!tigerSrcDir.exists()) {
	println "Error: tiger src dir does not exists: "+tigerSrcDir
	return;
}

// Start from an empty binary directory.
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}
100 |
// Recreate the per-corpus "txm" directory that receives the validated source copies.
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

// copy txm files
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
	String srcName = f.getName();
	// The parameter file, metadata files and .properties files are not texts: skip them.
	if (srcName.equals("import.xml") || srcName.matches("metadata\\.....?") || srcName.endsWith(".properties"))
		continue;
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, srcName));
	} else {
		println "Won't process file "+f;
	}
}

// File.listFiles() yields an EMPTY array (not null) for an existing empty directory,
// so both cases must be tested to detect that nothing was copied.
File[] copiedFiles = txmDir.listFiles();
if (copiedFiles == null || copiedFiles.length == 0) {
	println "No txm file to process"
	return;
}
118 |
|
119 |
// Files copied into txmDir, in natural File order; this list drives compilation and pagination.
List<File> filelist = txmDir.listFiles();
Collections.sort(filelist);

// Honour a cancellation request before starting the next step (MONITOR may be null).
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"

def txmCompiler = new compiler();
if (debug) txmCompiler.setDebug();
txmCompiler.setLang(lang);
//c.setSortMetadata(sortMetadata)
boolean compiled = txmCompiler.run(binDir, txmDir, basename, corpusname, filelist)
if (!compiled) {
	println "import process stopped";
	return;
}
135 |
|
136 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"

// Recreate the HTML output tree of the corpus and its "default" edition directory.
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// Number of texts already reported on the console; drives the ", " separators and
// the line break every five names. (The previous counter was only incremented
// behind a "> 0" test and therefore never left 0, so no separator was ever printed.)
int reportedTexts = 0
println "Paginating text: "
for (File srcfile : filelist) {
	// Text name = file name without its last extension.
	String txtname = srcfile.getName();
	int dotIndex = txtname.lastIndexOf(".");
	if (dotIndex > 0) txtname = txtname.substring(0, dotIndex);

	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

	if (reportedTexts > 0) {
		print(", ")
		if (reportedTexts % 5 == 0) println ""
	}
	print(srcfile.getName());
	reportedTexts++

	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

	// Register each generated page under a 1-based page number, with the entry of
	// ed.getIdx() at the same position (stored as the page's word id).
	for (int page = 0 ; page < ed.getPageFiles().size() ; page++) {
		String wordid = ed.getIdx().get(page);
		params.addPage(edition, ""+(page + 1), wordid);
	}
}
165 |
|
166 |
// Honour a cancellation request before the final step (MONITOR may be null).
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FINALIZING")

// Persist the import parameters DOM as import.xml inside the binary directory.
File paramFile = new File(binDir, "import.xml");
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile);

// Not declared in this script, so the assignment is stored in the script binding.
readyToLoad = true;
|
170 |
|
171 |
|
172 |
// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");

// Select the single TXM text file. The source directory may also hold import.xml,
// metadata files and .properties files (the copy step skips exactly those), and
// File.listFiles() guarantees no order, so filter and sort instead of taking entry 0.
List<File> txmSrcCandidates = new ArrayList<File>();
for (File candidate : txmSrcDir.listFiles()) {
	String candidateName = candidate.getName();
	if (candidateName.equals("import.xml") || candidateName.matches("metadata\\.....?") || candidateName.endsWith(".properties"))
		continue;
	txmSrcCandidates.add(candidate);
}
Collections.sort(txmSrcCandidates);
if (txmSrcCandidates.isEmpty()) {
	println "Error: no TXM source file found in "+txmSrcDir
	return;
}
File txmSrcFile = txmSrcCandidates.get(0); // only one text file

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
// NOTE(review): the result is used below as a File (getAbsolutePath(), delete()).
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
if (Toolbox.isInitialized()) {
	// get idx from CQP
	// INCOMPLETE
	// def injector = new PunctInjectImport(tmpFile, words)
	println "Error: the loader is not ready to be launched from RCP"
	return;
}
PunctInjectImport injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()
198 |
|
199 |
// Create the tigersearch.logprop file (used to launch TIGERSearch)
File logprop = new File(tigerDir, "tigersearch.logprop");

// log4j configuration text; "\$" keeps ${user.home} literal so it is written as-is.
String logConfiguration = """# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"""

// Write the configuration as UTF-8; the writer is closed by withWriter.
logprop.withWriter("UTF-8") { it.write(logConfiguration) }
215 |
|
216 |
// Build the TIGERSearch index of the corpus from the generated TIGER-XML file.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();
String uri = tigerXmlAll.getAbsolutePath();

// The index is written to tiger/<corpusname>.
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
String indexPath = tigerBinDir.getAbsolutePath()
try {
	IndexBuilderErrorHandler errorHandler = new SimpleErrorHandler(indexPath);
	new XMLIndexing(corpusname, uri, indexPath, errorHandler, false).startIndexing();
} catch (IOException ioFailure) {
	// Indexing failures are only reported; the script carries on.
	System.out.println("IO: "+ioFailure.getMessage());
} catch (SAXException saxFailure) {
	System.out.println("SAX: "+saxFailure.getMessage());
}

// The intermediate TIGER-XML file is removed once indexing has been attempted.
tigerXmlAll.delete()