Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / src / org / txm / importer / srcmf / tigersearchLoader.groovy @ 458

History | View | Annotate | Download (7.8 kB)

1
package org.txm.importer.srcmf
2

    
3
// srcdir
4
// txml/master.xml
5
// txml/*.xml
6
// txm/*.xml
7

    
8
// 1 import xml-txm
9
import ims.tiger.gui.tigerregistry.TIGERRegistry;
10
import ims.tiger.index.writer.IndexBuilderErrorHandler;
11
import ims.tiger.index.writer.SimpleErrorHandler;
12
import ims.tiger.index.writer.XMLIndexing;
13
import ims.tiger.system.*;
14

    
15
import javax.xml.stream.*;
16

    
17
import java.io.BufferedWriter;
18
import java.io.File;
19
import java.io.FileOutputStream;
20
import java.io.IOException;
21
import java.io.OutputStreamWriter;
22
import java.io.PrintStream;
23
import java.net.URL;
24
import java.util.HashMap;
25
import java.util.logging.FileHandler
26
import java.io.File;
27

    
28
import org.apache.log4j.BasicConfigurator;
29
import org.txm.export.ts.*;
30
import org.txm.importer.ApplyXsl2;
31
import org.txm.utils.xml.DomUtils;
32
import org.txm.importer.ValidateXml;
33
import org.txm.importer.XPathResult;
34
import org.txm.importer.xmltxm.compiler;
35
import org.txm.importer.xml.pager;
36
import org.txm.objects.*;
37
import org.txm.utils.*;
38
import org.txm.*;
39
import org.txm.scripts.teitxm.*;
40
import org.txm.metadatas.*;
41
import org.txm.utils.i18n.*;
42
import org.w3c.dom.Element
43
import org.xml.sax.SAXException;
44

    
45
import javax.xml.parsers.DocumentBuilderFactory
46

    
47
// ---- Bootstrap ---------------------------------------------------------
// The script reads `paramsBinding` and `monitor` from its binding. If either
// lookup throws, it switches to a hard-coded developer setup ("DEV MODE").
String userDir = System.getProperty("user.home")
boolean debug = false
def MONITOR
BaseParameters params
try {
	params = paramsBinding
	MONITOR = monitor
} catch (Exception ignored) {
	// Any exception here means the script is being debugged outside its usual launcher.
	println "DEV MODE"
	debug = true

	File devImportFile = new File(userDir, "xml/slethgier/import.xml")
	params = new BaseParameters(devImportFile)
	params.load()

	// Give the Toolbox the settings this script relies on when nobody initialized it.
	if (!Toolbox.isInitialized()) {
		File treeTaggerHome = new File(userDir, "treetagger")
		File treeTaggerModels = new File(userDir, "treetagger/models")
		File txmHome = new File(userDir, "TXM")

		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"))
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, treeTaggerHome)
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, treeTaggerModels)
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8")
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",")
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"")
		Toolbox.setParam(Toolbox.USER_TXM_HOME, txmHome)
	}
}

// Nothing can be imported without parameters.
if (params == null) {
	println "no parameters. Aborting"
	return
}
67

    
68
// ---- Corpus settings read from the import parameters --------------------
String corpusname = params.getCorpusName()
String basename = params.name
String rootDir = params.rootDir
Element corpusElem = params.corpora.get(corpusname)

// Attributes carried by the corpus element
String lang = corpusElem.getAttribute("lang")
String model = lang
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"))

// XSLT settings attached to the corpus element
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)

// ---- Source locations below the import root ------------------------------
File txmSrcDir = new File(rootDir, "xml-txm")
File tigerSrcDir = new File(rootDir, "tiger-xml")
File headerFile = new File(tigerSrcDir, "header.xml")
82

    
83
// ---- Prepare the binary corpus directory and copy the XML-TXM sources ----
// binDir is <USER_TXM_HOME>/corpora/<basename>; it is wiped and recreated on every run.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);

binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}
// Both source directories are required by the rest of the script.
if (!txmSrcDir.exists()) {
	println "Error: txm src dir does not exists: "+txmSrcDir
	return;
}
if (!tigerSrcDir.exists()) {
	println "Error: tiger src dir does not exists: "+tigerSrcDir
	return;
}
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// copy txm files
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
	String fname = f.getName()
	// import parameters, metadata and property files are not corpus texts
	if (fname.equals("import.xml") || fname.equals("metadata.csv") || fname.endsWith(".properties"))
		continue;
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, fname));
	} else {
		println "Won't process file "+f;
	}
}
// File.listFiles() yields an EMPTY array (not null) for an empty directory, and
// null only when the path is not a readable directory: both mean "nothing to do".
File[] copiedFiles = txmDir.listFiles()
if (copiedFiles == null || copiedFiles.length == 0) {
	println "No txm file to process"
	return;
}
117

    
118
// Files previously copied into txmDir, sorted in File natural order
List<File> filelist = txmDir.listFiles()
Collections.sort(filelist)

// ---- COMPILING ------------------------------------------------------------
// Stop (and close the monitor) if the import was cancelled.
if (MONITOR?.isCanceled()) {
	return MONITOR.done()
}
MONITOR?.worked(25, "COMPILING")

println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"

def c = new compiler()
if (debug) {
	c.setDebug()
}
c.setLang(lang)
//c.setSortMetadata(sortMetadata)

// A falsy result from the compiler aborts the whole import.
boolean compiled = c.run(binDir, txmDir, basename, corpusname, filelist)
if (!compiled) {
	println "import process stopped"
	return
}
134

    
135
// ---- EDITION ----------------------------------------------------------------
// Builds the "default" HTML edition: one pager run per text file, then one
// page entry per produced page is registered in the import parameters.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"
File htmlDir = new File(binDir,"HTML/$corpusname")
htmlDir.deleteDir();
htmlDir.mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// Number of file names already printed on the progress line. The previous code
// only incremented it when it was already > 0, so it stayed at 0 forever and
// neither the ", " separator nor the line wrap was ever emitted.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
	// text name = file name without its last extension
	String txtname = srcfile.getName();
	int dot = txtname.lastIndexOf(".");
	if (dot > 0) txtname = txtname.substring(0, dot);

	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

	// progress line: names separated by ", ", wrapped after every 5 names
	if (second > 0) {
		print(", ")
		if (second % 5 == 0) println ""
	}
	second++
	print(srcfile.getName());

	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
	// page numbers are 1-based; getIdx() is read at the same position as the page file
	for (int p = 0; p < ed.getPageFiles().size(); p++) {
		String wordid = ed.getIdx().get(p);
		params.addPage(edition, ""+(p + 1), wordid);
	}
}
// terminate the progress line so the next message starts on its own line
if (second > 0) println ""
164

    
165
// ---- FINALIZING: write the import parameters into the binary directory ----
if (MONITOR?.isCanceled()) {
	return MONITOR.done()
}
MONITOR?.worked(20, "FINALIZING")

File paramFile = new File(binDir, "import.xml")
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile)

// Assigned without a local declaration in this part of the script
// (presumably read back by the caller through the binding -- TODO confirm).
readyToLoad = true
169

    
170

    
171
// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");

// Only one text file is expected in txmSrcDir. File.listFiles() has no guaranteed
// order and may also return import.xml, metadata.csv or *.properties files (the
// copy step skips exactly those names), so taking entry [0] blindly could hand a
// non-text file to the importer, or fail on an empty/unreadable directory.
File[] txmSrcEntries = txmSrcDir.listFiles()
List<File> txmCandidates = new ArrayList<File>()
if (txmSrcEntries != null) {
	for (File entry : txmSrcEntries) {
		String entryName = entry.getName()
		boolean notAText = entryName.equals("import.xml") || entryName.equals("metadata.csv") || entryName.endsWith(".properties")
		if (entry.isFile() && !notAText) {
			txmCandidates.add(entry)
		}
	}
}
if (txmCandidates.isEmpty()) {
	println "Error: no txm source file found in "+txmSrcDir
	return;
}
// deterministic choice should several candidates be present
Collections.sort(txmCandidates)
File txmSrcFile = txmCandidates.get(0);

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)
181

    
182
// Create TigerXml-POS-PNC
if (MONITOR?.isCanceled()) {
	return MONITOR.done()
}
println "create TIGER XML POS PNC"

// With an initialized Toolbox this step is not implemented yet: report and stop.
if (Toolbox.isInitialized()) {
	// get idx from CQP
	// INCOMPLETE
	// def injector = new PunctInjectImport(tmpFile, words)
	println "Error: the loader is not ready to be launched from RCP"
	return
}

// Otherwise the injector is built from the SRCMF import result and the txm
// source file, and writes its result to tiger/TigerPnc.xml.
PunctInjectImport injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()
197

    
198
// Create the tigersearch.logprop file (used to launch TIGERSearch)
// The log4j configuration is built first, then written in one call as UTF-8.
// "\${user.home}" is escaped so the placeholder is written literally to the file.
String logpropContent = """# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"""
File logprop = new File(tigerDir, "tigersearch.logprop")
logprop.write(logpropContent, "UTF-8")
214

    
215
// Run TigerRegistry
// Indexes the TIGER-XML file produced by the SRCMF import into tiger/<corpusname>,
// then deletes that XML file.
if (MONITOR?.isCanceled()) {
	return MONITOR.done()
}
BasicConfigurator.configure()

String uri = tigerXmlAll.getAbsolutePath()
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
String tigerBinPath = tigerBinDir.getAbsolutePath()

try {
	IndexBuilderErrorHandler errorHandler = new SimpleErrorHandler(tigerBinPath)
	XMLIndexing indexer = new XMLIndexing(corpusname, uri, tigerBinPath, errorHandler, false)
	indexer.startIndexing()
} catch (IOException ioProblem) {
	// only the message is reported; the script carries on
	System.out.println("IO: "+ioProblem.getMessage())
} catch (SAXException saxProblem) {
	System.out.println("SAX: "+saxProblem.getMessage())
}

tigerXmlAll.delete()