Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / importer / srcmf / srcmfLoader.groovy @ 967

History | View | Annotate | Download (7.8 kB)

1
package org.txm.importer.srcmf
2

    
3
// srcdir
4
// tiger-xml/master.xml (and tiger-xml/header.xml)
5
// tiger-xml/*.xml
6
// xml-txm/*.xml
7

    
8
// 1 import xml-txm
9
import ims.tiger.gui.tigerregistry.TIGERRegistry;
10
import ims.tiger.index.writer.IndexBuilderErrorHandler;
11
import ims.tiger.index.writer.SimpleErrorHandler;
12
import ims.tiger.index.writer.XMLIndexing;
13
import ims.tiger.system.*;
14

    
15
import javax.xml.stream.*;
16

    
17
import java.io.BufferedWriter;
18
import java.io.File;
19
import java.io.FileOutputStream;
20
import java.io.IOException;
21
import java.io.OutputStreamWriter;
22
import java.io.PrintStream;
23
import java.net.URL;
24
import java.util.HashMap;
25
import java.util.logging.FileHandler
26
import java.io.File;
27

    
28
import org.apache.log4j.BasicConfigurator;
29
import org.txm.export.ts.*;
30
import org.txm.importer.ApplyXsl2;
31
import org.txm.utils.xml.DomUtils;
32
import org.txm.importer.ValidateXml;
33
import org.txm.importer.XPathResult;
34
import org.txm.importer.xmltxm.compiler;
35
import org.txm.importer.xml.pager;
36
import org.txm.objects.*;
37
import org.txm.utils.*
38
import org.txm.utils.io.*;
39
import org.txm.*;
40
import org.txm.importer.xmltxm.*;
41
import org.txm.metadatas.*;
42
import org.txm.utils.i18n.*;
43
import org.w3c.dom.Element
44
import org.xml.sax.SAXException;
45

    
46
import javax.xml.parsers.DocumentBuilderFactory
47

    
48
// ---- Bootstrap: resolve the import parameters --------------------------------
// `paramsBinding` and `monitor` are read from the script binding. If reading
// either of them raises an exception, the script switches to a developer
// configuration built from hard-coded paths under the user's home directory.
String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
try {
	params = paramsBinding;
	MONITOR = monitor
} catch (Exception ignored) {
	// an exception here means we are debugging outside of the normal launcher
	println "DEV MODE";
	debug = true

	File devImportFile = new File(userDir, "xml/slethgier/import.xml")
	params = new BaseParameters(devImportFile)
	params.load()

	// the Toolbox is only configured by hand when nothing initialized it before
	if (!org.txm.Toolbox.isInitialized()) {
		File treetaggerHome = new File(userDir, "treetagger")
		File treetaggerModels = new File(userDir, "treetagger/models")
		File txmUserHome = new File(userDir, "TXM")

		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, treetaggerHome);
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, treetaggerModels);
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, txmUserHome);
	}
}

// nothing can be imported without parameters
if (params == null) {
	println "no parameters. Aborting";
	return;
}
68

    
69
// ---- Read the corpus description from the import parameters ------------------
String corpusname = params.getCorpusName();
String basename = params.name;
String rootDir = params.rootDir;

// DOM element describing the corpus; the settings below are its attributes
Element corpusElem = params.corpora.get(corpusname);
String lang = corpusElem.getAttribute("lang");
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true".equals(corpusElem.getAttribute("annotate"));

// the model name is the language value itself
String model = lang

// XSL stylesheet and its parameters, as declared for this corpus
def xsltElem = params.getXsltElement(corpusElem)
String xsl = xsltElem.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);
79

    
80
// ---- Prepare the directories and copy the XML-TXM sources --------------------
// Sources are read from <rootDir>/xml-txm and <rootDir>/tiger-xml; the binary
// corpus is written to <TXM home>/corpora/<basename>.
File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename);

// start from an empty binary directory
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}
if (!txmSrcDir.exists()) {
	println "Error: txm src dir does not exists: "+txmSrcDir
	return;
}
if (!tigerSrcDir.exists()) {
	println "Error: tiger src dir does not exists: "+tigerSrcDir
	return;
}
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

// copy txm files: check the XML format, then copy the file into binDir
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {
	String fname = f.getName()
	// parameter, metadata and properties files are not texts
	if (fname.equals("import.xml") || fname.matches("metadata\\.....?") || fname.endsWith(".properties"))
		continue;
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, fname));
	} else {
		println "Won't process file "+f;
	}
}

// File.listFiles() returns an EMPTY array (not null) for an empty directory and
// null only when the directory cannot be listed; both cases mean that there is
// nothing to import, so test for null AND empty.
List<File> filelist = txmDir.listFiles();
if (filelist == null || filelist.isEmpty()) {
	println "No txm file to process"
	return;
}
Collections.sort(filelist);
121

    
122
// ---- COMPILING: run the XML-TXM compiler on the copied files -----------------
// stop here when the user canceled the import
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(25, "COMPILING")

println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"

def c = new compiler();
if (debug) {
	c.setDebug();
}
c.setLang(lang);
//c.setSortMetadata(sortMetadata)

// a falsy result from run() aborts the whole import
boolean compiled = c.run(binDir, txmDir, basename, corpusname, filelist)
if (!compiled) {
	println "import process stopped";
	return;
}
135

    
136
// ---- EDITION: paginate every text and register its pages ---------------------
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"

// recreate the HTML directory of the corpus, then the "default" edition directory
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// number of file names already printed on the console
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
	// text name = file name without its extension
	String txtname = srcfile.getName();
	int dot = txtname.lastIndexOf(".");
	if (dot > 0) txtname = txtname.substring(0, dot);

	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

	// Progress output: names are separated by ", " with a line break after every
	// 5 names. The counter is incremented unconditionally; it used to be
	// incremented only inside a test requiring it to be > 0 already, so it stayed
	// at 0 and no separator was ever printed.
	if (second > 0) {
		print(", ")
		if (second % 5 == 0) println ""
	}
	second++
	print(srcfile.getName());

	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

	// declare one page per generated page file; page names are 1-based and each
	// page is registered with the entry of ed.getIdx() at the same index
	for (int p = 0 ; p < ed.getPageFiles().size() ; p++) {
		String wordid = ed.getIdx().get(p);
		params.addPage(edition, ""+(p + 1), wordid);
	}
}
165

    
166
// ---- FINALIZING: save the updated import parameters --------------------------
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FINALIZING")

// the parameters document is written as import.xml in the binary directory
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);

// `readyToLoad` is not declared in this script, so the assignment goes to the
// script binding
readyToLoad = true;
170

    
171

    
172
// TMR Modifications start here
// ---- TIGER: build the TIGER-XML files and inject the punctuation -------------
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");

// Only one text file is expected in the XML-TXM source directory. The order of
// File.listFiles() is unspecified and the directory may also hold the auxiliary
// files that the copy step ignores, so select the text file explicitly instead
// of blindly taking element [0] (which also fails on an empty directory).
List<File> txmCandidates = new ArrayList<File>();
for (File candidate : txmSrcDir.listFiles()) {
	String candidateName = candidate.getName()
	if (candidateName.equals("import.xml") || candidateName.matches("metadata\\.....?") || candidateName.endsWith(".properties"))
		continue;
	txmCandidates.add(candidate)
}
if (txmCandidates.isEmpty()) {
	println "Error: no text file found in "+txmSrcDir
	return;
}
Collections.sort(txmCandidates); // deterministic choice if several files are present
File txmSrcFile = txmCandidates.get(0); // only one text file

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
PunctInjectImport injector;
if (Toolbox.isInitialized()) {
	// get idx from CQP
	// INCOMPLETE
	// def injector = new PunctInjectImport(tmpFile, words)
	println "Error: the loader is not ready to be launched from RCP"
	return;
} else {
	injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
}
// the result of the injection is written to tiger/TigerPnc.xml
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()
198

    
199
// Create the tigersearch.logprop file (used to launch TIGERSearch)
// The log4j configuration is written verbatim; `\$` keeps ${user.home} literal
// in the file instead of letting Groovy interpolate it.
String logpropContent = """# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"""

File logprop = new File(tigerDir, "tigersearch.logprop");
logprop.withWriter("UTF-8") { w ->
	w.write(logpropContent)
}
215

    
216
// Run TigerRegistry
// ---- Index the generated TIGER-XML file into tiger/<corpusname> --------------
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();

String uri = tigerXmlAll.getAbsolutePath();
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
String tigerBinPath = tigerBinDir.getAbsolutePath()

try {
	IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinPath);
	XMLIndexing indexing = new XMLIndexing(corpusname, uri, tigerBinPath, handler, false);
	indexing.startIndexing();
} catch (IOException e) {
	// indexing errors are reported on the console only; the script goes on
	System.out.println("IO: "+e.getMessage());
} catch (SAXException e) {
	System.out.println("SAX: "+e.getMessage());
}

// the TIGER-XML file given to the indexer is removed once indexing was attempted
tigerXmlAll.delete()