Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / scripts / importer / srcmf / srcmfLoader.groovy @ 2080

History | View | Annotate | Download (7.4 kB)

1
package org.txm.scripts.importer.srcmf
2

    
3
// srcdir
4
// txml/master.xml
5
// txml/*.xml
6
// txm/*.xml
7

    
8
// 1 import xml-txm
9
import ims.tiger.gui.tigerregistry.TIGERRegistry;
10
import ims.tiger.index.writer.IndexBuilderErrorHandler;
11
import ims.tiger.index.writer.SimpleErrorHandler;
12
import ims.tiger.index.writer.XMLIndexing;
13
import ims.tiger.system.*;
14

    
15
import javax.xml.stream.*;
16

    
17
import java.io.BufferedWriter;
18
import java.io.File;
19
import java.io.FileOutputStream;
20
import java.io.IOException;
21
import java.io.OutputStreamWriter;
22
import java.io.PrintStream;
23
import java.net.URL;
24
import java.util.HashMap;
25
import java.util.logging.FileHandler
26
import java.io.File;
27

    
28
import org.apache.log4j.BasicConfigurator;
29
import org.txm.export.ts.*;
30
import org.txm.importer.ApplyXsl2;
31
import org.txm.utils.xml.DomUtils;
32
import org.txm.importer.ValidateXml;
33
import org.txm.importer.XPathResult;
34
import org.txm.importer.xmltxm.compiler;
35
import org.txm.importer.xml.pager;
36
import org.txm.objects.*;
37
import org.txm.utils.*
38
import org.txm.utils.io.*;
39
import org.txm.*;
40
import org.txm.importer.xmltxm.*;
41
import org.txm.metadatas.*;
42
import org.txm.utils.i18n.*;
43
import org.w3c.dom.Element
44
import org.xml.sax.SAXException;
45

    
46
import javax.xml.parsers.DocumentBuilderFactory
47

    
48
// Resolve the import parameters. The script reads `paramsBinding` and
// `monitor`, which are not declared in this file; if either lookup fails,
// it falls back to a developer configuration located in the user's home
// directory and turns the debug flag on.
String userDir = System.getProperty("user.home")
boolean debug = false
def MONITOR
BaseParameters params
try {
	params = paramsBinding
	MONITOR = monitor
} catch (Exception ignored) {
	// an exception here means the script is being debugged
	println "DEV MODE"
	debug = true
	params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
	params.load()
}
if (params == null) {
	println "no parameters. Aborting"
	return
}
59

    
60
// Corpus-level settings: names and root directory come from the import
// parameters, the remaining values from the attributes of the corpus
// element registered under the corpus name.
String corpusname = params.getCorpusName()
String basename = params.name
String rootDir = params.rootDir
Element corpusElem = params.corpora.get(corpusname)
String lang = corpusElem.getAttribute("lang")
String encoding = corpusElem.getAttribute("encoding")
boolean annotate = corpusElem.getAttribute("annotate") == "true"
// the model is given the same value as the corpus language
String model = lang
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem)
70

    
71
// Source directories live under the import root directory; the binary
// corpus directory lives under the TXM home directory.
File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

File binDir = new File(Toolbox.getTxmHomePath(),"corpora/"+basename);

// recreate the binary directory from scratch
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}
if (!txmSrcDir.exists()) {
	println "Error: txm src dir does not exists: "+txmSrcDir
	return;
}
if (!tigerSrcDir.exists()) {
	println "Error: tiger src dir does not exists: "+tigerSrcDir
	return;
}
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

// copy txm files: check XML format, and copy each valid file into txmDir
List<File> srcfiles = txmSrcDir.listFiles(IOUtils.HIDDENFILE_FILTER);
for (File f : srcfiles) {
	String fileName = f.getName();
	// parameter, metadata and properties files are not corpus texts
	if (fileName.equals("import.xml") || fileName.matches("metadata\\.....?") || fileName.endsWith(".properties"))
		continue;
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, fileName));
	} else {
		println "Won't process file "+f;
	}
}

// File.listFiles returns an empty array (not null) for an empty directory,
// so both cases are tested to detect that no file was copied.
File[] txmFiles = txmDir.listFiles(IOUtils.HIDDENFILE_FILTER);
if (txmFiles == null || txmFiles.length == 0) {
	println "No txm file to process"
	return;
}

// mutable, sorted list of the files to compile and paginate
List<File> filelist = new ArrayList<File>(Arrays.asList(txmFiles));
Collections.sort(filelist);
112

    
113
// Compilation step: stop if the import was cancelled, otherwise report
// progress and run the XML-TXM compiler on the copied files.
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"
def txmCompiler = new compiler()
if (debug) {
	txmCompiler.setDebug()
}
txmCompiler.setLang(lang)
//txmCompiler.setSortMetadata(sortMetadata)
boolean compiled = txmCompiler.run(binDir, txmDir, basename, corpusname, filelist)
if (!compiled) {
	println "import process stopped"
	return
}
126

    
127
// Edition step: paginate every text into the "default" HTML edition and
// register the text, edition and page entries in the import parameters.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"
File htmlDir = new File(binDir,"HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// number of file names already printed in the progress listing
int printedTexts = 0
println "Paginating text: "
for (File srcfile : filelist) {
	// the text name is the file name without its extension
	String txtname = srcfile.getName();
	int dot = txtname.lastIndexOf(".");
	if (dot > 0) txtname = txtname.substring(0, dot);
	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

	// progress listing: names separated by ", " with a line break after
	// every 5 names; the counter is incremented for every text
	if (printedTexts > 0) {
		print(", ")
		if (printedTexts % 5 == 0) println ""
	}
	printedTexts++
	print(srcfile.getName());

	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
	// pages are numbered from 1; each page is registered with the entry
	// found at the same index in the pager's idx list
	int pageCount = ed.getPageFiles().size();
	for (int page = 0 ; page < pageCount ; page++) {
		String wordid = ed.getIdx().get(page);
		params.addPage(edition, ""+(page + 1), wordid);
	}
}
156

    
157
// Finalizing step: save the updated import parameters document as
// import.xml in the binary corpus directory.
if (MONITOR?.isCanceled()) { return MONITOR.done() }
MONITOR?.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml")
def paramsDocument = params.root.getOwnerDocument()
DomUtils.save(paramsDocument, paramFile)
// readyToLoad is not declared in this script, so the assignment goes to the script binding
readyToLoad = true
161

    
162

    
163
// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");

// Only one text file is expected in the XML-TXM source directory; the first
// listed file is used. Guard against a missing or empty listing instead of
// failing on the [0] access.
File[] txmSrcFiles = txmSrcDir.listFiles(IOUtils.HIDDENFILE_FILTER);
if (txmSrcFiles == null || txmSrcFiles.length == 0) {
	println "Error: no txm source file found in "+txmSrcDir
	return;
}
File txmSrcFile = txmSrcFiles[0];

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
if (Toolbox.isInitialized()) {
	// INCOMPLETE: when the Toolbox is initialized the idx should be read
	// from CQP, e.g. new PunctInjectImport(tmpFile, words); not implemented
	println "Error: the loader is not ready to be launched from RCP"
	return;
}
PunctInjectImport injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()
189

    
190
// Create the tigersearch.logprop file (used to launch TIGERSearch)
// The dollar sign is escaped so that ${user.home} is written literally to
// the file instead of being interpolated by Groovy.
String logConfiguration = """# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"""
File logprop = new File(tigerDir, "tigersearch.logprop")
logprop.withWriter("UTF-8") { out ->
	out.write(logConfiguration)
}
206

    
207
// Run TigerRegistry
// Index the file returned by the SRCMF importer into a directory named
// after the corpus, inside the tiger directory.
if (MONITOR?.isCanceled()) { return MONITOR.done() }
BasicConfigurator.configure()
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
String tigerXmlPath = tigerXmlAll.getAbsolutePath()
String indexPath = tigerBinDir.getAbsolutePath()
try {
	IndexBuilderErrorHandler errorHandler = new SimpleErrorHandler(indexPath)
	XMLIndexing indexer = new XMLIndexing(corpusname, tigerXmlPath, indexPath, errorHandler, false)
	indexer.startIndexing()
} catch (IOException e) {
	// indexing errors are only reported; the script carries on
	System.out.println("IO: "+e.getMessage())
} catch (SAXException e) {
	System.out.println("SAX: "+e.getMessage())
}

// the importer's output file is deleted whether or not indexing succeeded
tigerXmlAll.delete()