Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / wtc / wtcLoader.groovy @ 479

History | View | Annotate | Download (5.1 kB)

1
package org.txm.importer.wtc
2

    
3
import org.txm.importer.cwb.BuildCwbEncodeArgs;
4
import java.io.File;
5
import java.util.ArrayList;
6

    
7
import org.txm.objects.*;
8
import org.txm.scripts.teitxm.*;
9
import org.txm.*;
10
import org.txm.utils.i18n.*;
11
import org.txm.metadatas.*;
12
import org.txm.utils.io.FileCopy;
13
import org.w3c.dom.Element
14
import org.txm.utils.xml.DomUtils;
15

    
16
// Home directory of the current user; base of every development-mode default below.
String userDir = System.getProperty("user.home");

def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
	// 'paramsBinding' and 'monitor' are not declared in this script: they are expected
	// to be supplied by the caller. Reading a missing one throws and lands in the catch.
	params = paramsBinding;
	MONITOR = monitor;
} catch (Exception ignored) {
	// An exception here means the script is being run on its own for debugging.
	println "DEV MODE";
	debug = true
	params = new BaseParameters(new File(userDir, "xml/wtc/import.xml"))
	params.load()

	boolean toolboxReady = org.txm.Toolbox.isInitialized();
	if (!toolboxReady) {
		// Linux defaults. Windows equivalents noted in the original source:
		//   INSTALL_DIR             -> C:\Program Files\TXM
		//   TREETAGGER_INSTALL_PATH -> C:\Program Files\treetagger
		//   TREETAGGER_MODELS_PATH  -> C:\Program Files\treetagger\models
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));
	}
}

// Nothing can be imported without parameters.
if (params == null) {
	println "no parameters. Aborting";
	return;
}
40

    
41
// Read the import settings of the corpus described by the parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
// Every setting below is read from this element: abort cleanly, like the other
// failure paths of this script, instead of failing on a null dereference.
if (corpusElem == null) {
	println "No corpus element found for '"+corpusname+"' in the import parameters. Aborting";
	return;
}
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
// Safe navigation: this value is not used by the rest of the script, so a
// missing XSLT element should not stop the import.
String xsl = params.getXsltElement(corpusElem)?.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);
// Edition settings are read from the "default" entry.
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
53

    
54
// Directory holding the files to import.
File srcDir = new File(rootDir);

// The binary directory is wiped below. With a missing or blank base name the path
// would resolve to the shared "corpora" folder itself (or to "corpora/null"), so
// refuse to go further rather than delete unrelated data.
if (basename == null || basename.trim().length() == 0) {
	println "Invalid corpus base name '"+basename+"'. Aborting"
	return;
}

// Start from an empty binary directory for this corpus.
File binDir = new File(Toolbox.getParam(Toolbox.USER_TXM_HOME),"corpora/"+basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}

// Placeholder directory: it will contain no XML-TXM files.
File txmDir = new File(binDir, "txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();

// Will contain the edition files.
File htmlDir = new File(binDir, "HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();
72

    
73
// Pick the input files from the source directory: the first ".wtc" file met and
// the first file whose name has no dot at all (taken to be the registry file).
File wtcFile;
File registryFile;
println "Looking for files to process..."
for (File f : srcDir.listFiles()) {
	// Hidden entries and sub-directories are never candidates.
	if (f.isHidden() || f.isDirectory()) continue;

	String fileName = f.getName();
	boolean isWtc = fileName.endsWith(".wtc");
	// Any name with an extension (".xml" included) is neither WTC nor registry.
	boolean isRegistry = !isWtc && !fileName.contains(".");

	if (isWtc && wtcFile == null) {
		wtcFile = f;
		println "Using WTC file : $wtcFile"
	} else if (isRegistry && registryFile == null) {
		registryFile = f;
		println "Using registry file : $f"
	}
}

// The WTC file is mandatory; a missing registry file is not checked here.
if (wtcFile == null) {
	println "No WTC file in source directory: "+srcDir;
	return false;
}
99

    
100
// Compilation step. Per the original note, this gets the pAttributes and runs
// cwb-encode + cwb-makeall; the work itself is delegated to the 'compiler' class.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"

def c = new compiler(wtcFile, registryFile)
c.setDebug(debug)
boolean compiled = c.run(binDir, corpusname)
if (!compiled) {
	println "Import stopped"
	return;
}

// Give the user a chance to cancel before the next step.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
112

    
113
// Start from an empty HTML/<corpus> directory, whether or not an edition is built.
File editionRoot = new File(binDir, "HTML/$corpusname");
editionRoot.deleteDir();
editionRoot.mkdirs();

if (build_edition) {
	println "-- EDITION - Building edition"
	if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition")

	// Pages of the "default" edition are written here.
	File outdir = new File(binDir,"/HTML/$corpusname/default/");
	outdir.mkdirs();

	// Progress mark; the line is terminated by the println after this block.
	print "."

	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

	File binWtcFile = new File(binDir,"wtc/"+corpusname+".wtc");
	def ed = new pager(binWtcFile, outdir, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, registryFile !=null, c.pAttributesList);
	def allIndexes = ed.getIdx();
	def allPages = ed.getPageFiles();

	// Declare each text, its "default" edition and its pages in the import parameters.
	for (String txtname : ed.getTextNames()) {
		def txtIndexes = allIndexes[txtname];
		def txtPages = allPages[txtname];

		Element text = params.addText(params.corpora.get(corpusname), txtname, wtcFile);
		Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

		// The index is declared locally so it does not leak into the script binding.
		// Pages are numbered from 1; entry i of txtIndexes goes with page i + 1.
		for (int i = 0 ; i < txtPages.size() ; i++) {
			String wordid = txtIndexes.get(i);
			params.addPage(edition, ""+(i + 1), wordid);
		}
	}
}
println ""
147

    
148
// Last step: save the (possibly enriched) import parameters next to the binary corpus.
if (MONITOR?.isCanceled()) { return MONITOR.done(); }
MONITOR?.worked(20, "FINALIZING")

File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);

// 'readyToLoad' is not declared in this script; presumably the caller reads it
// back to know the import reached its end - verify against the caller.
readyToLoad = true;