Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / wtc / wtcLoader.groovy @ 148

History | View | Annotate | Download (4.9 kB)

1
package org.txm.importer.wtc
2

    
3
import org.txm.*
4
import org.txm.metadatas.*
5
import org.txm.objects.*
6
import org.txm.scripts.teitxm.*
7
import org.txm.utils.i18n.*
8
import org.w3c.dom.Element
9

    
10
// Home directory of the current user; base of the development-mode default paths below.
String userDir = System.getProperty("user.home");

// Progress monitor taken from the script binding; remains null in development mode.
def MONITOR;
boolean debug = org.txm.utils.logger.Log.isPrintingErrors();
BaseParameters params;
try {
    // 'paramsBinding' and 'monitor' are not declared in this script: they are
    // expected to be provided by the caller. If the lookup fails we fall back
    // to development mode.
    params = paramsBinding;
    MONITOR = monitor
} catch (Exception ignored) {
    // An exception here means the script is being run by hand for debugging.
    println "DEV MODE";
    debug = true
    params = new BaseParameters(new File(userDir, "xml/wtc/import.xml"))
    params.load()
    if (!org.txm.Toolbox.isInitialized()) {
        // Default (Linux-style) settings, applied in declaration order.
        // Windows equivalents used by the original author:
        //   INSTALL_DIR             -> C:\Program Files\TXM
        //   TREETAGGER_INSTALL_PATH -> C:\Program Files\treetagger
        //   TREETAGGER_MODELS_PATH  -> C:\Program Files\treetagger\models
        def devDefaults = [
            (Toolbox.INSTALL_DIR)             : new File("/usr/lib/TXM"),
            (Toolbox.TREETAGGER_INSTALL_PATH) : new File(userDir, "treetagger"),
            (Toolbox.TREETAGGER_MODELS_PATH)  : new File(userDir, "treetagger/models"),
            (Toolbox.METADATA_ENCODING)       : "UTF-8",
            (Toolbox.METADATA_COLSEPARATOR)   : ",",
            (Toolbox.METADATA_TXTSEPARATOR)   : "\"",
            (Toolbox.USER_TXM_HOME)           : new File(System.getProperty("user.home"), "TXM")
        ]
        devDefaults.each { key, value -> Toolbox.setParam(key, value) }
    }
}
if (params == null) {
    println "no parameters. Aborting";
    return;
}
34

    
35
// Read the import settings of the corpus from the parameter object.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
if (corpusElem == null) {
    // Without the corpus element none of the attributes below can be read;
    // abort with a message, like the other failure paths of this script,
    // instead of failing with a NullPointerException.
    println "No corpus element found for corpus '" + corpusname + "' in the import parameters. Aborting"
    return;
}
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");

// NOTE(review): model, encoding, annotate, xsl and xslParams are not referenced
// anywhere else in the visible part of this script; they are kept in case code
// outside this view relies on them.
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
// Safe navigation: a missing XSLT element must not abort an import that never uses it.
String xsl = params.getXsltElement(corpusElem)?.getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Settings of the "default" edition.
int wordsPerPage = params.getWordsPerPage("default")
boolean build_edition = params.getDoEdition("default")
47

    
48
// Directory containing the files to import.
File srcDir = new File(rootDir);

// Binary directory of the corpus inside the user's TXM home; always rebuilt from scratch.
def txmHome = Toolbox.getParam(Toolbox.USER_TXM_HOME)
File binDir = new File(txmHome, "corpora/" + basename);
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir " + binDir
    return;
}

// txmDir  : placeholder only, this import produces no XML-TXM files.
// htmlDir : will contain the edition files.
File txmDir = new File(binDir, "txm/$corpusname");
File htmlDir = new File(binDir, "HTML/$corpusname");
for (File subDir : [txmDir, htmlDir]) {
    subDir.deleteDir();
    subDir.mkdirs();
}
66

    
67
// Locate the WTC file and the registry file among the plain, visible files of
// the source directory. The first match of each kind wins.
File wtcFile;
File registryFile;
println "Looking for files to process..."
for (File candidate in srcDir.listFiles()) {
    if (candidate.isHidden() || candidate.isDirectory()) {
        continue;
    }

    String fileName = candidate.getName();
    if (fileName.endsWith(".wtc")) {
        if (wtcFile == null) {
            wtcFile = candidate;
            println "Using WTC file : $wtcFile"
        }
        continue;
    }

    // A file without any dot in its name is taken to be the registry file;
    // everything else (".xml" files included) is ignored.
    if (!fileName.contains(".") && registryFile == null) {
        registryFile = candidate;
        println "Using registry file : $registryFile"
    }
}

if (wtcFile == null) {
    println "No WTC file in source directory: " + srcDir;
    return false;
}
93

    
94
// Compile step: hand the WTC file (and the registry file, which may be null)
// to the compiler, honouring cancellation before and after.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) {
        return MONITOR.done();
    }
    MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler(wtcFile, registryFile)
c.setDebug(debug)
boolean compiled = c.run(binDir, corpusname)
if (!compiled) {
    println "Import stopped"
    return;
}

if (MONITOR != null && MONITOR.isCanceled()) {
    return MONITOR.done();
}
106

    
107
// Reset the HTML directory of the corpus, then build the "default" edition if requested.
new File(binDir, "HTML/$corpusname").deleteDir();
new File(binDir, "HTML/$corpusname").mkdirs();
if (build_edition) {

    println "-- EDITION - Building edition"
    if (MONITOR != null) MONITOR.worked(20, "EDITION - Building edition")

    File outdir = new File(binDir, "/HTML/$corpusname/default/");
    outdir.mkdirs();

    // Progress dot; the line is terminated by the println after this block.
    print "."

    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

    // The pager reads the WTC file located under binDir/wtc.
    File binWtcFile = new File(binDir, "wtc/" + corpusname + ".wtc");
    def ed = new pager(binWtcFile, outdir, NoSpaceBefore, NoSpaceAfter, wordsPerPage, basename, registryFile != null, c.pAttributesList);

    // Both results are looked up by text name below.
    def allIndexes = ed.getIdx();
    def allPages = ed.getPageFiles();
    for (String txtname : ed.getTextNames()) {
        def txtIndexes = allIndexes[txtname];
        def txtPages = allPages[txtname];

        // Register the text and its "default" HTML edition in the parameters.
        Element text = params.addText(params.corpora.get(corpusname), txtname, wtcFile);
        Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");

        // One page entry per page file; pages are numbered from 1 and the
        // i-th entry of txtIndexes is passed as the word id of page i+1.
        // The index is declared locally so it no longer leaks into the
        // script binding.
        for (int i = 0; i < txtPages.size(); i++) {
            String wordid = txtIndexes.get(i);
            params.addPage(edition, "" + (i + 1), wordid);
        }
    }
}
println ""
141

    
142
// Final step: unless cancelled, save the parameters into the binary directory.
if (MONITOR != null) {
    if (MONITOR.isCanceled()) {
        return MONITOR.done();
    }
    MONITOR.worked(20, "FINALIZING")
}
File paramFile = new File(binDir, "import.xml");
def paramsDocument = params.root.getOwnerDocument();
DomUtils.save(paramsDocument, paramFile);
// 'readyToLoad' is not declared in this script, so the assignment goes to the
// script binding. NOTE(review): presumably read by the caller to detect a
// successful import - confirm.
readyToLoad = true;