Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / bfm / bfmLoader.groovy @ 1488

History | View | Annotate | Download (6.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
//
7
// This file is part of the TXM platform.
8
//
9
// The TXM platform is free software: you can redistribute it and/or modify
10
// it under the terms of the GNU General Public License as published by
11
// the Free Software Foundation, either version 3 of the License, or
12
// (at your option) any later version.
13
//
14
// The TXM platform is distributed in the hope that it will be useful,
15
// but WITHOUT ANY WARRANTY; without even the implied warranty of
16
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
// GNU General Public License for more details.
18
//
19
// You should have received a copy of the GNU General Public License
20
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-03-01 10:10:58 +0100 (mar. 01 mars 2016) $
25
// $LastChangedRevision: 3133 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.bfm;
29

    
30
import javax.xml.stream.XMLStreamReader;
31
import org.txm.importer.ApplyXsl2;
32
import org.txm.scripts.importer.bfm.importer;
33
import org.txm.scripts.importer.bfm.compiler;
34
import org.txm.scripts.importer.bfm.pager;
35
import org.txm.objects.*;
36
import org.txm.*;
37
import org.txm.core.engines.*;
38
import org.txm.utils.i18n.*;
39
import org.txm.importer.scripts.xmltxm.*;
40
import org.txm.scripts.importer.*;
41
import org.w3c.dom.Element
42
import org.txm.utils.xml.DomUtils;
43
import org.txm.utils.*
44
import org.txm.utils.io.*;
45
import org.txm.importer.*
46

    
47
// Script inputs: the caller provides 'projectBinding' (the project to import) and,
// optionally, 'monitor' (a progress monitor) through the script binding.
String userDir = System.getProperty("user.home");
Project project = null;

// 'projectBinding' may be absent from the binding or may not be a Project:
// in both cases 'project' stays null and the script aborts below.
try {
	project = projectBinding;
} catch (Exception e) {
	project = null;
}

// Bind MONITOR independently of the project lookup. Without this, a missing 'monitor'
// leaves MONITOR undefined and the first later "MONITOR != null" test throws a
// MissingPropertyException instead of simply skipping the progress reporting.
if (getBinding().hasVariable("monitor")) {
	MONITOR = monitor;
} else if (!getBinding().hasVariable("MONITOR")) {
	MONITOR = null;
}

if (project == null) { println "no project set. Aborting"; return; }
53

    
54
// Import parameters, all read from the project.
String corpusname = project.getName()
String basename = corpusname
String rootDir = project.getSrcdir()
String encoding = project.getEncoding()
String lang = project.getLang()
String model = lang // the annotation step below is given the language as its "lang" option
boolean annotate = project.getAnnotate()

// Optional front XSL stylesheet and its parameters
String xsl = project.getFrontXSL()
def xslParams = project.getXsltParameters()

// Settings of the "default" edition
def defaultEdition = project.getEditionDefinition("default")
int wordsPerPage = defaultEdition.getWordsPerPage()
String page_element = defaultEdition.getPageElement()
boolean build_edition = defaultEdition.getBuildEdition()
66

    
67
// Source directory (read) and project directory (written)
File srcDir = new File(rootDir)
File binDir = project.getProjectDirectory()
binDir.mkdirs()
boolean binDirAvailable = binDir.exists()
if (!binDirAvailable) {
	println "Could not create binDir "+binDir
	return
}

// Start from an empty "txm/<corpus name>" working directory
File txmDir = new File(binDir,"txm/$corpusname")
txmDir.deleteDir()
txmDir.mkdirs()
78

    
79
// BFM XPATH PARAMETERS
// Loads the optional "import.properties" file of the source directory into 'metadataXPath'.
// When the file is missing or unreadable, 'metadataXPath' stays empty and a message is printed.
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(1, "READ XPATH PARAMETERS")
File paramDeclarationFile = new File(srcDir, "import.properties");
Properties metadataXPath = new Properties();
if (paramDeclarationFile.exists() && paramDeclarationFile.canRead()) {
	// withReader closes the reader even when load() throws, so no stream is leaked
	paramDeclarationFile.withReader("UTF-8") { reader ->
		metadataXPath.load(reader);
	}

	// Both keys are optional: only warn when they are absent
	if (!metadataXPath.containsKey("titre")) {
		println "Warning: parameters property file does not contain the 'titre' metadata"
	}
	if (!metadataXPath.containsKey("forme")) {
		println "Warning: parameters property file does not contain the 'forme' metadata in param values. The default value is 'prose'"
	}
} else {
	println "No '$paramDeclarationFile' file found"
}
96
// Apply XSL
// Runs the optional front XSL stylesheet over the source directory, writing into "<binDir>/src".
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(1, "APPLYING XSL")
}
boolean frontXslConfigured = xsl != null && xslParams != null && xsl.trim().length() > 0
if (frontXslConfigured) {
	File transformedDir = new File(binDir, "src")
	// The sources are switched to the transformed directory only when
	// processImportSources returns a true value; otherwise the import goes on
	// with the original source directory (there is no early return here).
	if (ApplyXsl2.processImportSources(new File(xsl), srcDir, transformedDir, xslParams)) {
		srcDir = transformedDir
	}
	println ""
}
105

    
106
// copy txm files
// Every visible source file that passes ValidateXml.test is copied into txmDir.
// Import configuration files ("import.xml", "metadata.xxx"/"metadata.xxxx", "*.properties")
// and hidden files are skipped without being tested.
println "-- VALIDATION - checking XML source files well-formedness"
List<File> srcfiles = srcDir.listFiles();
for (File f : srcfiles) { // check XML format, and copy file into binDir
	String fileName = f.getName();
	boolean notACorpusFile = f.isHidden() || fileName.equals("import.xml") || fileName.matches("metadata\\.....?") || fileName.endsWith(".properties");
	if (notACorpusFile) {
		continue;
	}
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, fileName));
	} else {
		println "Won't process file "+f;
	}
}

// File.listFiles() returns an empty array (not null) for an existing empty directory,
// so both cases must be tested to detect that nothing was copied.
File[] copiedFiles = txmDir.listFiles();
if (copiedFiles == null || copiedFiles.length == 0) {
	println "No txm file to process"
	return;
}
123

    
124
// Run the BFM importer; the script stops when it returns a false value.
if (MONITOR != null) {
	MONITOR.worked(1, "IMPORTER")
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
}
println "-- IMPORTER - Reading source files"
def bfmImporter = new importer()
def importSucceeded = bfmImporter.run(srcDir, binDir, txmDir, basename, metadataXPath)
if (!importSucceeded) {
	println "import process stopped";
	return;
}
132

    
133
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "ANNOTATE")

// 'annotate_status' is later handed to the compiler through setAnnotationDone().
// It starts at false so that a skipped or failed annotation is not reported as done
// (it previously started at true, which made the success test below a no-op).
// NOTE(review): confirm that no BFM corpus relied on the former always-true value.
boolean annotate_status = false;
if (annotate) {
	println "-- ANNOTATE - Running NLP tools - $model model"
	def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine("TreeTagger")
	if (engine == null) {
		// avoid a NullPointerException when no such engine is returned
		println "TreeTagger annotation engine not available: annotation skipped"
	} else if (engine.processDirectory(txmDir, binDir, ["lang":model])) {
		annotate_status = true;
	}
}
144

    
145
// Run the BFM compiler; the script stops when it returns a false value.
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(25, "COMPILING")
}
println "-- COMPILING - Building Search Engine indexes"
def c = new compiler()
if (debug) {
	c.setDebug()
}
c.setLang(lang)
c.setAnnotationDone(annotate_status)
def compilationSucceeded = c.run(project, binDir, txmDir, corpusname, metadataXPath)
if (!compilationSucceeded) {
	println "import process stopped";
	return;
}
157

    
158
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }

// Reset the HTML output directory of the corpus
File htmlDir = new File(binDir,"HTML/$corpusname");
htmlDir.deleteDir();
htmlDir.mkdirs();

// Build the "default" edition: one Text and one Edition per TXM file, with the pages
// produced by the pager registered on the Edition.
if (build_edition) {

	println "-- EDITION"
	if (MONITOR != null) MONITOR.worked(25, "EDITION")
	File outdir = new File(binDir,"/HTML/$corpusname/default/");
	outdir.mkdirs();
	// NOTE(review): 'files' is undeclared, hence stored in the script binding;
	// kept as is in case a caller reads it - confirm and make it local otherwise.
	files = c.getOrderedTxmFiles();

	// These lists only depend on the corpus language: fetch them once, outside the loop
	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

	println("Building editions: "+files.size()+" files" );
	for (File txmFile : files) {
		print "."
		// text name = file name without its last extension
		String txtname = txmFile.getName();
		int idx = txtname.lastIndexOf(".");
		if (idx > 0) txtname = txtname.substring(0, idx);

		Text t = new Text(project);
		t.setName(txtname);
		t.setSourceFile(txmFile)
		t.setTXMFile(txmFile)

		def ed = new pager(txmFile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, wordsPerPage, corpusname, metadataXPath);
		Edition edition = new Edition(t);
		edition.setName("default");
		edition.setIndex(outdir.getAbsolutePath());

		// Pages are numbered from 1. Each page is registered with the entry of
		// ed.getIdx() at the same position, or "w_0" when that list is shorter.
		def pageFiles = ed.getPageFiles();
		def wordIds = ed.getIdx();
		for (int pageIndex = 0; pageIndex < pageFiles.size(); pageIndex++) {
			String wordid = pageIndex < wordIds.size() ? wordIds.get(pageIndex) : "w_0";
			edition.addPage("" + (pageIndex + 1), wordid);
		}
	}
}
196
if (MONITOR != null) {
	if (MONITOR.isCanceled()) { return MONITOR.done(); }
	MONITOR.worked(20, "FINALIZING")
}

// 'readyToLoad' is undeclared, so the result of project.save() is stored in the script binding
readyToLoad = project.save()