root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZCompiler.groovy @ 2126
History | View | Annotate | Download (8.1 kB)
1 |
package org.txm.scripts.importer.xtz
|
---|---|
2 |
|
3 |
import java.io.File; |
4 |
|
5 |
import org.txm.* |
6 |
import org.txm.scripts.importer.SAttributesListener |
7 |
import org.txm.importer.cwb.* |
8 |
import org.txm.utils.ConsoleProgressBar |
9 |
import org.txm.core.preferences.TXMPreferences |
10 |
import org.txm.core.preferences.TBXPreferences |
11 |
import org.txm.libs.cqp.CQPLibPreferences |
12 |
|
13 |
import javax.xml.stream.* |
14 |
import org.txm.importer.xtz.* |
15 |
import org.txm.objects.* |
16 |
import org.txm.searchengine.cqp.corpus.* |
17 |
|
18 |
class XTZCompiler extends Compiler { |
19 |
|
20 |
// Result of the structure scan; assigned by doScanStep() and read by doCWBEncodeStep() through getStructs()/getProfs().
SAttributesListener sattrsListener; // store scanned structures
|
21 |
// Filled by getAnaTypes(): the @type value of each "ana" element found inside a word element, minus its first character.
private def anatypes = new HashSet<String>() // store scanned word attributes |
22 |
|
23 |
// Path built in the constructor: <binary directory>/registry/<corpus name in lower case>.
String regPath;
|
24 |
// Name of the module's project, read in the constructor.
String corpusname;
|
25 |
// Local name of the word element, taken from the project's getTokenizerWordElement().
String wtag;
|
26 |
|
27 |
// Both flags are overwritten in the constructor from the "import" preferences node and handed to each XTZCompilerStep.
boolean doNormalizeAttributeValues = false; |
28 |
boolean doNormalizeAnaValues = true; |
29 |
|
30 |
/**
 * Creates the compiler of the given import module and reads its settings from the module's project:
 * corpus name, registry path, word element name and the two normalization flags.
 *
 * @param module the import module this compiler belongs to
 */
public XTZCompiler(ImportModule module) {
	super(module);

	Project project = module.getProject()
	corpusname = project.getName()

	// <binary directory>/registry/<corpus name in lower case>
	String binaryPath = module.getBinaryDirectory().getAbsolutePath()
	regPath = binaryPath + "/registry/" + corpusname.toLowerCase()

	wtag = project.getTokenizerWordElement()

	// the flags are stored as strings in the "import" node; a missing value counts as "false"
	def importNode = project.getPreferencesScope().getNode("import")
	String attributeFlag = importNode.get(TBXPreferences.NORMALISEATTRIBUTEVALUES, "false")
	String anaFlag = importNode.get(TBXPreferences.NORMALISEANAVALUES, "false")
	doNormalizeAttributeValues = "true".equals(attributeFlag)
	doNormalizeAnaValues = "true".equals(anaFlag)
}
41 |
|
42 |
/**
 * Builds the corpus: prepares the corpus object of the project, then runs the scan, CQP,
 * cwb-encode and cwb-makeall steps in that order, stopping at the first step that fails.
 * {@code isSuccessFul} is set to true only when every step succeeded.
 *
 * @param files the files to compile; when null the content of the input directory is listed
 */
public void process(ArrayList<File> files) {
	super.process(files); // set member

	// NOTE(review): "files" below is the method parameter. If the superclass keeps the argument in
	// its own "files" member, that member is not updated by this fallback - confirm against Compiler.
	if (files == null) {
		files = inputDirectory.listFiles();
	}

	Project project = module.getProject();
	CorpusBuild corpus = project.getCorpusBuild(project.getName(), MainCorpus.class);
	if (corpus == null) {
		// first build: create the corpus and name it after the project
		corpus = new MainCorpus(project);
		corpus.setID(project.getName());
		corpus.setName(project.getName());
	} else if (project.getDoUpdate()) {
		corpus.clean(); // remove old files
	} else {
		corpus.delete(); // remove old files and TXMResult children
	}
	corpus.setDescription("Built with the XTZ import module");

	// && short-circuits: a step only runs when all the previous ones returned true
	boolean allStepsDone = doScanStep() && doCQPStep() && doCWBEncodeStep() && doCWBMakeAllStep()
	if (!allStepsDone) {
		return;
	}

	if (module.getProject().getCleanAfterBuild()) {
		File cqpBuildDirectory = new File(module.getBinaryDirectory(), "cqp")
		cqpBuildDirectory.deleteDir()
	}

	isSuccessFul = true;
}
73 |
|
74 |
/**
 * Scan all XML-TXM files to find out structures and word properties.
 *
 * The structures are collected by SAttributesListener.scanFiles() on the input directory,
 * the word properties by getAnaTypes() on each file.
 *
 * @return true when every file was scanned, false as soon as one file raises an exception
 */
public boolean doScanStep() {
	// structures and their attributes
	sattrsListener = SAttributesListener.scanFiles(inputDirectory, wtag)

	int total = files.size()
	println "-- Listing structures&properties to create for "+total+" XML-TXM files..."
	ConsoleProgressBar progress = new ConsoleProgressBar(total)

	// word properties, one file at a time
	for (File f : files) {
		try {
			progress.tick()
			getAnaTypes(f)
		} catch (Exception e) {
			println "Error while processing $f text: "+e
			e.printStackTrace()
			return false
		}
	}

	println ""
	return true
}
95 |
|
96 |
/**
 * Reads one XML file and adds to {@code anatypes} the type of every "ana" element found
 * inside a word element (the element whose local name is {@code wtag}).
 *
 * The type is the value of the "type" attribute without its first character (the leading '#').
 * The input stream and the XML parser are always closed, even when the file cannot be parsed.
 *
 * @param xmlFile the XML-TXM file to scan
 * @throws Exception if the file cannot be opened or is not well-formed XML
 */
private void getAnaTypes(File xmlFile) {
	final String ANA = "ana"
	final String TYPE = "type"

	def inputData = xmlFile.toURI().toURL().openStream()
	def parser = null
	try {
		parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)

		boolean insideWord = false // true between the start and the end tag of a word element
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.START_ELEMENT) {
				String localName = parser.getLocalName()
				if (wtag.equals(localName)) {
					insideWord = true
				} else if (insideWord && ANA.equals(localName)) {
					for (int i = 0; i < parser.getAttributeCount(); i++) { // find @type
						if (TYPE.equals(parser.getAttributeLocalName(i))) {
							anatypes.add(parser.getAttributeValue(i).substring(1)) // remove the #
							break
						}
					}
				}
			} else if (event == XMLStreamConstants.END_ELEMENT) {
				if (wtag.equals(parser.getLocalName())) {
					insideWord = false
				}
			}
		}
	} finally {
		// XMLStreamReader.close() does not close the underlying stream: close both,
		// and make sure the stream is closed even if closing the parser fails
		try {
			if (parser != null) parser.close()
		} finally {
			inputData.close()
		}
	}
}
129 |
|
130 |
/** CQP files registered by doCQPStep(), in processing order; concatenated by doCWBEncodeStep(). */
def cqpFiles = []

/**
 * Writes one CQP file per input file in the CQP directory.
 *
 * The CQP file is named after the input file without its extension. A file is not rebuilt
 * when its CQP file already exists and is at least as recent as the input file, but it is
 * still registered in {@code cqpFiles}.
 *
 * @return true when every file was processed or skipped; false as soon as one XTZCompilerStep fails,
 *         in which case {@code reason} names the failing file
 */
public boolean doCQPStep() {
	println "-- Building CQP files $inputDirectory..."
	cqpDirectory.mkdir(); // if not created

	// start from an empty list: running the step again must not register every text twice
	cqpFiles.clear()

	ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
	for (File xmlFile : files) {
		cpb.tick();

		// text name = file name without its extension
		String textname = xmlFile.getName();
		int idx = textname.lastIndexOf(".")
		if (idx > 0) textname = textname.substring(0, idx)

		File cqpFile = new File(cqpDirectory, textname + ".cqp")
		cqpFiles << cqpFile

		// skip step if cqpFile is more recent than xmlFile
		if (cqpFile.exists() && cqpFile.lastModified() >= xmlFile.lastModified()) continue;

		XTZCompilerStep step = new XTZCompilerStep(xmlFile, cqpFile, textname, corpusname, "default", anatypes, wtag)
		step.setNormalizeAnaValues(doNormalizeAnaValues)
		step.setNormalizeAttributeValues(doNormalizeAttributeValues)
		if (!step.process()) {
			reason = "Fail to process $xmlFile."
			return false;
		}
	}
	println ""
	return true;
}
158 |
|
159 |
/**
 * Concatenates the CQP files into "all.cqp" and runs cwb-encode on it.
 *
 * Word properties are "id" followed by the scanned word attributes. Each scanned structure is
 * declared as {@code name:depth+attribute...} with its attribute names in lower case and an "n"
 * attribute added when missing. The "text" structure is declared as {@code text:0+id+base+project}
 * followed by its other attributes, and {@code txmcorpus:0+lang} is always declared.
 * Attribute names are lower-cased before duplicates are looked for, so that names differing
 * only by case (for instance "N" and "n") are declared once.
 *
 * @return true if the master CQP file was written and cwb-encode succeeded, false otherwise
 */
public boolean doCWBEncodeStep() {
	println "-- Running cwb-encode..."
	CwbEncode cwbEn = new CwbEncode()
	cwbEn.setDebug(debug)

	// word properties: "id" first, then the scanned ones
	List<String> pargs = new ArrayList<String>()
	pargs.add("id")
	for (String ana : anatypes) {
		pargs.add(ana)
	}
	String[] pAttributes = pargs as String[]

	def structs = sattrsListener.getStructs()
	def structsProf = sattrsListener.getProfs()

	if (debug) {
		println structs
		println structsProf
	}

	List<String> sargs = new ArrayList<String>()

	// "text" always starts with id, base and project; its scanned attributes are appended after them
	Set<String> textAttributes = new LinkedHashSet<String>(["id", "base", "project"])

	for (String name : structs.keySet()) {
		if (name == "text") {
			for (String value : structs.get(name)) {
				textAttributes.add(value.toLowerCase())
			}
			continue;
		}

		// insertion-ordered set: keeps the scan order and drops names that collide once lower-cased
		Set<String> attributes = new LinkedHashSet<String>()
		for (String attributeName : structs.get(name)) {
			attributes.add(attributeName.toLowerCase())
		}
		attributes.add("n") // every structure gets an "n" attribute

		sargs.add(declareStructure(name + ":" + structsProf.get(name), attributes)) // head = name:depth
	}

	sargs.add(declareStructure("text:0", textAttributes))
	sargs.add("txmcorpus:0+lang")

	sargs.sort()

	String[] sAttributes = sargs as String[]
	println " Word properties: "+pAttributes
	println " Structures: "+sargs

	File allcqpFile = new File(cqpDirectory, "all.cqp");
	allcqpFile.delete()
	try {
		if (!CwbEncode.concat(cqpFiles, allcqpFile)) {
			println "Fail to write the master cqp file: "+allcqpFile
			return false;
		}

		if (!cwbEn.run(outputDirectory.getAbsolutePath() + "/$corpusname",
				allcqpFile.getAbsolutePath(),
				regPath, pAttributes, sAttributes, false)) {
			println "** cwb-encode did not ends well. Activate finer logs to see details."
			return false;
		}

		allcqpFile.delete(); // clean
	} catch (Exception e) {
		println "Error while running cwb-encode: "+e
		e.printStackTrace()
		allcqpFile.delete(); // clean
		return false;
	}
	println ""
	return true;
}

/** Builds a structure declaration: the head followed by "+name" for each attribute name, in iteration order. */
private static String declareStructure(String head, Collection<String> attributeNames) {
	StringBuilder declaration = new StringBuilder(head)
	for (String attributeName : attributeNames) {
		declaration.append("+").append(attributeName)
	}
	return declaration.toString()
}
246 |
|
247 |
/**
 * Runs cwb-makeall on the corpus, then runs FixMilestoneDeclarations on the registry file
 * and the corpus data directory.
 *
 * @return true when the registry file exists and both cwb-makeall and the milestone fix succeeded;
 *         false otherwise, including when an exception is raised
 */
public boolean doCWBMakeAllStep() {
	println "-- Running cwb-makeall..."
	try {
		CwbMakeAll cwbMa = new CwbMakeAll()
		cwbMa.setDebug(debug)

		// the registry file is expected to be there after the cwb-encode step
		File registryFile = new File(regPath)
		if (!registryFile.exists()) {
			println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
			return false
		}

		// cwb-makeall is given the directory that contains the registry file
		boolean indexed = cwbMa.run(corpusname, registryFile.getParent())
		if (!indexed) {
			println "** cwb-makeall did not ends well. Activate finer logs to see details."
			return false
		}

		// remove milestones from CWB registry and data files
		File dataDirectory = new File(outputDirectory.getAbsolutePath(), corpusname)
		FixMilestoneDeclarations fm = new FixMilestoneDeclarations(registryFile, dataDirectory)
		if (!fm.process()) {
			println "Fail to verify&fix milestone declarations"
			return false
		}

		return true
	} catch (Exception e) {
		println "Error while running cwb-makeall: "+e
		return false
	}
}
275 |
} |