root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxm / compiler.groovy @ 1000
History | View | Annotate | Download (12.5 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
|
25 |
// $LastChangedRevision: 3219 $
|
26 |
// $LastChangedBy: mdecorde $
|
27 |
//
|
28 |
|
29 |
|
30 |
package org.txm.scripts.importer.xmltxm;
|
31 |
|
32 |
import java.util.ArrayList;; |
33 |
|
34 |
import org.txm.*; |
35 |
import org.txm.core.engines.*; |
36 |
import org.txm.importer.cwb.BuildCwbEncodeArgs; |
37 |
import org.txm.importer.cwb.CwbEncode |
38 |
import org.txm.importer.cwb.CwbMakeAll |
39 |
import org.txm.scripts.importer.*; |
40 |
import org.txm.scripts.*; |
41 |
import org.txm.importer.scripts.xmltxm.*; |
42 |
import org.txm.utils.treetagger.TreeTagger; |
43 |
|
44 |
import javax.xml.stream.*; |
45 |
import java.net.URL; |
46 |
import java.io.File; |
47 |
import java.util.HashMap; |
48 |
import java.util.List; |
49 |
|
50 |
// TODO: Auto-generated Javadoc
|
51 |
/**
|
52 |
* The Class compiler.
|
53 |
*/
|
54 |
class compiler |
55 |
{ |
56 |
/** Value stored by setSortMetadata(String); not read anywhere in the visible part of this class. */
String sortMetadata

/** When true, run() switches the CwbEncode and CwbMakeAll wrappers to debug mode. */
private boolean debug = false

/** Stream opened on the XML-TXM file currently being read. */
private def inputData

/** StAX factory used to create the parser. */
private def factory

/** StAX reader positioned in the XML-TXM file currently being read. */
private XMLStreamReader parser

/** Not used in the visible part of this class. */
private def dir

/** Writer on the CQP file; (re)opened by createOutput(File). */
private Writer output

/** URL of the XML-TXM file given to the constructor. */
private def url

/** Identifier written as the "id" attribute of the text structure. */
String text = ""

/** Value written as the "base" attribute of the text structure. */
String base = ""

/** Value written as the "project" attribute of the text structure. */
String project = ""

/** Language written as the "lang" attribute of the txmcorpus structure. */
private String lang = "fr"

/** Structure name (lower case) mapped to the attribute names seen on it; reset by run(). */
private static HashMap<String, List<String>> sAttribs

/** Types of word annotations, in the order of the columns written after the word id; reset by run(). */
private static anatypes = []

/** Annotation type mapped to its value for the word being read; reset at each word start. */
private static anavalues = [:]
96 |
|
97 |
/**
 * Creates a compiler that is not bound to any input file.
 * No stream or parser is opened by this constructor.
 */
public compiler()
{
}
|
102 |
|
103 |
/**
 * Creates a compiler bound to one XML-TXM file and opens a StAX parser on it.
 * A failure to open the stream or to create the parser is printed and
 * swallowed, leaving the parser unset.
 *
 * @param url the location of the XML-TXM file to read
 * @param text the text identifier
 * @param base the value kept for the "base" attribute
 * @param project the value kept for the "project" attribute
 */
public compiler(URL url, String text, String base, String project)
{
	this.text = text;
	this.base = base;
	this.project = project;
	try {
		this.url = url;
		this.inputData = url.openStream();

		this.factory = XMLInputFactory.newInstance();
		this.parser = this.factory.createXMLStreamReader(this.inputData);
	}
	catch (Exception ex) {
		System.out.println("Error while creating indexes: $ex");
		ex.printStackTrace();
	}
}
127 |
|
128 |
/**
 * Stores the sort metadata value on this instance.
 *
 * @param sortMetadata the value to keep
 */
public void setSortMetadata(String sortMetadata)
{
	this.sortMetadata = sortMetadata
}
132 |
|
133 |
/**
 * Sets the language written on the txmcorpus structure.
 *
 * @param lang the language value
 * @return the value that was assigned
 */
public setLang(String lang)
{
	return this.lang = lang;
}
143 |
|
144 |
/**
 * Opens the UTF-8 writer used for the CQP file and keeps it in 'output'.
 * The file is opened in append mode when it already exists, otherwise it is created.
 *
 * @param f the CQP file to write to
 * @return true when the writer was opened, false when opening failed (the error is printed)
 */
private boolean createOutput(File f) {
	try {
		// append only when there is already something to append to
		boolean append = f.exists();
		FileOutputStream rawStream = new FileOutputStream(f, append);
		output = new OutputStreamWriter(new BufferedOutputStream(rawStream), "UTF-8");
	}
	catch (Exception e) {
		println "Error while create CQP otput file: "+e
		e.printStackTrace();
		return false;
	}
	return true;
}
163 |
|
164 |
/**
 * Moves the parser forward until the end tag of the TEI header has been read.
 *
 * @return true once a closing "teiHeader" (or "teiheader") element is met,
 *         false when the end of the document is reached first
 */
private boolean GoToText()
{
	int event = parser.next();
	while (event != XMLStreamConstants.END_DOCUMENT) {
		if (event == XMLStreamConstants.END_ELEMENT) {
			String closed = parser.getLocalName();
			if (closed.equals("teiHeader") || closed.equals("teiheader")) {
				return true;
			}
		}
		event = parser.next();
	}
	return false;
}
177 |
|
178 |
/**
 * Appends the XML-TXM file bound to this compiler to a CQP file.
 *
 * Everything up to the end of the TEI header is skipped. After that, each
 * element found outside a word is written as an opening/closing structure
 * tag (lower-cased name, lower-cased attribute names) and its attributes
 * are recorded in sAttribs. Each "w" element becomes one line: the default
 * form, the word id, then one tab-separated column per entry of anatypes.
 *
 * The writer, the parser and the input stream are released before
 * returning, whatever the outcome.
 *
 * @param cqpFile the CQP file to append to
 * @return true if the file was fully written; false if the CQP file could
 *         not be opened, no TEI header end tag was found, or an error
 *         occurred while reading or writing
 */
public boolean transfomFileCqp(File cqpFile)
{
	// every write below needs 'output': give up right away when it cannot be opened
	if (!createOutput(cqpFile)) {
		return false;
	}

	String vAna = "";
	String vForm = "";
	String wordid = "";
	String anatype = null;
	String anavalue = null;

	boolean flagForm = false;
	boolean flagAna = false;
	boolean inW = false;
	int wcounter = 1;

	try {
		// kept outside the catch below so that parser errors raised while
		// skipping the header still propagate to the caller
		if (!GoToText()) {
			println "Error: no teiHeader tag found in text '"+this.text+"' (please check file format or content)"
			return false;
		}

		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						switch (parser.getLocalName()) {
							case "w":
								inW = true;
								anavalues = [:]
								wordid = parser.getAttributeValue(null, "id")
								// words without id get a generated one, numbered per file
								if (wordid == null)
									wordid = "w_"+text+"_"+(wcounter++)
								vAna = "";
								break;

							case "form":
								String type2 = parser.getAttributeValue(null, "type");
								if (type2 == null || type2.equals("default")) {
									flagForm = true;
									vForm = "";
								} else {
									// NOTE(review): the text of a non-default form is accumulated
									// into anavalue but nothing stores it when the form ends —
									// confirm this is intended
									flagAna = true;
									vAna += "\t";
									if (!anatypes.contains(type2))
										anatypes << type2;
								}
								break;

							case "ana":
								flagAna = true;
								anavalue = "";
								anatype = parser.getAttributeValue(null, "type");
								// the type is a reference such as "#pos": drop the leading '#'
								if (anatype != null && anatype.startsWith("#"))
									anatype = anatype.substring(1)
								break;

							default:
								if (!inW) {
									String tagName = parser.getLocalName();
									String tagKey = tagName.toLowerCase();
									output.write("<"+tagKey);

									// sAttribs is keyed by the lower-cased name; testing the raw
									// name here would reset the list of every tag written with
									// capital letters each time it is met
									if (!sAttribs.containsKey(tagKey))
										sAttribs.put(tagKey, []);
									List<String> knownAttrs = sAttribs.get(tagKey);

									for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
										String attrname = parser.getAttributeLocalName(i).toLowerCase();
										String attrvalue = parser.getAttributeValue(i);
										// the id of a text is replaced by the one given to the constructor
										if (!(tagName == "text" && attrname == "id"))
											output.write(" "+attrname+"=\""+attrvalue.replace("\"", "'")+"\"");

										if (!knownAttrs.contains(attrname))
											knownAttrs.add(attrname)
									}

									if (tagName == "text") {
										output.write(" id=\""+text+"\" base=\""+base+"\" project=\""+project+"\"");
									}
									output.write(">\n");
								}
						}
						break;

					case XMLStreamConstants.END_ELEMENT:
						switch (parser.getLocalName()) {
							case "TEI":
								break;

							case "w":
								// one column per known annotation type, empty when the word has none
								for (String t : anatypes) {
									def v = anavalues.get(t);
									vAna += "\t" + (v != null ? v : "");
								}

								// the parser hands back decoded text: re-escape the characters
								// that would otherwise read as markup in the CQP file
								output.write(vForm.replaceAll("&", "&amp;").replaceAll("<", "&lt;")+"\t"+wordid+vAna+"\n");
								vAna = "";
								vForm = "";
								inW = false;
								break;

							case "form":
								flagForm = false;
								flagAna = false;
								break;

							case "ana":
								anavalues.put(anatype, anavalue)
								flagAna = false;
								break;

							default:
								if (!inW)
									output.write("</"+parser.getLocalName().toLowerCase()+">\n");
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						if (inW) {
							if (flagForm) {
								vForm += parser.getText().trim();
							} else if (flagAna) {
								anavalue += parser.getText().trim();
							}
						}
						break;
				}
			}

			// closed here as well so that a failing flush is reported as an error
			output.close();
			parser.close();
		}
		catch (Exception ex) {
			System.out.println("Error while writing CQP file $ex");
			ex.printStackTrace();
			return false;
		}
		return true;
	}
	finally {
		// best-effort cleanup for every exit path; closing twice is harmless
		try { if (output != null) output.close(); } catch (Exception ignored) { }
		try { if (parser != null) parser.close(); } catch (Exception ignored) { }
		try { if (inputData != null) inputData.close(); } catch (Exception ignored) { }
	}
}
333 |
|
334 |
|
335 |
|
336 |
/**
 * Builds the CWB indexes of a corpus from a list of XML-TXM files.
 *
 * Steps: collect the annotation types of all files, write one CQP file
 * (a txmcorpus structure wrapping every text), then call cwb-encode and
 * cwb-makeall through their wrappers. The "cqp" and "data/&lt;corpusname&gt;"
 * folders of the binary directory are deleted and recreated.
 *
 * @param binDir the binary directory receiving the cqp, data and registry folders
 * @param txmDir not used by this method
 * @param basename not used by this method
 * @param corpusname the corpus name; lower-cased for the CQP file, the registry file and the "base" attribute
 * @param files the XML-TXM files to process, in corpus order
 * @return true if the indexes were built; false on any failure (the cause is printed)
 */
public boolean run(File binDir, File txmDir, String basename, String corpusname, List<File> files)
{
	anatypes = new ArrayList<String>();// init only 1 time
	anavalues = [:]
	sAttribs = new HashMap<String, List<String>>();// init only 1 time
	String rootDir = binDir.getAbsolutePath();

	if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
		println ("Error: CWB executables not well set.")
		return false;
	}
	if (!new File(rootDir).exists()) {
		println ("binary directory does not exists: "+rootDir)
		return false;
	}

	File cqpFile = new File(binDir,"cqp/"+corpusname.toLowerCase()+".cqp");
	new File(rootDir,"/cqp/").deleteDir();
	new File(rootDir,"/cqp/").mkdir();
	new File(rootDir,"data/"+corpusname).deleteDir();
	// mkdirs: the parent "data" folder may not exist yet
	new File(rootDir,"data/"+corpusname).mkdirs();
	new File(rootDir,"registry/").mkdir();

	// get all anatypes
	for (File f : files) {
		getAnaTypes(f)
	}

	//0 set Lang
	if (!createOutput(cqpFile)) {
		// nothing can be written to the CQP file; createOutput printed the cause
		return false;
	}
	try {
		output.write("<txmcorpus lang=\""+lang+"\">\n");
	} finally {
		output.close();
	}

	//1- Transform into CQP file
	for (File f : files) {
		if (!f.exists()) {
			println("file "+f+ " does not exists")
			continue;
		}
		// the text id is the file name minus its last four characters
		// (i.e. a three-letter extension and its dot)
		String txtname = f.getName().substring(0,f.getName().length()-4);
		def builder = new compiler(f.toURI().toURL(), txtname, corpusname.toLowerCase(), "default");
		builder.setLang(lang);
		if (!builder.transfomFileCqp(cqpFile))
			return false;
	}

	//end corpus
	if (!createOutput(cqpFile)) {
		return false;
	}
	try {
		output.write("</txmcorpus>\n");
	} finally {
		output.close();
	}

	//2- Import into CWB
	CwbEncode cwbEn = new CwbEncode();
	cwbEn.setDebug(debug);
	CwbMakeAll cwbMa = new CwbMakeAll();
	cwbMa.setDebug(debug);

	// positional attributes: the word id followed by every annotation type
	List<String> pargs = ["id"];
	for (String ana : anatypes)
		pargs.add(ana);

	// structural attributes: the attributes this compiler writes itself must
	// always be declared, whether or not the structure was met in the sources
	List<String> sargs = [];
	if (sAttribs.containsKey("text")) {
		List<String> textAttrs = sAttribs.get("text");
		for (String required : ["id", "base", "project"]) {
			if (!textAttrs.contains(required))
				textAttrs.add(required);
		}
	} else {
		sargs.add("text:0+id+base+project")
	}

	if (sAttribs.containsKey("txmcorpus")) {
		if (!sAttribs.get("txmcorpus").contains("lang"))
			sAttribs.get("txmcorpus").add("lang");
	} else {
		sargs.add("txmcorpus:0+lang")
	}

	for (String tag : this.sAttribs.keySet()) {
		String sAttr = tag;
		if (sAttribs.get(tag).size() > 0)
			sAttr += ":";
		for (String attr : sAttribs.get(tag))
			sAttr +="+"+attr;
		sargs.add(sAttr)
	}

	String[] sAttributes = sargs;
	String[] pAttributes = pargs;
	println "sAttributes : "+sAttributes;
	println "pAttributes : "+pAttributes;
	try {
		String regPath = rootDir + "/registry/"+corpusname.toLowerCase()
		cwbEn.run(
				rootDir + "/data/$corpusname",
				cqpFile.getAbsolutePath(),
				regPath, pAttributes, sAttributes);
		if (!new File(regPath).exists()) {
			println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
			return false;
		}
		cwbMa.run(corpusname, rootDir + "/registry");
	} catch (Exception ex) {
		System.out.println("Error while creating indexes with CQP tools: $ex");
		ex.printStackTrace();
		return false;
	}

	return true;
}
467 |
|
468 |
/**
 * Turns debug mode on; run() forwards the flag to the CWB tool wrappers.
 */
public void setDebug()
{
	debug = true;
}
475 |
|
476 |
/**
 * Manual test entry point: compiles the files of the "xml/geo" folder of
 * the user home into a corpus named "geo".
 *
 * @param args not used
 */
public static void main(String[] args)
{
	// java.io.File does not expand "~": resolve the folder against the user home
	File dir = new File(System.getProperty("user.home"), "xml/geo");

	// NOTE(review): layout assumed — sources read from 'dir' and the cqp, data and
	// registry folders created in that same folder; confirm before relying on it
	List<File> files = new ArrayList<File>();
	File[] found = dir.listFiles();
	if (found != null) {
		for (File f : found) {
			// skip the folders run() creates next to the sources
			if (f.isFile())
				files.add(f);
		}
	}

	def c = new compiler();
	c.setDebug();
	// no setCwbPath call: this class defines no such method, and run() checks the
	// CWB executables itself through CwbEncode and CwbMakeAll
	c.run(dir, dir, "geo", "geo", files);
}
489 |
|
490 |
/**
 * Reads one XML-TXM file and adds the types of its "ana" elements to
 * anatypes, skipping those already present.
 *
 * The type is taken from the "type" attribute; a leading '#' is removed
 * when present, the same way transfomFileCqp does, so that the columns
 * declared here match the values written there.
 *
 * @param xmlFile the XML-TXM file to scan
 */
private void getAnaTypes(File xmlFile) {
	inputData = xmlFile.toURI().toURL().openStream();
	XMLStreamReader reader = null;
	HashSet<String> types = new HashSet<String>();
	try {
		factory = XMLInputFactory.newInstance();
		reader = factory.createXMLStreamReader(inputData);
		parser = reader;

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event != XMLStreamConstants.START_ELEMENT || !"ana".equals(parser.getLocalName())) {
				continue;
			}
			for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
				if ("type".equals(parser.getAttributeLocalName(i))) {
					String type = parser.getAttributeValue(i);
					// only strip a real '#': "pos" must stay "pos", and "" must not throw
					if (type.startsWith("#"))
						type = type.substring(1);
					types.add(type);
					break;
				}
			}
		}
	} finally {
		// the reader does not own the stream: release both, even after a parse error
		if (reader != null) {
			try { reader.close(); } catch (Exception ignored) { }
		}
		try { inputData.close(); } catch (Exception ignored) { }
	}

	for (String type : types) {
		if (!anatypes.contains(type))
			anatypes << type
	}
}
514 |
} |