root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bfm / compiler.groovy @ 966
History | View | Annotate | Download (26.7 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
|
22 |
//
|
23 |
// This file is part of the TXM platform.
|
24 |
//
|
25 |
// The TXM platform is free software: you can redistribute it and/or modify
|
26 |
// it under the terms of the GNU General Public License as published by
|
27 |
// the Free Software Foundation, either version 3 of the License, or
|
28 |
// (at your option) any later version.
|
29 |
//
|
30 |
// The TXM platform is distributed in the hope that it will be useful,
|
31 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 |
// GNU General Public License for more details.
|
34 |
//
|
35 |
// You should have received a copy of the GNU General Public License
|
36 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 |
//
|
38 |
//
|
39 |
//
|
40 |
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
|
41 |
// $LastChangedRevision: 3400 $
|
42 |
// $LastChangedBy: mdecorde $
|
43 |
//
|
44 |
package org.txm.importer.bfm
|
45 |
|
46 |
import org.txm.Toolbox; |
47 |
import org.txm.importer.cwb.* |
48 |
import org.txm.importer.*; |
49 |
import org.txm.scripts.*; |
50 |
import org.txm.importer.xmltxm.BuildTTSrc; |
51 |
import org.txm.importer.xmltxm.*; |
52 |
import org.txm.utils.treetagger.TreeTagger; |
53 |
|
54 |
import javax.xml.stream.*; |
55 |
import java.net.URL; |
56 |
import java.io.File; |
57 |
import java.util.Comparator; |
58 |
import java.util.HashMap; |
59 |
import java.util.List; |
60 |
|
61 |
// TODO: Auto-generated Javadoc
|
62 |
/**
|
63 |
* Produce CQP files from the TEI-TXM files. <br/>
|
64 |
* - Read texts metadata with XPath queries <br/>
|
65 |
* - Add the following word properties : sic, abbr, orig, lb and pb <br/>
|
66 |
* - Keep <front>, <body> and <back> for each text <br/>
|
67 |
* - Text enclosed in <q> is tokenized <br/>
|
68 |
*
|
69 |
* @author mdecorde
|
70 |
*
|
71 |
*/
|
72 |
class compiler { |
73 |
// ---------------------------------------------------------------
// State shared by every compiler instance (static)
// ---------------------------------------------------------------

/** True until the first word line has been written; while true, the "ana" types met are recorded in anaTypes. */
static boolean firstWord = true;

/** The "type" values of the ana elements recorded while firstWord was true; one CQP column is written per entry. */
private static def anaTypes = [];

/** Listener notified of the elements opened and closed inside "text"; reset at the beginning of run(). */
private static SAttributesListener sattrsListener;

/** Result of sattrsListener.getStructs(), read in run() to declare the structural attributes. */
private static HashMap<String,ArrayList<String>> structs;

/** Result of sattrsListener.getProfs(), read in run() to declare the depth of each structure. */
private static HashMap<String, Integer> structsProf;

// ---------------------------------------------------------------
// Per-instance state
// ---------------------------------------------------------------

/** When true, the CWB wrappers used by run() are switched to debug mode. */
private boolean debug = false;

/** Value set by setAnnotationDone(). */
private boolean annotate_status = true;

/** The URL of the TEI-TXM file to process. */
private def url;

/** The stream opened on the url. */
private def inputData;

/** The StAX factory used to create the parser. */
private def factory;

/** The StAX parser reading inputData. */
private XMLStreamReader parser;

/** The dir. */
private def dir;

/** The writer of the CQP file, (re)opened by createOutput(). */
private def output;

/** The ana values of the current word, indexed by ana type. */
private HashMap<String,String> anahash = new HashMap<String,String>();

/** The text identifier, written in the "id" attribute of the text structure. */
private String text = "";

/** The base name, written in the "base" attribute of the text structure. */
private String base = "";

/** The project name, written in the "project" attribute of the text structure. */
private String project = "";

/** The language code written for the corpus and for the words. */
private String lang = "fr";

/**
 * Contains the metadata XPath queries, organized per metadata name.
 */
Properties metadataXPath;
|
122 |
|
123 |
/**
 * Creates a compiler without any source file and resets the state shared
 * between instances: the recorded ana types are forgotten and the next word
 * written is considered to be the first one.
 */
public compiler() {
	anaTypes = [];
	firstWord = true;
}
131 |
|
132 |
/**
 * Initializes the compiler for one TEI-TXM file: opens the file and creates
 * the StAX parser that transfomFileCqp() will read.
 *
 * The shared SAttributesListener is created on first use and re-attached to
 * the new parser afterwards. Errors are reported on the standard output and
 * leave the parser unset.
 *
 * @param url the file to process
 * @param text the text's name
 * @param base the base's name
 * @param project the project's name
 * @param metadataXPath the metadata XPath queries, per metadata name
 */
public compiler(URL url,String text,String base, String project, Properties metadataXPath)
{
	this.metadataXPath = metadataXPath;
	this.text = text
	this.base = base;
	this.project = project;
	this.url = url;
	try {
		inputData = url.openStream();

		factory = XMLInputFactory.newInstance();
		parser = factory.createXMLStreamReader(inputData);
		if (sattrsListener == null)
			sattrsListener = new SAttributesListener(parser);
		else
			sattrsListener.start(parser)
	} catch (XMLStreamException ex) {
		System.out.println("XMLStreamException while parsing "+url+": "+ex);
		// the parser could not be created: do not keep the stream opened
		try {
			if (inputData != null) inputData.close();
		} catch (IOException ignored) {
			// nothing more can be done for a stream that refuses to close
		}
	} catch (IOException ex) {
		System.out.println("IOException while parsing "+url+": "+ex);
	}
}
162 |
|
163 |
/** The TEI-TXM files in the order computed by run(). */
ArrayList<File> orderedFiles;

/**
 * Gets the ordered TEI-TXM files.
 *
 * @return the files as ordered by run(), null before run() has ordered them
 */
public ArrayList<File> getOrderedTxmFiles() {
	return this.orderedFiles;
}
167 |
|
168 |
/**
 * Defines the language code written in the CQP file.
 *
 * @param lang the new language code
 */
public void setLang(String lang) {
	this.lang = lang
}
177 |
|
178 |
/**
 * Stores the annotation status.
 *
 * @param done the new annotation status
 */
public void setAnnotationDone(boolean done) {
	this.annotate_status = done
}
187 |
|
188 |
/**
 * Opens the UTF-8 writer of the CQP file and stores it in the "output" field.
 * The file is opened in append mode when it already exists, so that
 * successive calls add content to the same file.
 *
 * @param f the file to write
 * @return true if the writer could be opened; false otherwise, after the
 * error message has been printed
 */
private boolean createOutput(File f) {
	boolean opened = false;
	try {
		boolean append = f.exists();
		FileOutputStream stream = new FileOutputStream(f, append);
		output = new OutputStreamWriter(stream, "UTF-8");
		opened = true;
	} catch (Exception e) {
		System.out.println(e.getLocalizedMessage());
	}
	return opened;
}
204 |
|
205 |
/**
 * Moves the parser forward until the end tag of "teiHeader" has been read,
 * or until the end of the document if no such tag is found.
 */
private void GoToText()
{
	int event = parser.next();
	while (event != XMLStreamConstants.END_DOCUMENT) {
		boolean headerClosed = (event == XMLStreamConstants.END_ELEMENT) && parser.getLocalName().equals("teiHeader");
		if (headerClosed) {
			return;
		}
		event = parser.next();
	}
}
216 |
|
217 |
/**
 * Computes the next number of a structure.
 *
 * The first attribute of the current element whose local name is "n" wins:
 * when its value is an integer, that integer is returned. In every other
 * case (no "n" attribute, or a value that is not an integer) the result is
 * the given value plus one.
 *
 * @param parser the parser, positioned on a start element
 * @param value the current number of the structure
 * @return the new number of the structure
 */
private def increment(XMLStreamReader parser, int value)
{
	int next = value + 1;
	int attributes = parser.getAttributeCount();
	for (int idx = 0 ; idx < attributes ; idx++) {
		if ("n" != parser.getAttributeLocalName(idx)) {
			continue;
		}
		// only the first "n" attribute is considered
		String declared = parser.getAttributeValue(idx);
		if (declared == null) {
			return next;
		}
		try {
			return Integer.parseInt(declared);
		} catch (Exception e) {
			return next;
		}
	}
	return next;
}
245 |
|
246 |
/**
 * Appends the CQP transcription of the current TEI-TXM file to the given file.
 *
 * The TEI header is skipped, then each start/end tag of a known structure is
 * copied as a CQP structure and each word produces one tab-separated line:
 * form, id, q, sp, pb, lb, orig, sic, abbr, ref, pos, supplied, lang,
 * nametype, followed by one column per recorded "ana" type.
 *
 * Words found inside "choice" produce a single line when the "choice" is
 * closed; words found inside "surplus" or "del" are not written.
 *
 * @param cqpFile the CQP file to append to
 * @return true, if successful
 */
private boolean transfomFileCqp(File cqpFile)
{
	try {
		if (!createOutput(cqpFile)) return false;

		// ---- current word
		String vAna = "";
		String vForm = "";
		String wordid = "";
		String givenpos = "";
		String anaType;
		int missingId = 0; // numbers the words that have no id

		// ---- structure numbering
		Integer p_id = 0;
		Integer s_id = 0;
		Integer q_id = 0;
		int sp_id = 0;
		Integer body_id = 0;
		Integer front_id = 0;
		Integer back_id = 0;
		Integer lb_id = 0;
		Integer pb_id = 0;
		Integer ab_id = 0;
		int foreign_id = 0;
		int name_id = 0;
		String pb_n = "";
		String nameType = "";

		// ---- position in the document
		boolean foundtext = false;
		boolean captureword = false;
		int levelq = 0;
		int levelSupplied = 0;
		boolean flagSp = false;
		boolean flagForm = false;
		boolean flagAna = false;
		boolean flagfw = false; // inside "surplus" or "del": words are not written

		// ---- content of the current "choice"
		// NOTE(review): the text of "reg" and "expan" was never captured by the
		// previous implementation either; the written form is the "corr" text
		// when there is one, else the form of the last word read.
		boolean flagchoice = false;
		boolean flagcorr = false;
		boolean flagsic = false;
		boolean flagorig = false;
		boolean flagabbr = false;
		String vCorr = "";
		String vOrig = "";
		String vSic = "";
		String vAbbr = "";

		// The language configured with setLang() is restored after each
		// "foreign" element instead of being replaced by a hard-coded "fr".
		String defaultLang = (lang == null || lang == "") ? "fr" : lang;
		String currentLang = defaultLang;

		boolean USEVERSE = false; // switch default reference to verse references
		String titreId; // the title to use in the reference

		// Writes the line of one word; shared by the "w" and "choice" end tags
		// so that both apply the same defaults and the same escaping.
		def writeWordLine = { String form ->
			if (vOrig == "") vOrig = "NA";
			if (vSic == "") vSic = "NA";
			if (vAbbr == "") vAbbr = "NA";
			if (nameType == "") nameType = "NA";

			String ref = titreId+", p."+pb_n;
			if (USEVERSE) ref += ", v."+lb_id;

			firstWord = false;
			output.write(form.replace("&", "&amp;").replace("<", "&lt;")
					+"\t"+wordid
					+"\t"+levelq.toString().substring(0,1)
					+"\t"+flagSp.toString().substring(0,1)
					+"\t"+pb_n
					+"\t"+lb_id
					+"\t"+vOrig
					+"\t"+vSic
					+"\t"+vAbbr
					+"\t"+ref
					+"\t"+givenpos
					+"\t"+levelSupplied.toString().substring(0,1)
					+"\t"+currentLang
					+"\t"+nameType);
			for (String type : anaTypes) {
				output.write("\t"+anahash.get(type));
			}
			output.write("\n")
		}

		this.GoToText();

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.START_ELEMENT) {

				String localname = parser.getLocalName();
				if (foundtext) sattrsListener.startElement(localname);

				switch (localname) {
					case "text":
						foundtext = true;
						sattrsListener.startElement(localname);
						output.write("<text id=\""+text+"\"")

						for (int i = 0; i < parser.getAttributeCount() ; i++) {
							String name = parser.getAttributeLocalName(i);
							// these three are always written by the compiler itself
							if ("id" == name || "base" == name || "project" == name) continue;
							output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\"");

							if (name == "forme") {
								USEVERSE = (parser.getAttributeValue(i).contains("vers"))
							} else if (name == "sigle") {
								titreId = parser.getAttributeValue(i)
							}
						}

						output.write(" base=\""+base+"\" project=\""+project+"\">\n");
						captureword = true;
						break;

					case "div":
						String divType = "NA";
						String divSubtype = "NA";
						String divN = "NA";
						String divId = "NA";
						// every attribute is read: stopping at "id" would lose
						// the attributes declared after it
						for (int i = 0 ; i < parser.getAttributeCount(); i++) {
							String name = parser.getAttributeLocalName(i);
							if (name == "type") {
								divType = parser.getAttributeValue(i);
							} else if (name == "subtype") {
								divSubtype = parser.getAttributeValue(i);
							} else if (name == "n") {
								divN = parser.getAttributeValue(i);
							} else if (name == "id") {
								divId = parser.getAttributeValue(i);
							}
						}
						output.write("<div type=\""+divType+"\" subtype=\""+divSubtype+"\" n=\""+divN+"\" id=\""+divId+"\">\n");
						break;
					case "p":
						p_id = increment(parser, p_id);
						output.write("<p n=\""+p_id+"\">\n");
						break;
					case "ab":
						ab_id = increment(parser, ab_id)
						output.write("<ab n=\""+(ab_id)+"\" type=\""+parser.getAttributeValue(null,"type")+"\" subtype=\""+parser.getAttributeValue(null,"subtype")+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n");
						break;
					case "q":
						q_id = increment(parser, q_id)
						output.write("<q n=\""+(q_id)+"\">\n");
						levelq = levelq + 1;
						break;
					case "sp":
						sp_id = increment(parser, sp_id)
						output.write("<sp n=\""+(sp_id)+"\">\n");
						flagSp = true;
						break;
					case "front":
						front_id = increment(parser, front_id)
						output.write("<front n=\""+front_id+"\">\n");
						break;
					case "body":
						body_id = increment(parser, body_id)
						output.write("<body n=\""+body_id+"\">\n");
						break;
					case "back":
						back_id = increment(parser, back_id)
						output.write("<back n=\""+back_id+"\">\n");
						break;
					case "lb":
						lb_id = increment(parser, lb_id)
						break;
					case "pb":
						pb_id = increment(parser, pb_id)
						for (int i = 0 ; i < parser.getAttributeCount(); i++) {
							if (parser.getAttributeLocalName(i) == "n") {
								pb_n = parser.getAttributeValue(i);
							}
						}
						break;
					case "s":
						s_id = increment(parser, s_id)
						output.write("<s n=\""+s_id+"\">\n");
						break;

					case "choice":
						flagchoice = true;
						break;
					case "corr":
						flagcorr = true;
						vCorr = "";
						break;
					case "orig":
						flagorig = true; // was wrongly raising the "reg" flag
						vOrig = "";
						break;
					case "sic":
						flagsic = true;
						vSic = "";
						break;
					case "abbr":
						flagabbr = true; // was wrongly raising the "reg" flag
						vAbbr = "";
						break;

					case "foreign":
						for (int i = 0 ; i < parser.getAttributeCount(); i++) {
							if (parser.getAttributeLocalName(i) == "lang") {
								currentLang = parser.getAttributeValue(i);
								break;
							}
						}
						output.write("<foreign n=\""+(foreign_id++)+"\" lang=\""+currentLang+"\">\n");
						break;

					case "name":
						for (int i = 0 ; i < parser.getAttributeCount(); i++) {
							if (parser.getAttributeLocalName(i) == "type") {
								nameType = parser.getAttributeValue(i);
								break;
							}
						}
						output.write("<name n=\""+(name_id++)+"\" type=\""+nameType+"\">\n");
						break;

					case "supplied":
						levelSupplied = levelSupplied + 1;
						break;

					case "surplus":
					case "del":
						flagfw = true;
						break;

					case "w":
						givenpos = "";
						wordid = "w_"+text+"_m"+missingId++
						for (int i = 0 ; i < parser.getAttributeCount(); i++) {
							if (parser.getAttributeLocalName(i) == "id") {
								wordid = parser.getAttributeValue(i);
							} else if (parser.getAttributeLocalName(i) == "type") {
								givenpos = parser.getAttributeValue(i);
							}
						}
						// word ids always start with "w_"
						if (wordid.startsWith("w")) {
							if (!wordid.startsWith("w_"))
								wordid = "w_"+wordid.substring(1)
						} else {
							wordid = "w_"+wordid;
						}

						if (givenpos == null || givenpos == "")
							givenpos = "NA";
						vForm = "";
						anahash.clear(); // remove previous word ana values
						break;

					case "form":
						flagForm = true;
						vForm = "";
						break;

					case "ana":
						flagAna = true;
						anaType = parser.getAttributeValue(null, "type")
						anahash.put(anaType, "");
						if (firstWord) {
							anaTypes << anaType;
						}
						break;
				}
			} else if (event == XMLStreamConstants.END_ELEMENT) {

				String localname = parser.getLocalName();
				if (foundtext) sattrsListener.endElement(localname);

				switch (localname) {
					case "div":
						output.write("</div>\n");
						break;
					case "text":
						output.write("</text>\n");
						captureword = false;
						break;
					case "p":
						output.write("</p>\n");
						break;
					case "s":
						output.write("</s>\n");
						break;
					case "ab":
						output.write("</ab>\n");
						break;
					case "q":
						output.write("</q>\n");
						levelq = levelq - 1;
						break;
					case "sp":
						output.write("</sp>\n");
						flagSp = false;
						break;
					case "front":
						output.write("</front>\n");
						break;
					case "body":
						output.write("</body>\n");
						break;
					case "back":
						output.write("</back>\n");
						break;

					case "choice":
						if (!flagfw) {
							writeWordLine((vCorr != "") ? vCorr : vForm);
						}
						// the values of a choice must not leak on the next words
						flagchoice = false;
						vCorr = "";
						vSic = "";
						vOrig = "";
						vAbbr = "";
						break;
					// The captured texts are kept until the "choice" is closed:
					// clearing them here would always write "NA".
					case "corr":
						flagcorr = false;
						break;
					case "orig":
						flagorig = false;
						break;
					case "sic":
						flagsic = false;
						break;
					case "abbr":
						flagabbr = false;
						break;

					case "foreign":
						currentLang = defaultLang;
						output.write("</foreign>\n");
						break;

					case "name":
						nameType = "";
						output.write("</name>\n");
						break;

					case "supplied":
						levelSupplied = levelSupplied - 1;
						break;

					case "surplus":
					case "del":
						flagfw = false;
						break;

					case "w":
						// inside a "choice" the line is written when the choice ends
						if (captureword && !flagchoice && !flagfw) {
							writeWordLine(vForm);
						}
						break;

					case "form":
						flagForm = false;
						break;

					case "ana":
						flagAna = false;
						anahash.put(anaType, vAna);
						vAna = "";
						break;
				}
			} else if (event == XMLStreamConstants.CHARACTERS) {
				String chars = parser.getText().trim();
				if (flagAna) {
					vAna += chars;
				}

				if (flagForm) {
					vForm += chars;
					if (flagchoice) {
						if (flagsic) vSic += chars;
						if (flagorig) vOrig += chars;
						if (flagabbr) vAbbr += chars;
						if (flagcorr) vCorr += chars;
					}
				}
			}
		}
		// closed here so that a failure while flushing is reported as an error
		output.close();
	} catch (Exception ex) {
		System.out.println("Exception while parsing " + url);
		ex.printStackTrace();
		return false;
	} finally {
		// release the file and the parser whatever happened
		closeQuietly(output);
		closeQuietly(parser);
		closeQuietly(inputData);
	}

	return true;
}

/**
 * Closes a resource, ignoring a null resource and any error raised by close().
 *
 * @param resource an object with a close() method, may be null
 */
private static void closeQuietly(def resource)
{
	try {
		if (resource != null) resource.close();
	} catch (Exception ignored) {
		// best effort: the resource is not used anymore
	}
}
741 |
|
742 |
|
743 |
/**
 * Builds the CWB corpus from the TEI-TXM files.
 *
 * The files are ordered by composition date (alphabetical order for equal
 * dates), compiled one after the other in a single CQP file enclosed in a
 * "txmcorpus" structure, then encoded with cwb-encode and cwb-makeall.
 * The "cqp" and "data/corpusname" directories of binDir are emptied first.
 *
 * @param binDir the directory receiving the "cqp", "data" and "registry" directories
 * @param txmDir contains the TEI-TXM files
 * @param corpusname the name of the corpus
 * @param metadataXPath the metadata XPath queries per metadata name, may be null
 * @return true, if successful
 */
public boolean run(File binDir, File txmDir, String corpusname, Properties metadataXPath)
{
	sattrsListener = null; // reset SAttribute Listener for each new import
	this.metadataXPath = metadataXPath;

	if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
		println ("Error: CWB executables not well set.")
		return false;
	}
	if (!txmDir.exists()) {
		println ("TEI-TXM directory does not exist: "+txmDir)
		return false;
	}

	File cqpFile = new File(binDir, "cqp/${corpusname}.cqp");
	new File(binDir, "/cqp/").deleteDir();
	new File(binDir, "/cqp/").mkdirs();
	new File(binDir, "/data/${corpusname}").deleteDir();
	new File(binDir, "/data/${corpusname}").mkdirs(); // "data" may not exist yet
	new File(binDir, "registry/").mkdirs();

	File[] listed = txmDir.listFiles();
	if (listed == null) { // not a directory, or not readable
		println ("Error: could not list the files of "+txmDir)
		return false;
	}
	List<File> files = new ArrayList<File>(Arrays.asList(listed));

	// get text siecles to be able to sort with it
	String xpath = "//tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when"
	if (metadataXPath != null && metadataXPath.containsKey("datecompo"))
		xpath = metadataXPath.get("datecompo")
	HashMap<File,Integer[]> filesiecle = new HashMap<File, Integer[]>()
	for (File f : files) {
		filesiecle.put(f, parseCompositionDate(XPathResult.getXpathResponse(f, xpath)));
	}

	Collections.sort(files); // Alpha order
	Collections.sort(files, new Comparator<File>() { // Date order (stable: keeps alpha order for equal dates)
				@Override
				public int compare(File o1, File o2) {
					Integer[] date1 = filesiecle.get(o1);
					Integer[] date2 = filesiecle.get(o2);
					for (int i = 0 ; i < 3 ; i++) { // year, then month, then day
						int cmp = date1[i].compareTo(date2[i]);
						if (cmp != 0) return cmp < 0 ? -1 : 1;
					}
					return 0;
				}
			});

	this.orderedFiles = files;
	println("process "+files.size()+" files ")

	//write txmcorpus
	if (!createOutput(cqpFile)) {
		println "Error: could not write cqp file"
		return false;
	} else {
		output.write("<txmcorpus lang=\""+lang+"\">\n");
		output.close();
	}

	//1- Transform into CQP file
	for (File f : files) {
		if (!f.exists()) {
			println("file "+f+ " does not exists")
		} else {
			print "."
			// the text name is the file name without its extension
			String fileName = f.getName();
			int dot = fileName.lastIndexOf('.');
			String txtname = (dot > 0) ? fileName.substring(0, dot) : fileName;
			def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath);
			builder.setLang(lang)
			builder.setAnnotationDone(this.annotate_status);
			if (!builder.transfomFileCqp(cqpFile)) {
				println "Failed to compile "+f
			}
		}
	}

	//close txmcorpus
	if (!createOutput(cqpFile)) {
		println "Error: could not write cqp file"
		return false;
	} else {
		output.write("</txmcorpus>\n");
		output.close();
	}
	println ""

	//2- Import into CWB
	def outDir = binDir.getAbsolutePath();
	CwbEncode cwbEn = new CwbEncode();
	cwbEn.setDebug(debug);
	CwbMakeAll cwbMa = new CwbMakeAll();
	cwbMa.setDebug(debug);

	def pAttrs = ["id","q","sp","pb","lb","orig","sic","abbr","ref","pos","supplied","lang","nametype"];
	for (String type : anaTypes)
		pAttrs.add(type.substring(1)); // remove #

	structs = sattrsListener.getStructs();
	structsProf = sattrsListener.getProfs();
	if (debug) {
		println structs
		println structsProf
	}

	// Appends the missing attributes to a "name:depth+attr+attr" declaration.
	// Attribute names are compared as a whole: "+name" does not count as "+n".
	def ensureAttributes = { String declaration, List<String> required ->
		List<String> parts = declaration.tokenize("+");
		List<String> declared = parts.size() > 1 ? parts.subList(1, parts.size()) : [];
		for (String attr : required) {
			if (!declared.contains(attr)) declaration += "+"+attr;
		}
		return declaration;
	}

	// add structures+properties found in sources
	List<String> sargs = new ArrayList<String>();
	for (String name : structs.keySet()) {
		if (name == "text") continue; // added after
		String concat = name+":"+structsProf.get(name); // append the depth
		for (String value : structs.get(name)) // append the attributes
			concat += "+"+value;
		if (name == "p" || name == "body" || name == "back" || name == "front")
			concat = ensureAttributes(concat, ["n"]);
		sargs.add(concat);
	}

	String textSAttributes = "text:0+id+base+project";
	if (metadataXPath != null) {
		for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
			textSAttributes+="+"+meta;
	}
	if (metadataXPath == null || !metadataXPath.keySet().contains("sigle"))
		textSAttributes+="+sigle";

	sargs.add(textSAttributes)
	sargs.add("txmcorpus:0+lang")

	// attributes always written by transfomFileCqp must be declared
	for (int c = 0 ; c < sargs.size() ; c++) {
		String sarg = sargs.get(c);
		if (sarg.startsWith("q:") || sarg.startsWith("foreign:")) {
			sargs.set(c, ensureAttributes(sarg, ["n", "lang"]));
		} else if (sarg.startsWith("ab:") || sarg.startsWith("sp:")) {
			sargs.set(c, ensureAttributes(sarg, ["n", "subtype", "rend"]));
		} else if (sarg.startsWith("div:")) {
			sargs.set(c, ensureAttributes(sarg, ["n", "id", "type", "subtype"]));
		} else if (sarg.startsWith("name:")) {
			sargs.set(c, ensureAttributes(sarg, ["n", "type"]));
		}
	}
	sargs.sort();

	String[] sAttributes = sargs;
	String[] pAttributes = pAttrs;
	println "P-attributes: "+pAttributes
	println "S-attributes: "+sargs

	try {
		String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lower case registry files
		cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
		if (!new File(regPath).exists()) {
			println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
			return false;
		}
		cwbMa.run(corpusname, outDir + "/registry");
	} catch (Exception ex) {System.out.println(ex);return false;}

	return true;
}

/**
 * Reads a composition date written "yyyy", "yyyy-mm" or "yyyy-mm-dd".
 * Missing month and day are set to 1.
 *
 * @param datecompo the date to read, may be null
 * @return the year, month and day; three zeros when the date is null or
 * cannot be read, so that the file is sorted before the dated ones
 */
private static Integer[] parseCompositionDate(String datecompo)
{
	Integer[] unknown = [0, 0, 0] as Integer[];
	if (datecompo == null) return unknown;

	String[] split = datecompo.trim().split("-");
	if (split.length < 1 || split.length > 3) return unknown;
	try {
		Integer[] date = [Integer.parseInt(split[0]), 1, 1] as Integer[];
		for (int i = 1 ; i < split.length ; i++) {
			date[i] = Integer.parseInt(split[i]);
		}
		return date;
	} catch (NumberFormatException e) {
		// a malformed date must not stop the whole import
		System.out.println("Warning: could not read the composition date: "+datecompo);
		return unknown;
	}
}
958 |
|
959 |
/**
 * Switches the debug mode on: the CWB utilities run by run() show their
 * messages.
 */
public void setDebug() {
	debug = true
}
966 |
|
967 |
/**
 * Test purpose: compiles a directory of TEI-TXM files in debug mode.
 *
 * The binary directory must be given because run() empties its "cqp" and
 * "data" sub-directories.
 *
 * @param args the arguments: the binary directory, then optionally the
 * directory of the TEI-TXM files (default "xml/bfm" in the user home) and
 * the corpus name (default "BFM")
 */
public static void main(String[] args)
{
	if (args.length < 1) {
		println "Usage: compiler <binDir> [txmDir] [corpusname]"
		return;
	}
	File binDir = new File(args[0]);
	// "~" is not expanded by java.io.File: resolve the user home explicitly
	File txmDir = (args.length > 1) ? new File(args[1]) : new File(System.getProperty("user.home"), "xml/bfm");
	String corpusname = (args.length > 2) ? args[2] : "BFM";

	def c = new compiler();
	c.setDebug();
	if (!c.run(binDir, txmDir, corpusname, new Properties())) {
		println "Compilation failed"
	}
}
980 |
} |