root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZDefaultPagerStep.groovy @ 2126
History | View | Annotate | Download (16.8 kB)
1 |
package org.txm.scripts.importer.xtz;
|
---|---|
2 |
|
3 |
import java.io.File; |
4 |
import java.io.OutputStreamWriter; |
5 |
import java.util.ArrayList; |
6 |
import java.util.List; |
7 |
|
8 |
import javax.xml.stream.*; |
9 |
|
10 |
import org.txm.scripts.importer.StaxStackWriter; |
11 |
import org.eclipse.ui.part.PageSwitcher |
12 |
import org.txm.importer.xtz.* |
13 |
|
14 |
public class XTZDefaultPagerStep { |
15 |
|
16 |
// ---- spacing rules ---------------------------------------------------------

/** Word forms that are written without a separator in front of them. */
List<String> NoSpaceBefore;

/** Word forms (or final characters of a word form) after which no separator is written. */
List<String> NoSpaceAfter;

// ---- pagination counters ---------------------------------------------------

/** Number of words read since the current page was started. */
int wordcount = 0;

/** Number of pages created so far; also used to number the page files. */
int pagecount = 0;

/** Word count at which a new page is started (taken from the pager's wordsPerPage). */
int wordmax = 0;

// ---- naming and output location --------------------------------------------

/** Corpus name, shown upper-cased in the title of every page. */
String basename = "";
/** Text name, used as prefix of the page file names. */
String txtname = "";
/** Directory receiving the page files. */
File outdir;

// ---- state of the word being read ------------------------------------------

/** Value of the "id" attribute of the word being read. */
String wordid;

/** True until the first word of the current page has been met. */
boolean firstWord = true;

/** Text accumulated from the "form" element of the word being read. */
String wordvalue = "";

/** Annotation text accumulated from the "ana" elements of the word being read. */
String interpvalue = "";

/** Form of the previously written word. */
String lastword = " ";

/** The wordtype (not used by the code visible in this class). */
String wordtype;

/** True while the parser is inside a "form" element. */
boolean flagform = false;

/** True while the parser is inside an "ana" element. */
boolean flaginterp = false;

// ---- input -----------------------------------------------------------------

/** URL of the input file. */
private def url;

/** Stream opened on the input URL. */
private def inputData;

/** StAX factory used to create the parser. */
private def factory;

/** StAX reader over the input stream. */
private XMLStreamReader parser;

// ---- output ----------------------------------------------------------------

/** The writer (not used by the code visible in this class). */
OutputStreamWriter writer;

/** Writer of the page currently being produced; null before the first page is created. */
StaxStackWriter pagedWriter = null;

/** The input file. */
File infile;

/** File of the page currently being produced. */
File outfile;

/** Page files, in creation order. */
//TODO enhance this to store the page name/id as well
ArrayList<File> pages = new ArrayList<File>();

/** For each page, the id of its first word (or a "<wordTag>_0" placeholder when the page has no word). */
ArrayList<String> idxstart = new ArrayList<String>();
/** Local name of the element that forces a page break. */
String paginationElement;
/** Style sheet references linked from every page. */
def cssList;
/** Local name of the word element. */
def wordTag = "w";
/** Local names of the elements whose text is collected as notes. */
def noteElements = new HashSet<String>();
/** Local names of the elements whose raw text is copied into the page. */
def outOfTextElements = new HashSet<String>();
/** The pager this step works for. */
XTZPager pager;
95 |
|
96 |
/**
 * Instantiates a new pager step for one input file.
 *
 * The pagination element, corpus name, output directory, words-per-page limit
 * and word tag are read from the pager. The output directory is created and a
 * StAX reader is opened on the input file; call process() to build the pages.
 *
 * @param pager the pager providing the pagination settings and the import module
 * @param infile the XML file to paginate
 * @param txtname the text name, used as prefix of the page file names
 * @param NoSpaceBefore the word forms written without a separator in front of them
 * @param NoSpaceAfter the word forms after which no separator is written
 * @param cssList the style sheet references linked from every page
 */
public XTZDefaultPagerStep(XTZPager pager, File infile, String txtname, List<String> NoSpaceBefore,
		List<String> NoSpaceAfter, def cssList) {
	this.pager = pager;
	this.paginationElement = pager.page_element;
	this.cssList = cssList;
	this.basename = pager.corpusname;
	this.txtname = txtname;
	this.outdir = pager.outputDirectory;
	this.wordmax = pager.wordsPerPage;
	this.NoSpaceBefore = NoSpaceBefore;
	this.NoSpaceAfter = NoSpaceAfter;
	this.url = infile.toURI().toURL();
	this.infile = infile;
	this.wordTag= pager.wordTag;
	outdir.mkdirs()

	inputData = new BufferedInputStream(url.openStream());
	try {
		factory = XMLInputFactory.newInstance();
		parser = factory.createXMLStreamReader(inputData);
	} catch (Exception e) {
		// do not leave the stream open when the reader cannot be created
		inputData.close();
		throw e;
	}

	// comma-separated element names; surrounding blanks and empty entries are ignored
	String notesListString = pager.getImportModule().getProject().getTextualPlan("Note")
	if (notesListString != null) {
		for (def s : notesListString.split(",")) {
			String elementName = s.trim()
			if (elementName.length() > 0) noteElements << elementName;
		}
	}

	String elems = pager.getImportModule().getProject().getTextualPlan("OutSideTextTagsAndKeepContent")
	if (elems != null) {
		for (def s : elems.split(",")) {
			String elementName = s.trim()
			if (elementName.length() > 0) outOfTextElements << elementName;
		}
	}

	//process();
}
134 |
|
135 |
/**
 * Looks up an attribute of the element the parser is positioned on, by local name.
 *
 * @param parser the reader positioned on a start element
 * @param ns not used: attributes are matched on their local name only
 * @param name the local name of the attribute
 * @return the value of the first attribute with that local name, or "" when there is none
 */
public String getAttributeValue(def parser, String ns, String name) {
	int total = parser.getAttributeCount()
	int idx = 0
	while (idx < total) {
		if (parser.getAttributeLocalName(idx) == name) {
			return parser.getAttributeValue(idx).toString()
		}
		idx++
	}
	return "";
}
143 |
|
144 |
/**
 * Finishes the page currently being written, if there is one.
 *
 * A placeholder anchor is registered when the page holds no word, the open
 * elements are closed, the collected notes are appended and the writer is closed.
 *
 * @return a copy of the writer's tag stack as it was before closing, minus its
 * leading entries up to and including the first "div" (presumably the page
 * wrappers, so that the remainder can be reopened on the next page); an empty
 * list when no page was open
 */
private def closeMultiWriter() {
	if (pagedWriter == null) {
		return [];
	}

	// snapshot taken before the elements get closed
	def tags = pagedWriter.getTagStack().clone();

	if (firstWord) { // there was no words
		pagedWriter.writeCharacters("");
		this.idxstart.add("${wordTag}_0")
		pagedWriter.write("<span id=\"${wordTag}_0\"/>");
	}
	pagedWriter.writeEndElements();

	// write notes
	if (notes.size() > 0) {
		pagedWriter.writeEmptyElement("hr", ["id":"notes", "width":"20%", "align":"left"]);
		int noteNumber = 1;
		for (String note : notes) {
			pagedWriter.writeStartElement("a", ["href":"#noteref_"+noteNumber, "name":"note_"+noteNumber]);
			pagedWriter.writeStartElement("sup")
			pagedWriter.writeCharacters(""+noteNumber)
			pagedWriter.writeEndElement() // </sup>
			pagedWriter.writeEndElement() // </a>
			pagedWriter.writeCharacters(note)
			pagedWriter.writeEmptyElement("br")
			noteNumber++;
		}
		notes.clear()
	}

	pagedWriter.close();

	// remove elements until first "div" tag (included)
	while (tags.size() > 0) {
		String tag = tags.remove(0)
		if (tag == "div") {
			break;
		}
	}

	return tags;
}
196 |
|
197 |
/**
 * Closes the current page, if any, and starts the next one.
 *
 * The word counter is reset, a new page file named
 * "<txtname>_<pagecount>.html" is registered, and the page prolog is written:
 * doctype, html, head (meta, style sheet links, title), body and the
 * "txmeditionpage" div. The elements returned by closeMultiWriter() are then
 * reopened so that the new page continues the interrupted structure.
 *
 * @return true, if successful; false when an exception occurred (its message
 * and stack trace are printed)
 */
private boolean createNextOutput() {
	wordcount = 0;
	try {
		def tags = closeMultiWriter();

		outfile = new File(outdir, txtname+"_"+(++pagecount)+".html")
		pages.add(outfile);
		firstWord = true; // waiting for next word

		pagedWriter = new StaxStackWriter(outfile, "UTF-8");

		//pagedWriter.writeStartDocument()
		pagedWriter.writeDTD("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
		pagedWriter.writeCharacters("\n")
		pagedWriter.writeStartElement("html");
		pagedWriter.writeCharacters("\n")
		// meta and link are only valid inside head, so head is opened first
		pagedWriter.writeStartElement("head");
		pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
		for (String css : cssList) {
			pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"$css"]);
		}
		pagedWriter.writeStartElement("title")
		pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
		pagedWriter.writeEndElement(); // </title>
		pagedWriter.writeEndElement() // </head>
		pagedWriter.writeCharacters("\n")
		pagedWriter.writeStartElement("body") //<body>
		pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div>
		// reopen the elements that were open when the previous page was closed
		pagedWriter.writeStartElements(tags);
		return true;
	} catch (Exception e) {
		System.out.println(e.getLocalizedMessage());
		e.printStackTrace()
		return false;
	}
}
239 |
|
240 |
/**
 * Starts a new page by delegating to createNextOutput().
 *
 * @return the result of createNextOutput(), or false when it threw (the
 * exception message is printed)
 */
private boolean createOutput() {
	boolean created = false
	try {
		created = createNextOutput()
	} catch (Exception failure) {
		System.out.println(failure.getLocalizedMessage())
	}
	return created
}
254 |
|
255 |
/**
 * Gives access to the page files registered so far.
 *
 * @return the live list of page files, in creation order
 */
public ArrayList<File> getPageFiles() {
	return this.pages
}
263 |
|
264 |
/**
 * Gives access to the first-word index of the pages.
 *
 * @return the live list holding, for each page, the id registered as its first word
 */
public ArrayList<String> getIdx() {
	return this.idxstart
}
272 |
|
273 |
/**
 * Advances the parser just past the end tag of the "teiHeader" element.
 *
 * When the document holds no such end tag, the parser is left on the
 * END_DOCUMENT event.
 */
private void goToText() {
	int event = parser.next()
	while (event != XMLStreamConstants.END_DOCUMENT) {
		if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().matches("teiHeader")) {
			return
		}
		event = parser.next()
	}
}
283 |
|
284 |
/** Texts of the notes met on the current page; written out and cleared when the page is closed. */
def notes = []
/** Elements currently open at or below an out-of-text element; only its size is tested. */
def currentOutOfTextElements = [] // stack of element with out of text to edit opened element
/** True while raw text must be copied into the page (inside an out-of-text element). */
def writeOutOfTextToEditText = false
287 |
/**
 * Builds the HTML pages of the input file.
 *
 * The parser is moved past the "teiHeader" end tag, a first page is created,
 * then every remaining event is translated: structural elements (text, head,
 * p, div*, list, item, table, row, cell, hi, emph, ref, graphic, lb) become
 * HTML elements, each word becomes a "span" carrying its id and its
 * annotations as title, note elements are collected to be written at the end
 * of the page, and the raw text of out-of-text elements is copied as is. A new
 * page is started on every pagination element and whenever the word count of
 * the current page reaches wordmax.
 *
 * @return true when the whole document was processed; false when the first
 * page could not be created or when an exception occurred (details are printed)
 */
public boolean process() {

	try {
		boolean flagNote = false;
		String noteContent = "";
		String rend = ""
		goToText();

		String localname = "";
		if (!createNextOutput()) {
			// nothing can be written: release the input before giving up
			if (parser != null) parser.close();
			if (inputData != null) inputData.close();
			return false;
		}

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			rend = "";
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName();
					if (outOfTextElements.contains(localname)) {
						currentOutOfTextElements << localname
						writeOutOfTextToEditText = true;
					} else if (currentOutOfTextElements.size() > 0) {
						currentOutOfTextElements << localname
					}

					if (localname == paginationElement) {
						// createNextOutput() also resets wordcount
						if (!createNextOutput()) {
							throw new IllegalStateException("could not create page "+pagecount);
						}
						pagedWriter.write("\n");
						// getAttributeValue() returns "" for a missing attribute: only write a page label when there is one
						String pageNumber = getAttributeValue(parser, null, "n")
						if (pageNumber) {
							pagedWriter.writeElement("p", ["class":"txmeditionpb", "align":"center"], pageNumber)
						}
					}

					rend = getAttributeValue(parser, null, "rend") ?: "";
					switch (localname) {
						case "text":
							LinkedHashMap attributes = new LinkedHashMap();
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
							}

							pagedWriter.write("\n");
							pagedWriter.writeStartElement("p")
							pagedWriter.writeAttribute("class", rend);
							if (attributes.containsKey("id")) {
								pagedWriter.writeElement("h3", attributes["id"])
							}

							// one table row per remaining text attribute
							pagedWriter.writeStartElement("table");
							for (String k : attributes.keySet()) {
								if (k == "id") continue;
								if (k == "rend") continue;

								pagedWriter.writeStartElement("tr");
								pagedWriter.writeElement("td", k);
								pagedWriter.writeElement("td", attributes[k]);
								pagedWriter.writeEndElement(); //tr
							}
							pagedWriter.writeEndElement() // table
							pagedWriter.writeEndElement() // p
							pagedWriter.writeCharacters("\n");
							break;
						case "ref":
							pagedWriter.writeStartElement("a")
							pagedWriter.writeAttribute("href", getAttributeValue(parser, null, "target"));
							pagedWriter.writeAttribute("target", "_blank");
							pagedWriter.writeAttribute("class", rend);
							break;
						case "head":
							pagedWriter.write("\n");
							pagedWriter.writeStartElement("h2", ["class":rend])
							break;
						case "graphic":
							pagedWriter.write("\n");
							String graphicUrl = getAttributeValue(parser, null, "url")
							if (graphicUrl) { // no image element without an url
								// TEI <graphic rend="left-image" url="image.png"/> -> <center class="left-image"><img src="image.png"/></center> + <corpus>.css with a ".left-image" rule
								pagedWriter.writeStartElement("center", ["class":rend]) // css -> .<rend> { ... } styles
								pagedWriter.writeEmptyElement("img", ["src":graphicUrl, "align":"middle"])
								pagedWriter.writeEndElement() // center
							}
							break;
						case "table":
							pagedWriter.writeStartElement("table", ["class":rend])
							pagedWriter.write("\n");
							break;
						case "row":
							pagedWriter.writeStartElement("tr", ["class":rend])
							break;
						case "cell":
							pagedWriter.writeStartElement("td", ["class":rend])
							break;
						case "list":
							String listType = getAttributeValue(parser, null,"type");
							if ("unordered" == listType) {
								pagedWriter.writeStartElement("ul", ["class":rend])
							} else {
								pagedWriter.writeStartElement("ol", ["class":rend])
							}
							break
						case "item":
							pagedWriter.writeStartElement("li", ["class":rend])
							break;
						case "hi":
						case "emph":
							if ("i".equals(rend) || "italic".equals(rend)) {
								pagedWriter.writeStartElement("i", ["class":rend])
							} else if ("b".equals(rend) || "bold".equals(rend)) {
								pagedWriter.writeStartElement("b", ["class":rend])
							} else {
								// no explicit rendition: emph is italic, hi is bold
								if ("emph".equals(localname)) {
									pagedWriter.writeStartElement("i", ["class":rend])
								} else { // hi
									pagedWriter.writeStartElement("b", ["class":rend])
								}
							}
							break;
						case "p":
						//case "lg":
							pagedWriter.write("\n");
							pagedWriter.writeStartElement("p", ["class":rend])
							break;
						case "div":
						case "div1":
						case "div2":
						case "div3":
						case "div4":
						case "div5":
							pagedWriter.writeStartElement("div", ["class":rend, "type":localname])
							break;
						case "lb":
						//case "l":
							pagedWriter.writeEmptyElement("br", ["class":rend])
							break;
						case wordTag:
							wordid = getAttributeValue(parser, null,"id");

							wordcount++;
							if (wordcount >= wordmax) {
								if (!createNextOutput()) {
									throw new IllegalStateException("could not create page "+pagecount);
								}
							}

							if (firstWord) {
								// first word of the page: register its id in the page index
								firstWord = false;
								this.idxstart.add(wordid);
							}

							break;
						case "ana":
							flaginterp=true;
							// the first character of @type is dropped (presumably a leading '#'); a missing @type gives an empty name
							String anaType = getAttributeValue(parser, null, "type")
							interpvalue+=" "+(anaType ? anaType.substring(1) : "")+":"
							break;
						case "form":
							wordvalue="";
							interpvalue ="";
							flagform=true;
							break;
						default:
							if (noteElements.contains(localname)) {
								flagNote = true;
								noteContent = ""
							}
							break;
					}
					break;
				case XMLStreamConstants.END_ELEMENT:
					localname = parser.getLocalName();
					if (currentOutOfTextElements.size() > 0) currentOutOfTextElements.pop()
					writeOutOfTextToEditText = currentOutOfTextElements.size() > 0

					switch (localname) {
						case "text":
							break;
						case "p":
						//case "lg":
							pagedWriter.writeEndElement() // </p>
							pagedWriter.write("\n");
							break;
						case "div":
						case "div1":
						case "div2":
						case "div3":
						case "div4":
						case "div5":
							pagedWriter.writeEndElement() // </div>
							pagedWriter.write("\n");
							break;
						case "head":
							pagedWriter.writeEndElement() // </h2>
							pagedWriter.write("\n");
							break;
						case "list":
							pagedWriter.writeEndElement(); // ul or ol
							pagedWriter.write("\n");
							break
						case "item":
							pagedWriter.writeEndElement(); // li
							pagedWriter.write("\n");
							break;
						case "hi":
							pagedWriter.writeEndElement(); // b
							break;
						case "emph":
							pagedWriter.writeEndElement(); // i
							break;
						case "table":
							pagedWriter.writeEndElement(); // table
							pagedWriter.write("\n");
							break;
						case "row":
							pagedWriter.writeEndElement(); // tr
							break;
						case "cell":
							pagedWriter.writeEndElement(); // td
							break;
						case "ref":
							pagedWriter.writeEndElement() // </a>
							break;
						case "form":
							flagform = false
							break;
						case "ana":
							flaginterp = false
							break;
						case wordTag:
							int l = lastword.length();
							String endOfLastWord = "";
							if (l > 0)
								endOfLastWord = lastword.substring(l-1);

							// the word is glued to the previous one when a spacing rule applies
							boolean glued = NoSpaceBefore.contains(wordvalue) ||
									NoSpaceAfter.contains(lastword) ||
									wordvalue.startsWith("-") ||
									NoSpaceAfter.contains(endOfLastWord)
							if (!glued) {
								pagedWriter.writeCharacters("\n");
							}
							pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
							pagedWriter.writeCharacters(wordvalue);
							pagedWriter.writeEndElement();
							lastword=wordvalue;
							break;
						default:
							if (noteElements.contains(localname)) {
								flagNote = false;
								if (noteContent.length() > 0) {
									// numbered reference in the text; the note itself is written when the page is closed
									notes << noteContent;
									pagedWriter.writeStartElement("a", ["href":"#note_"+notes.size(), "name":"noteref_"+notes.size(), "title":noteContent]);
									pagedWriter.writeStartElement("sup")
									pagedWriter.writeCharacters(""+notes.size())
									pagedWriter.writeEndElement() // </sup>
									pagedWriter.writeEndElement() // </a>
								}
							}
							break;
					}
					break;
				case XMLStreamConstants.CHARACTERS:
					String text = parser.getText()
					if (text.length() == 0) break; // nothing to record

					if (flagform) {
						wordvalue += text;
						if (flagNote) noteContent += text.replace("\n", " ");
					} else if (flaginterp) {
						interpvalue += text;
					} else if (flagNote) {
						noteContent += text.replace("\n", " ");
					} else if (writeOutOfTextToEditText) {
						pagedWriter.writeCharacters(text)
					}
					break;
			}
		}
		closeMultiWriter();
		if (parser != null) parser.close();
		if (inputData != null) inputData.close();
	} catch(Exception e) {
		// null-safe accesses: the failure must not be masked by a second exception raised here
		println "** Fail to build $infile edition: $e at "+parser?.getLocation()
		println "** resulting file: $outfile"
		println "** Stax stack: "+pagedWriter?.getTagStack()
		e.printStackTrace();
		pagedWriter?.close()
		if (parser != null) parser.close();
		if (inputData != null) inputData.close();
		return false;
	}
	return true;
}
591 |
} |