Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZDefaultPagerStep.groovy @ 1688

History | View | Annotate | Download (16.8 kB)

1
package org.txm.scripts.importer.xtz;
2

    
3
import java.io.File;
4
import java.io.OutputStreamWriter;
5
import java.util.ArrayList;
6
import java.util.List;
7

    
8
import javax.xml.stream.*;
9

    
10
import org.txm.scripts.importer.StaxStackWriter;
11
import org.eclipse.ui.part.PageSwitcher
12
import org.txm.importer.xtz.*
13

    
14
public class XTZDefaultPagerStep {
15

    
16
        List<String> NoSpaceBefore;
17

    
18
        /** The No space after. */
19
        List<String> NoSpaceAfter;
20

    
21
        /** The wordcount. */
22
        int wordcount = 0;
23

    
24
        /** The pagecount. */
25
        int pagecount = 0;
26

    
27
        /** The wordmax. */
28
        int wordmax = 0;
29

    
30
        /** The basename. */
31
        String basename = "";
32
        String txtname = "";
33
        File outdir;
34

    
35
        /** The wordid. */
36
        String wordid;
37

    
38
        /** The first word. */
39
        boolean firstWord = true;
40

    
41
        /** The wordvalue. */
42
        String wordvalue = "";
43

    
44
        /** The interpvalue. */
45
        String interpvalue = "";
46

    
47
        /** The lastword. */
48
        String lastword = " ";
49

    
50
        /** The wordtype. */
51
        String wordtype;
52

    
53
        /** The flagform. */
54
        boolean flagform = false;
55

    
56
        /** The flaginterp. */
57
        boolean flaginterp = false;
58

    
59
        /** The url. */
60
        private def url;
61

    
62
        /** The input data. */
63
        private def inputData;
64

    
65
        /** The factory. */
66
        private def factory;
67

    
68
        /** The parser. */
69
        private XMLStreamReader parser;
70

    
71
        /** The writer. */
72
        OutputStreamWriter writer;
73

    
74
        /** The pagedWriter. */
75
        StaxStackWriter pagedWriter = null;
76

    
77
        /** The infile. */
78
        File infile;
79

    
80
        /** The outfile. */
81
        File outfile;
82

    
83
        /** The pages. */
84
        //TODO enhance this to store the page name/id as well
85
        ArrayList<File> pages = new ArrayList<File>();
86

    
87
        /** The idxstart. */
88
        ArrayList<String> idxstart = new ArrayList<String>();
89
        String paginationElement;
90
        def cssList;
91
        def wordTag = "w";
92
        def noteElements = new HashSet<String>();
93
        def outOfTextElements = new HashSet<String>();
94
        XTZPager pager;
95

    
96
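        /**
         * Instantiates a new pager step.
         *
         * @param pager the pager; its page element, corpus name, output directory, words per page and word tag are read here
         * @param infile the XML file to paginate
         * @param txtname the text name, used as prefix of the generated page file names
         * @param NoSpaceBefore the no space before
         * @param NoSpaceAfter the no space after
         * @param cssList the stylesheets linked from each generated page
         */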
106
        public XTZDefaultPagerStep(XTZPager pager, File infile, String txtname, List<String> NoSpaceBefore,
        List<String> NoSpaceAfter, def cssList) {
                // values handed over directly by the caller
                this.pager = pager;
                this.infile = infile;
                this.txtname = txtname;
                this.cssList = cssList;
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;

                // settings read from the pager
                this.paginationElement = pager.page_element;
                this.basename = pager.corpusname;
                this.outdir = pager.outputDirectory;
                this.wordmax = pager.wordsPerPage;
                this.wordTag = pager.wordTag;
                this.outdir.mkdirs()

                // open a StAX reader on the input file
                this.url = infile.toURI().toURL();
                this.inputData = new BufferedInputStream(this.url.openStream());
                this.factory = XMLInputFactory.newInstance();
                this.parser = this.factory.createXMLStreamReader(this.inputData);

                // comma separated element names whose content is handled as notes
                String noteNames = pager.getImportModule().getProject().getTextualPlan("Note")
                if (noteNames != null) {
                        noteElements.addAll(Arrays.asList(noteNames.split(",")))
                }

                // comma separated names of the out-of-text elements whose content is kept
                String keptOutsideNames = pager.getImportModule().getProject().getTextualPlan("OutSideTextTagsAndKeepContent")
                if (keptOutsideNames != null) {
                        outOfTextElements.addAll(Arrays.asList(keptOutsideNames.split(",")))
                }

                // process() is not started here: the caller has to invoke it
        }
134

    
135
        /**
         * Looks up an attribute of the element the parser is currently positioned on.
         *
         * @param parser the reader; getAttributeCount(), getAttributeLocalName(int) and getAttributeValue(int) are called on it
         * @param ns not used: attributes are matched on their local name only
         * @param name the local name of the wanted attribute
         * @return the value of the first attribute whose local name is {@code name}, or "" when there is none
         */
        public String getAttributeValue(def parser, String ns, String name) {
                int position = 0
                while (position < parser.getAttributeCount()) {
                        String candidate = parser.getAttributeLocalName(position)
                        if (candidate == name) {
                                return parser.getAttributeValue(position).toString()
                        }
                        position++
                }
                // no attribute with that name: the empty string, never null
                return ""
        }
143

    
144
        /**
         * Finishes the page currently being written, if there is one.
         *
         * A page that received no word gets a placeholder "&lt;wordTag&gt;_0" span and the
         * same id is registered in idxstart. The open elements are then closed, the pending
         * notes are appended and cleared, and the writer is closed.
         *
         * @return the copy of the writer tag stack taken before closing, minus everything up to
         *         and including its first "div" entry; createNextOutput() passes it to
         *         writeStartElements() on the next page. An empty list when no page was open.
         */
        private def closeMultiWriter() {
                if (pagedWriter == null) {
                        return [];
                }

                // copy of the open tags, taken before writeEndElements() is called
                def tags = pagedWriter.getTagStack().clone();

                if (firstWord) { // there was no words
                        // plain String on purpose: a GString added to the ArrayList<String> would not
                        // be converted and would break typed consumers of getIdx()
                        String placeholderId = "${wordTag}_0".toString()
                        pagedWriter.writeCharacters("");
                        this.idxstart.add(placeholderId)
                        pagedWriter.write("<span id=\"" + placeholderId + "\"/>");
                }
                pagedWriter.writeEndElements();

                // write notes
                if (notes.size() > 0) {
                        pagedWriter.writeEmptyElement("hr", ["id":"notes", "width":"20%", "align":"left"]);
                        int noteNumber = 1;
                        for (String note : notes) {
                                // numbered anchor pointing back to "#noteref_<n>"
                                pagedWriter.writeStartElement("a", ["href":"#noteref_"+noteNumber, "name":"note_"+noteNumber]);
                                pagedWriter.writeStartElement("sup")
                                pagedWriter.writeCharacters(""+noteNumber)
                                pagedWriter.writeEndElement() // </sup>
                                pagedWriter.writeEndElement() // </a>
                                pagedWriter.writeCharacters(note)
                                pagedWriter.writeEmptyElement("br")
                                noteNumber++;
                        }
                        notes.clear()
                }

                pagedWriter.close();

                // remove elements until first "div" tag (included); everything is removed when there is none.
                // NOTE(review): presumably this drops the html/body/div page wrapper opened by
                // createNextOutput() - confirm against the order of StaxStackWriter.getTagStack()
                while (tags.size() > 0) {
                        String tag = tags.remove(0)
                        if (tag == "div") {
                                break;
                        }
                }

                return tags;
        }
196

    
197
        /**
198
         * Creates the next output.
199
         *
200
         * @return true, if successful
201
         */
202
        private boolean createNextOutput() {
                // Closes the current page if any, then opens "<txtname>_<pagecount>.html" in outdir,
                // writes the page header and reopens the elements returned by closeMultiWriter().
                // Any exception is printed and reported as a false return value.
                wordcount = 0;
                try {
                        def tags = closeMultiWriter();

                        outfile = new File(outdir, txtname+"_"+(++pagecount)+".html")
                        pages.add(outfile);
                        firstWord = true; // waiting for next word

                        pagedWriter = new StaxStackWriter(outfile, "UTF-8");

                        pagedWriter.writeDTD("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeStartElement("html");
                        pagedWriter.writeCharacters("\n")

                        // <head> is opened first: meta and link are metadata and belong inside it,
                        // not directly under <html>
                        pagedWriter.writeStartElement("head");
                        pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
                        for (String css : cssList) {
                                // the String itself is passed, not a GString built around it
                                pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":css]);
                        }
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
                        pagedWriter.writeEndElement(); // </title>
                        pagedWriter.writeEndElement() // </head>
                        pagedWriter.writeCharacters("\n")

                        pagedWriter.writeStartElement("body") //<body>
                        pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div>
                        // reopen what was still open when the previous page was closed
                        pagedWriter.writeStartElements(tags);
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        e.printStackTrace()
                        return false;
                }
        }
239

    
240
        /**
         * Creates the output.
         *
         * @return true, if successful
         */
246
        private boolean createOutput() {
                // delegates to createNextOutput(); an exception escaping it is printed and reported as a failure
                boolean created
                try {
                        created = createNextOutput()
                } catch (Exception failure) {
                        System.out.println(failure.getLocalizedMessage())
                        created = false
                }
                return created
        }
254

    
255
        /**
256
         * Gets the page files.
257
         *
258
         * @return the page files
259
         */
260
        public ArrayList<File> getPageFiles() {
                // the list itself is returned, not a copy
                return this.pages
        }
263

    
264
        /**
265
         * Gets the idx.
266
         *
267
         * @return the idx
268
         */
269
        public ArrayList<String> getIdx() {
                // the list itself is returned, not a copy
                return this.idxstart
        }
272

    
273
        /**
274
         * Go to text.
275
         */
276
        private void goToText() {
                // consume events up to and including the closing tag of teiHeader;
                // when there is none the reader is left on END_DOCUMENT
                int event = parser.next()
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")) {
                                return
                        }
                        event = parser.next()
                }
        }
283

    
284
        def notes = []
285
        def currentOutOfTextElements = [] // stack of element with out of text to edit opened element
286
        def writeOutOfTextToEditText = false
287
        /**
288
         * Process.
289
         */
290
        public boolean process() {
291

    
292
                try {
293
                        boolean flagNote = false;
294
                        String noteContent = "";
295
                        String rend = ""
296
                        goToText();
297

    
298
                        String localname = "";
299
                        if (!createNextOutput()) {
300
                                return false;
301
                        }
302

    
303
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
304
                                rend = "";
305
                                switch (event) {
306
                                        case XMLStreamConstants.START_ELEMENT:
307
                                                localname = parser.getLocalName();
308
                                                if (outOfTextElements.contains(localname)) {
309
                                                        currentOutOfTextElements << localname
310
                                                        writeOutOfTextToEditText = true;
311
                                                } else if (currentOutOfTextElements.size() > 0) {
312
                                                        currentOutOfTextElements << localname
313
                                                }
314

    
315
                                                if (localname == paginationElement) {
316
                                                        createNextOutput();
317
                                                        wordcount=0;
318
                                                        pagedWriter.write("\n");
319
                                                        if (getAttributeValue(parser, null,"n") != null) {
320
                                                                pagedWriter.writeElement("p", ["class":"txmeditionpb", "align":"center"], getAttributeValue(parser, null,"n"))
321
                                                        }
322
                                                }
323

    
324
                                                rend = getAttributeValue(parser, null, "rend")
325
                                                if (rend == null) rend = "";
326
                                                switch (localname) {
327
                                                        case "text":
328
                                                                LinkedHashMap attributes = new LinkedHashMap();
329
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
330
                                                                        attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
331
                                                                }
332

    
333
                                                                pagedWriter.write("\n");
334
                                                                pagedWriter.writeStartElement("p")
335
                                                                pagedWriter.writeAttribute("class", rend);
336
                                                                if (attributes.containsKey("id")) {
337
                                                                        pagedWriter.writeElement("h3", attributes["id"])
338
                                                                }
339

    
340
                                                                pagedWriter.writeStartElement("table");
341
                                                                for (String k : attributes.keySet()) {
342
                                                                        if (k == "id") continue;
343
                                                                        if (k == "rend") continue;
344

    
345
                                                                        pagedWriter.writeStartElement("tr");
346
                                                                        pagedWriter.writeElement("td", k);
347
                                                                        pagedWriter.writeElement("td", attributes[k]);
348
                                                                        pagedWriter.writeEndElement(); //tr
349
                                                                }
350
                                                                pagedWriter.writeEndElement() // table
351
                                                                pagedWriter.writeEndElement() // p
352
                                                                pagedWriter.writeCharacters("\n");
353
                                                                break;
354
                                                        case "ref":
355
                                                                pagedWriter.writeStartElement("a")
356
                                                                pagedWriter.writeAttribute("href", getAttributeValue(parser, null, "target"));
357
                                                                pagedWriter.writeAttribute("target", "_blank");
358
                                                                pagedWriter.writeAttribute("class", rend);
359
                                                                break;
360
                                                        case "head":
361
                                                                pagedWriter.write("\n");
362
                                                                pagedWriter.writeStartElement("h2", ["class":rend])
363
                                                                break;
364
                                                        case "graphic":
365
                                                                pagedWriter.write("\n");
366
                                                                String url = getAttributeValue(parser, null, "url")
367
                                                                if (url != null) {
368
                                                                        // TEI <graphic rend="left-image" url="image.png"/> -> <center class="left-image"><img href="image.png"/></center> + <moncorpus>.css avec rule ".left-image"
369
                                                                        pagedWriter.writeStartElement("center", ["class":rend]) // css -> .<rend> { ... } styles OR
370
                                                                        pagedWriter.writeEmptyElement("img", ["src":url, "align":"middle"])
371
                                                                        pagedWriter.writeEndElement() // center
372
                                                                }
373
                                                                break;
374
                                                        case "table":
375
                                                                pagedWriter.writeStartElement("table", ["class":rend])
376
                                                                pagedWriter.write("\n");
377
                                                                break;
378
                                                        case "row":
379
                                                                pagedWriter.writeStartElement("tr", ["class":rend])
380
                                                                break;
381
                                                        case "cell":
382
                                                                pagedWriter.writeStartElement("td", ["class":rend])
383
                                                                break;
384
                                                        case "list":
385
                                                                String type = getAttributeValue(parser, null,"type");
386
                                                                if ("unordered" == type) {
387
                                                                        pagedWriter.writeStartElement("ul", ["class":rend])
388
                                                                } else {
389
                                                                        pagedWriter.writeStartElement("ol", ["class":rend])
390
                                                                }
391
                                                                break
392
                                                        case "item":
393
                                                                pagedWriter.writeStartElement("li", ["class":rend])
394
                                                                break;
395
                                                        case "hi":
396
                                                        case "emph":
397
                                                                if ("i".equals(rend) || "italic".equals(rend)) {
398
                                                                        pagedWriter.writeStartElement("i", ["class":rend])
399
                                                                } else if ("b".equals(rend) || "bold".equals(rend)) {
400
                                                                        pagedWriter.writeStartElement("b", ["class":rend])
401
                                                                } else {
402
                                                                        if ("emph".equals(localname)) {
403
                                                                                pagedWriter.writeStartElement("i", ["class":rend])
404
                                                                        } else { // hi
405
                                                                                pagedWriter.writeStartElement("b", ["class":rend])
406
                                                                        }
407
                                                                }
408
                                                                break;
409
                                                        case "p":
410
                                                        //case "lg":
411
                                                                pagedWriter.write("\n");
412
                                                                pagedWriter.writeStartElement("p", ["class":rend])
413
                                                                break;
414
                                                        case "div":
415
                                                        case "div1":
416
                                                        case "div2":
417
                                                        case "div3":
418
                                                        case "div4":
419
                                                        case "div5":
420
                                                                pagedWriter.writeStartElement("div", ["class":rend, "type":localname])
421
                                                                break;
422
                                                        case "lb":
423
                                                        //case "l":
424
                                                                pagedWriter.writeEmptyElement("br", ["class":rend])
425
                                                                break;
426
                                                        case wordTag:
427
                                                                wordid = getAttributeValue(parser, null,"id");
428

    
429
                                                                wordcount++;
430
                                                                if (wordcount >= wordmax) {
431
                                                                        createNextOutput();
432
                                                                }
433

    
434
                                                                if (firstWord) {
435
                                                                        firstWord = false;
436
                                                                        this.idxstart.add(wordid);
437
                                                                }
438

    
439
                                                                break;
440
                                                        case "ana":
441
                                                                flaginterp=true;
442
                                                                interpvalue+=" "+getAttributeValue(parser, null, "type").substring(1)+":"
443
                                                                break;
444
                                                        case "form":
445
                                                                wordvalue="";
446
                                                                interpvalue ="";
447
                                                                flagform=true;
448
                                                                break;
449
                                                        default:
450
                                                                if (noteElements.contains(localname)) {
451
                                                                        flagNote = true;
452
                                                                        noteContent = ""
453
                                                                }
454
                                                        //                                                         else {
455
                                                        //                                                                pagedWriter.writeStartElement("span", ["class":localname])
456
                                                        //                                                        }
457
                                                                break;
458
                                                }
459
                                                break;
460
                                        case XMLStreamConstants.END_ELEMENT:
461
                                                localname = parser.getLocalName();
462
                                                if (currentOutOfTextElements.size() > 0) currentOutOfTextElements.pop()
463
                                                writeOutOfTextToEditText = currentOutOfTextElements.size() > 0
464

    
465
                                                switch (localname) {
466
                                                        case "text":
467
                                                                break;
468
                                                        case "p":
469
                                                        //case "lg":
470
                                                                pagedWriter.writeEndElement() // </p>
471
                                                                pagedWriter.write("\n");
472
                                                                break;
473
                                                        case "div":
474
                                                        case "div1":
475
                                                        case "div2":
476
                                                        case "div3":
477
                                                        case "div4":
478
                                                        case "div5":
479
                                                                pagedWriter.writeEndElement() // </div>
480
                                                                pagedWriter.write("\n");
481
                                                                break;
482
                                                        case "head":
483
                                                                pagedWriter.writeEndElement() // </h2>
484
                                                                pagedWriter.write("\n");
485
                                                                break;
486
                                                        case "list":
487
                                                                pagedWriter.writeEndElement(); // ul or ol
488
                                                                pagedWriter.write("\n");
489
                                                                break
490
                                                        case "item":
491
                                                                pagedWriter.writeEndElement(); // li
492
                                                                pagedWriter.write("\n");
493
                                                                break;
494
                                                        case "hi":
495
                                                                pagedWriter.writeEndElement(); // b
496
                                                                break;
497
                                                        case "emph":
498
                                                                pagedWriter.writeEndElement(); // i
499
                                                                break;
500
                                                        case "table":
501
                                                                pagedWriter.writeEndElement(); // table
502
                                                                pagedWriter.write("\n");
503
                                                                break;
504
                                                        case "row":
505
                                                                pagedWriter.writeEndElement(); // tr
506
                                                                break;
507
                                                        case "cell":
508
                                                                pagedWriter.writeEndElement(); // td
509
                                                                break;
510
                                                        case "ref":
511
                                                                pagedWriter.writeEndElement() // </a>
512
                                                                break;
513
                                                        case "form":
514
                                                                flagform = false
515
                                                                break;
516
                                                        case "ana":
517
                                                                flaginterp = false
518
                                                                break;
519
                                                        case wordTag:
520
                                                                int l = lastword.length();
521
                                                                String endOfLastWord = "";
522
                                                                if (l > 0)
523
                                                                        endOfLastWord = lastword.subSequence(l-1, l);
524

    
525
                                                                if (interpvalue != null)
526
                                                                        interpvalue = interpvalue;
527

    
528
                                                                if (NoSpaceBefore.contains(wordvalue) ||
529
                                                                NoSpaceAfter.contains(lastword) ||
530
                                                                wordvalue.startsWith("-") ||
531
                                                                NoSpaceAfter.contains(endOfLastWord)) {
532
                                                                        pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
533
                                                                } else {
534
                                                                        pagedWriter.writeCharacters("\n");
535
                                                                        pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
536
                                                                }
537

    
538
                                                                pagedWriter.writeCharacters(wordvalue);
539
                                                                pagedWriter.writeEndElement();
540
                                                                //pagedWriter.writeComment("\n")
541
                                                                lastword=wordvalue;
542
                                                                break;
543
                                                        default:
544
                                                                if (noteElements.contains(localname)) {
545
                                                                        flagNote = false;
546
                                                                        if (noteContent.length() > 0) {
547
                                                                                notes << noteContent;
548
                                                                                pagedWriter.writeStartElement("a", ["href":"#note_"+notes.size(), "name":"noteref_"+notes.size(), "title":noteContent]);
549
                                                                                pagedWriter.writeStartElement("sup")
550
                                                                                pagedWriter.writeCharacters(""+notes.size())
551
                                                                                pagedWriter.writeEndElement() // </sup>
552
                                                                                pagedWriter.writeEndElement() // </a>
553
                                                                        }
554
                                                                }
555
                                                        //                                                        else {
556
                                                        //                                                                pagedWriter.writeEndElement() // the element
557
                                                        //                                                        }
558
                                                                break;
559
                                                }
560
                                                break;
561
                                        case XMLStreamConstants.CHARACTERS:
562
                                                if (flagform && parser.getText().length() > 0) {
563
                                                        wordvalue+=(parser.getText());
564
                                                        if (flagNote == parser.getText().length() > 0)
565
                                                                noteContent += parser.getText().replace("\n", " ");
566
                                                } else        if (flaginterp && parser.getText().length() > 0) {
567
                                                        interpvalue+=(parser.getText());
568
                                                } else if (flagNote == parser.getText().length() > 0) {
569
                                                        noteContent += parser.getText().replace("\n", " ");
570
                                                } else if (writeOutOfTextToEditText) {
571
                                                        pagedWriter.writeCharacters(parser.getText())
572
                                                }
573
                                                break;
574
                                }
575
                        }
576
                        closeMultiWriter();
577
                        if (parser != null) parser.close();
578
                        if (inputData != null) inputData.close();
579
                } catch(Exception e) {
580
                        println "** Fail to build $infile edition: $e at "+parser.getLocation()
581
                        println "** resulting file: $outfile"
582
                        println "** Stax stack: "+pagedWriter.getTagStack()
583
                        e.printStackTrace();
584
                        pagedWriter.close()
585
                        if (parser != null) parser.close();
586
                        if (inputData != null) inputData.close();
587
                        return false;
588
                }
589
                return true;
590
        }
591
}