Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZDefaultPagerStep.groovy @ 1137

History | View | Annotate | Download (15.8 kB)

1
package org.txm.scripts.importer.xtz;
2

    
3
import java.io.File;
4
import java.io.OutputStreamWriter;
5
import java.util.ArrayList;
6
import java.util.List;
7

    
8
import javax.xml.stream.*;
9

    
10
import org.txm.scripts.importer.StaxStackWriter;
11
import org.eclipse.ui.part.PageSwitcher
12
import org.txm.importer.xtz.*
13

    
14
/**
 * Builds the paginated HTML edition of one XML-TEI text.
 *
 * The input file is read with a StAX parser. Everything up to the end of the
 * <code>teiHeader</code> element is skipped, then the remaining events are
 * translated to HTML and written to page files named
 * <code>&lt;txtname&gt;_&lt;page number&gt;.html</code> in the output directory.
 * A new page is started on every pagination element and every time the number
 * of words written on the current page reaches <code>wordmax</code>.
 * The identifier of the first word of each page is recorded in {@link #getIdx()}.
 */
public class XTZDefaultPagerStep {

	/** Word forms that are not preceded by a space. */
	List<String> NoSpaceBefore;

	/** Word forms (or final characters) that are not followed by a space. */
	List<String> NoSpaceAfter;

	/** Number of words seen on the current page. */
	int wordcount = 0;

	/** Number of pages created so far. */
	int pagecount = 0;

	/** Number of words after which a new page is started. */
	int wordmax = 0;

	/** The corpus name, used in the page titles. */
	String basename = "";

	/** The text name, used as prefix of the page file names. */
	String txtname = "";

	/** The directory receiving the page files. */
	File outdir;

	/** The id of the word being read. */
	String wordid;

	/** True until the first word of the current page has been seen. */
	boolean firstWord = true;

	/** The form of the word being read. */
	String wordvalue = "";

	/** The annotations of the word being read, shown as the word tooltip. */
	String interpvalue = "";

	/** The form of the previously written word. */
	String lastword = " ";

	/** The wordtype. */
	String wordtype;

	/** True while the parser is inside a "form" element. */
	boolean flagform = false;

	/** True while the parser is inside an "ana" element. */
	boolean flaginterp = false;

	/** The URL of the input file. */
	private def url;

	/** The input stream opened on the input file. */
	private def inputData;

	/** The StAX factory. */
	private def factory;

	/** The StAX parser reading the input file. */
	private XMLStreamReader parser;

	/** The writer. */
	OutputStreamWriter writer;

	/** The writer of the page currently being built, null before the first page. */
	StaxStackWriter pagedWriter = null;

	/** The input file. */
	File infile;

	/** The outfile. */
	File outfile;

	/** The page files, in creation order. */
	//TODO enhance this to store the page name/id as well
	ArrayList<File> pages = new ArrayList<File>();

	/** The id of the first word of each page. */
	ArrayList<String> idxstart = new ArrayList<String>();

	/** Local name of the element that starts a new page. */
	String paginationElement;

	/** The CSS hrefs linked from every page. */
	def cssList;

	/** Local name of the word element. */
	def wordTag = "w";

	/** Local names of the elements rendered as notes. */
	def noteElements = new HashSet<String>();

	/** Local names of the elements whose text content is copied as is to the page. */
	def outOfTextElements = new HashSet<String>();

	/** The pager providing the parameters of this step. */
	XTZPager pager;

	/** Contents of the notes met on the current page. */
	def notes = []

	/** Stack of the elements opened since the outermost "out of text" element. */
	def currentOutOfTextElements = []

	/** True while raw character data must be copied to the page. */
	def writeOutOfTextToEditText = false

	/**
	 * Instantiates a new pager step and opens the input file.
	 *
	 * The pagination element, corpus name, output directory, words per page and
	 * word tag are read from the pager. The note elements and the "out of text"
	 * elements are read from the "Note" and "OutSideTextTagsAndKeepContent"
	 * textual plans of the pager's project (comma separated lists).
	 *
	 * @param pager the pager providing the parameters
	 * @param infile the XML-TEI file to paginate
	 * @param txtname the text name, used as prefix of the page file names
	 * @param NoSpaceBefore the word forms not preceded by a space
	 * @param NoSpaceAfter the word forms not followed by a space
	 * @param cssList the CSS hrefs to link from every page
	 */
	public XTZDefaultPagerStep(XTZPager pager, File infile, String txtname, List<String> NoSpaceBefore,
	List<String> NoSpaceAfter, def cssList) {
		this.pager = pager;
		this.paginationElement = pager.page_element;
		this.cssList = cssList;
		this.basename = pager.corpusname;
		this.txtname = txtname;
		this.outdir = pager.outputDirectory;
		this.wordmax = pager.wordsPerPage;
		this.NoSpaceBefore = NoSpaceBefore;
		this.NoSpaceAfter = NoSpaceAfter;
		this.url = infile.toURI().toURL();
		this.infile = infile;
		this.wordTag = pager.wordTag;
		outdir.mkdirs()

		inputData = new BufferedInputStream(url.openStream());
		factory = XMLInputFactory.newInstance();
		parser = factory.createXMLStreamReader(inputData);

		String notesListString = pager.getImportModule().getProject().getTextualPlan("Note")
		if (notesListString != null) {
			for (def s : notesListString.split(",")) noteElements << s;
		}

		String elems = pager.getImportModule().getProject().getTextualPlan("OutSideTextTagsAndKeepContent")
		if (elems != null) {
			for (def s : elems.split(",")) outOfTextElements << s;
		}

		//process();
	}

	/**
	 * Gets the value of an attribute of the current element.
	 *
	 * @param parser the parser positioned on a start element
	 * @param ns ignored: attributes are matched on their local name only
	 * @param name the local name of the attribute
	 * @return the attribute value, or an empty string (never null) if the
	 * element has no such attribute
	 */
	public String getAttributeValue(def parser, String ns, String name) {
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			if (name == parser.getAttributeLocalName(i)) {
				return parser.getAttributeValue(i).toString()
			}
		}
		return "";
	}

	/**
	 * Finishes and closes the current page, if any.
	 *
	 * If the page contains no word, a placeholder span is written and its id is
	 * recorded as the first word id of the page. The notes collected on the
	 * page are then written and the note list is cleared.
	 *
	 * @return a copy of the tag stack taken before the page was closed, or an
	 * empty list if no page was open
	 */
	private def closeMultiWriter() {
		if (pagedWriter == null) {
			return [];
		}

		def tags = pagedWriter.getTagStack().clone();

		if (firstWord) { // there was no words
			pagedWriter.writeCharacters("");
			this.idxstart.add("${wordTag}_0")
			pagedWriter.write("<span id=\"${wordTag}_0\"/>");
		}
		pagedWriter.writeEndElements();

		// NOTE(review): the notes are written after writeEndElements(), which
		// presumably closes every open element including <html> -- confirm
		// against StaxStackWriter whether the notes end up outside the root.
		if (notes.size() > 0) {
			pagedWriter.writeEmptyElement("hr", ["id":"notes", "width":"20%", "align":"left"]);
			int i = 1;
			for (String note : notes) {
				pagedWriter.writeStartElement("a", ["href":"#noteref_"+i, "name":"note_"+i]);
				pagedWriter.writeStartElement("sup")
				pagedWriter.writeCharacters(""+i)
				pagedWriter.writeEndElement() // </sup>
				pagedWriter.writeEndElement() // </a>
				pagedWriter.writeCharacters(note)
				pagedWriter.writeEmptyElement("br")
				i++;
			}
			notes.clear()
		}

		pagedWriter.close();
		return tags;
	}

	/**
	 * Closes the current page and opens the next one.
	 *
	 * The word counter is reset, the new page file is registered in the page
	 * list and the HTML prologue is written. The elements that were open in the
	 * previous page, except those up to and including the first "div" of the
	 * tag stack (the page skeleton), are reopened in the new page.
	 *
	 * @return true if the page was created, false if an exception occurred
	 * (the exception is printed, not rethrown)
	 */
	private boolean createNextOutput() {
		wordcount = 0;
		try {
			def tags = closeMultiWriter();

			// drop the page skeleton: every tag up to and including the first "div"
			while (!tags.isEmpty()) {
				String tag = tags.remove(0)
				if ("div" == tag) break;
			}

			File outfile = new File(outdir, txtname+"_"+(++pagecount)+".html")
			pages.add(outfile);
			firstWord = true; // waiting for next word

			pagedWriter = new StaxStackWriter(outfile, "UTF-8");

			//pagedWriter.writeStartDocument()
			pagedWriter.writeDTD("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
			pagedWriter.writeCharacters("\n")
			pagedWriter.writeStartElement("html");
			pagedWriter.writeCharacters("\n")
			// the metadata and the style sheets belong inside <head>
			pagedWriter.writeStartElement("head");
			pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
			for (String css : cssList) {
				pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"$css"]);
			}
			pagedWriter.writeStartElement("title")
			pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
			pagedWriter.writeEndElement(); // </title>
			pagedWriter.writeEndElement() // </head>
			pagedWriter.writeCharacters("\n")
			pagedWriter.writeStartElement("body") //<body>
			pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div>
			pagedWriter.writeStartElements(tags);
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			e.printStackTrace()
			return false;
		}
	}

	/**
	 * Creates the output: same as {@link #createNextOutput()}.
	 *
	 * @return true, if successful
	 */
	private boolean createOutput() {
		// createNextOutput() already catches and reports every Exception
		return createNextOutput();
	}

	/**
	 * Gets the page files.
	 *
	 * @return the page files, in creation order
	 */
	public ArrayList<File> getPageFiles() {
		return pages;
	}

	/**
	 * Gets the idx.
	 *
	 * @return the id of the first word of each page
	 */
	public ArrayList<String> getIdx() {
		return idxstart;
	}

	/**
	 * Moves the parser just after the end of the "teiHeader" element.
	 * If the document has no such end tag, the whole document is consumed.
	 */
	private void goToText() {
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.END_ELEMENT && "teiHeader" == parser.getLocalName()) {
				return;
			}
		}
	}

	/**
	 * Reads the input file and writes the HTML pages.
	 *
	 * The parser and the input stream are closed when the method returns,
	 * so it can be called only once per instance.
	 *
	 * @return true when the whole document has been processed, false if a page
	 * file could not be created
	 */
	public boolean process() {
		boolean flagNote = false;
		String noteContent = "";
		String rend = ""
		String localname = "";

		try {
			goToText();

			if (!createNextOutput()) return false;

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				rend = "";
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						localname = parser.getLocalName();
						if (outOfTextElements.contains(localname)) {
							currentOutOfTextElements << localname
							writeOutOfTextToEditText = true;
						} else if (currentOutOfTextElements.size() > 0) {
							// track the depth inside the "out of text" element
							currentOutOfTextElements << localname
						}

						if (localname == paginationElement) {
							if (!createNextOutput()) return false;
							pagedWriter.write("\n");
							// getAttributeValue() returns "" (not null) when the attribute is missing
							String pageNumber = getAttributeValue(parser, null, "n")
							if (pageNumber.length() > 0) {
								pagedWriter.writeElement("p", ["class":"txmeditionpb", "align":"center"], pageNumber)
							}
						}

						rend = getAttributeValue(parser, null, "rend")
						switch (localname) {
							case "text":
								// the text attributes are displayed as a title (id) and a 2 columns table
								LinkedHashMap attributes = new LinkedHashMap();
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
									attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
								}

								pagedWriter.write("\n");
								pagedWriter.writeStartElement("p")
								pagedWriter.writeAttribute("class", rend);
								if (attributes.containsKey("id")) {
									pagedWriter.writeElement("h3", attributes["id"])
								}

								pagedWriter.writeStartElement("table");
								for (String k : attributes.keySet()) {
									if (k == "id") continue;
									if (k == "rend") continue;

									pagedWriter.writeStartElement("tr");
									pagedWriter.writeElement("td", k);
									pagedWriter.writeElement("td", attributes[k]);
									pagedWriter.writeEndElement();
								}
								pagedWriter.writeEndElement() // table
								pagedWriter.writeEndElement() // p

								pagedWriter.writeCharacters("");
								break;
							case "ref":
								pagedWriter.writeStartElement("a")
								pagedWriter.writeAttribute("href", getAttributeValue(parser, null, "target"));
								pagedWriter.writeAttribute("target", "_blank");
								pagedWriter.writeAttribute("class", rend);
								break;
							case "head":
								pagedWriter.write("\n");
								pagedWriter.writeStartElement("h2", ["class":rend])
								break;
							case "graphic":
								pagedWriter.write("\n");
								String graphicUrl = getAttributeValue(parser, null, "url")
								if (graphicUrl.length() > 0) {
									// TEI <graphic rend="left-image" url="image.png"/> -> <center class="left-image"><img src="image.png"/></center>
									// + a ".left-image" rule in the <mycorpus>.css file
									pagedWriter.writeStartElement("center", ["class":rend])
									pagedWriter.writeEmptyElement("img", ["src":graphicUrl, "align":"middle"])
									pagedWriter.writeEndElement() // </center>
								}
								break;
							case "table":
								pagedWriter.writeStartElement("table", ["class":rend])
								pagedWriter.write("\n");
								break;
							case "row":
								pagedWriter.writeStartElement("tr", ["class":rend])
								break;
							case "cell":
								pagedWriter.writeStartElement("td", ["class":rend])
								break;
							case "list":
								String listType = getAttributeValue(parser, null, "type");
								if ("unordered" == listType) {
									pagedWriter.writeStartElement("ul", ["class":rend])
								} else {
									pagedWriter.writeStartElement("ol", ["class":rend])
								}
								break;
							case "item":
								pagedWriter.writeStartElement("li", ["class":rend])
								break;
							case "hi":
							case "emph":
								if ("i".equals(rend) || "italic".equals(rend)) {
									pagedWriter.writeStartElement("i", ["class":rend])
								} else if ("b".equals(rend) || "bold".equals(rend)) {
									pagedWriter.writeStartElement("b", ["class":rend])
								} else if ("emph".equals(localname)) { // default rendering of emph
									pagedWriter.writeStartElement("i", ["class":rend])
								} else { // default rendering of hi
									pagedWriter.writeStartElement("b", ["class":rend])
								}
								break;
							case "p":
							//case "lg":
								pagedWriter.write("\n");
								pagedWriter.writeStartElement("p", ["class":rend])
								break;
							case "div":
							case "div1":
							case "div2":
							case "div3":
							case "div4":
							case "div5":
								pagedWriter.writeStartElement("div", ["class":rend, "type":localname])
								break;
							case "lb":
							//case "l":
								pagedWriter.writeEmptyElement("br", ["class":rend])
								break;
							case wordTag:
								wordid = getAttributeValue(parser, null, "id");

								wordcount++;
								if (wordcount >= wordmax) {
									if (!createNextOutput()) return false;
								}

								if (firstWord) {
									firstWord = false;
									this.idxstart.add(wordid);
								}
								break;
							case "ana":
								flaginterp = true;
								// the type value starts with a one character prefix that is
								// dropped; nothing to drop when the attribute is missing
								String anaType = getAttributeValue(parser, null, "type")
								if (anaType.length() > 0) anaType = anaType.substring(1)
								interpvalue += " "+anaType+":"
								break;
							case "form":
								wordvalue = "";
								interpvalue = "";
								flagform = true;
								break;
							default:
								if (noteElements.contains(localname)) {
									flagNote = true;
									noteContent = ""
								}
								break;
						}
						break;
					case XMLStreamConstants.END_ELEMENT:
						localname = parser.getLocalName();
						if (currentOutOfTextElements.size() > 0) currentOutOfTextElements.pop()
						writeOutOfTextToEditText = currentOutOfTextElements.size() > 0

						switch (localname) {
							case "p":
							//case "lg":
								pagedWriter.writeEndElement() // </p>
								pagedWriter.write("\n");
								break;
							case "div":
							case "div1":
							case "div2":
							case "div3":
							case "div4":
							case "div5":
								pagedWriter.writeEndElement() // </div>
								pagedWriter.write("\n");
								break; // must not fall through to "head": only one element to close
							case "head":
								pagedWriter.writeEndElement() // </h2>
								pagedWriter.write("\n");
								break;
							case "list":
								pagedWriter.writeEndElement(); // ul or ol
								pagedWriter.write("\n");
								break;
							case "item":
								pagedWriter.writeEndElement(); // li
								pagedWriter.write("\n");
								break;
							case "hi":
							case "emph":
								pagedWriter.writeEndElement(); // b or i
								break;
							case "table":
								pagedWriter.writeEndElement(); // table
								pagedWriter.write("\n");
								break;
							case "row":
								pagedWriter.writeEndElement(); // tr
								break;
							case "cell":
								pagedWriter.writeEndElement(); // td
								break;
							case "ref":
								pagedWriter.writeEndElement() // </a>
								break;
							case "form":
								flagform = false
								break;
							case "ana":
								flaginterp = false
								break;
							case wordTag:
								int lastLength = lastword.length();
								String endOfLastWord = "";
								if (lastLength > 0) {
									endOfLastWord = lastword.substring(lastLength - 1);
								}

								// no space between the previous word and this one?
								boolean glued = NoSpaceBefore.contains(wordvalue) ||
										NoSpaceAfter.contains(lastword) ||
										wordvalue.startsWith("-") ||
										NoSpaceAfter.contains(endOfLastWord)
								if (!glued) {
									pagedWriter.writeCharacters(" ");
								}

								pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
								pagedWriter.writeCharacters(wordvalue);
								pagedWriter.writeEndElement();
								pagedWriter.writeCharacters("\n")
								lastword = wordvalue;
								break;
							default:
								if (noteElements.contains(localname)) {
									flagNote = false;
									if (noteContent.length() > 0) {
										// the note call: a numbered link to the note written at the end of the page
										notes << noteContent;
										pagedWriter.writeStartElement("a", ["href":"#note_"+notes.size(), "name":"noteref_"+notes.size(), "title":noteContent]);
										pagedWriter.writeStartElement("sup")
										pagedWriter.writeCharacters(""+notes.size())
										pagedWriter.writeEndElement() // </sup>
										pagedWriter.writeEndElement() // </a>
									}
								}
								break;
						}
						break;
					case XMLStreamConstants.CHARACTERS:
						String chars = parser.getText()
						if (flagform && chars.length() > 0) {
							wordvalue += chars;
							if (flagNote) {
								noteContent += chars.replace("\n", " ");
							}
						} else if (flaginterp && chars.length() > 0) {
							interpvalue += chars;
						} else if (flagNote && chars.length() > 0) {
							noteContent += chars.replace("\n", " ");
						} else if (writeOutOfTextToEditText) {
							pagedWriter.writeCharacters(chars)
						}
						break;
				}
			}
			closeMultiWriter();
			return true;
		} finally {
			// XMLStreamReader.close() does not close the underlying stream:
			// both are closed; a failure to close cannot be handled usefully here
			try {
				parser.close()
			} catch (XMLStreamException ignored) { }
			try {
				inputData.close()
			} catch (IOException ignored) { }
		}
	}
}