Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xtz / XTZDefaultPagerStep.groovy @ 479

History | View | Annotate | Download (14.5 kB)

1
package org.txm.importer.xtz;
2

    
3
import java.io.File;
4
import java.io.OutputStreamWriter;
5
import java.util.ArrayList;
6
import java.util.List;
7

    
8
import javax.xml.stream.*;
9

    
10
import org.txm.importer.StaxStackWriter;
11

    
12
public class XTZDefaultPagerStep {
13

    
14
        /** Word forms written without a space in front of them (tested against the current word when a word element ends). */
        List<String> NoSpaceBefore;
15

    
16
        /** Word forms, or final characters, after which no space is written (tested against the previous word and its last character). */
17
        List<String> NoSpaceAfter;
18

    
19
        /** Number of word elements started on the current page; reset to 0 by createNextOutput(). */
20
        int wordcount = 0;
21

    
22
        /** Number of page files created so far; the latest value is used in the page file name and title. */
23
        int pagecount = 0;
24

    
25
        /** Words-per-page threshold copied from pager.wordsPerPage; a new page is opened when wordcount reaches it. */
26
        int wordmax = 0;
27

    
28
        /** Corpus name copied from pager.corpusname; written upper-cased in each page title. */
29
        String basename = "";
30
        /** Text name; prefix of every page file name (txtname + "_" + page number + ".html"). */
        String txtname = "";
31
        /** Directory receiving the page files (pager.outputDirectory); the constructor calls mkdirs() on it. */
        File outdir;
32

    
33
        /** Value of the "id" attribute of the most recently started word element. */
34
        String wordid;
35

    
36
        /** True until the first word of the current page has been seen; drives what is recorded in idxstart. */
37
        boolean firstWord = true;
38

    
39
        /** Text accumulated while inside the current "form" element; emptied when a "form" element starts. */
40
        String wordvalue = "";
41

    
42
        /** Annotation string built from "ana" elements since the last "form" start; used as the title attribute of the word span. */
43
        String interpvalue = "";
44

    
45
        /** Form of the previously written word, consulted by the no-space rules; starts as a single space. */
46
        String lastword = " ";
47

    
48
        /** Not referenced in the code visible here. */
49
        String wordtype;
50

    
51
        /** True while the parser is inside a "form" element. */
52
        boolean flagform = false;
53

    
54
        /** True while the parser is inside an "ana" element. */
55
        boolean flaginterp = false;
56

    
57
        /** URL of the input file, obtained with infile.toURI().toURL(). */
58
        private def url;
59

    
60
        /** Buffered input stream opened on url; source of the StAX parser. */
61
        private def inputData;
62

    
63
        /** XMLInputFactory instance used to create the parser. */
64
        private def factory;
65

    
66
        /** StAX reader over inputData. */
67
        private XMLStreamReader parser;
68

    
69
        /** Not referenced in the code visible here. */
70
        OutputStreamWriter writer;
71

    
72
        /** Writer of the page currently being produced; null until the first page has been opened. */
73
        StaxStackWriter pagedWriter = null;
74

    
75
        /** The TEI input file being paginated. */
76
        File infile;
77

    
78
        /** Not assigned in the code visible here. */
79
        File outfile;
80

    
81
        /** Page files created so far, in creation order; returned by getPageFiles(). */
82
        ArrayList<File> pages = new ArrayList<File>();
83

    
84
        /** For each page, the id of its first word, or the placeholder wordTag + "_0" when the page holds no word; returned by getIdx(). */
85
        ArrayList<String> idxstart = new ArrayList<String>();
86
        /** Local name of the element that forces a new page (pager.page_element). */
        String paginationElement;
87
        /** CSS locations; one stylesheet link is written per entry in every page. */
        def cssList;
88
        /** Local name of word elements; "w" by default, replaced by pager.wordTag in the constructor. */
        def wordTag = "w";
89
        /** Local names of the elements treated as notes, read from the import parameters. */
        def noteElements = new HashSet<String>();
90
        /** Local names of the "out of text" elements whose character content is copied to the page, read from the import parameters. */
        def outOfTextElements = new HashSet<String>();
91
        /** The pager that created this step and supplies its settings. */
        XTZPager pager;
92
        
93

    
94
        /**
         * Instantiates a new pager step and immediately paginates the given text:
         * the settings are copied from the pager, the input file is opened,
         * process() is run, and the input is closed again.
         *
         * @param pager the pager supplying the pagination element, corpus name, output directory,
         *              words per page, word tag and import parameters
         * @param infile the XML-TEI file to paginate
         * @param txtname the text name, used as prefix of the page file names
         * @param NoSpaceBefore the word forms written without a preceding space
         * @param NoSpaceAfter the word forms (or final characters) not followed by a space
         * @param cssList the stylesheets to link from every page
         */
        public XTZDefaultPagerStep(XTZPager pager, File infile, String txtname, List<String> NoSpaceBefore,
        List<String> NoSpaceAfter, def cssList) {
                this.pager = pager;
                this.paginationElement = pager.page_element;
                this.cssList = cssList;
                this.basename = pager.corpusname;
                this.txtname = txtname;
                this.outdir = pager.outputDirectory;
                this.wordmax = pager.wordsPerPage;
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;
                this.url = infile.toURI().toURL();
                this.infile = infile;
                this.wordTag= pager.wordTag;
                outdir.mkdirs()

                // Element names come as comma-separated lists; entries are trimmed and empty ones
                // skipped so that a list written as "note, ref" still matches the element "ref".
                String notesListString = pager.getImportModule().getParameters().getNoteElement().getTextContent()
                if (notesListString != null) {
                        for (def s : notesListString.split(",")) {
                                String elementName = s.trim()
                                if (elementName.length() > 0) noteElements << elementName;
                        }
                }

                String elems = pager.getImportModule().getParameters().getOutSideTextTagsAndKeepContentElement().getTextContent()
                if (elems != null) {
                        for (def s : elems.split(",")) {
                                String elementName = s.trim()
                                if (elementName.length() > 0) outOfTextElements << elementName;
                        }
                }

                inputData = new BufferedInputStream(url.openStream());
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        process();
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream, so both are released here
                        try {
                                if (parser != null) parser.close();
                        } finally {
                                inputData.close();
                        }
                }
        }
132
        
133
        /**
         * Looks up an attribute of the element the parser is currently positioned on,
         * comparing local names only.
         *
         * @param parser a reader offering getAttributeCount(), getAttributeLocalName(int) and getAttributeValue(int)
         * @param ns not used: the namespace takes no part in the lookup
         * @param name the local name of the wanted attribute
         * @return the value of the first attribute with that local name, or "" when there is none
         */
        public String getAttributeValue(def parser, String ns, String name) {
                int position = 0;
                while (position < parser.getAttributeCount()) {
                        def candidate = parser.getAttributeLocalName(position);
                        if (name == candidate) {
                                return parser.getAttributeValue(position).toString();
                        }
                        position++;
                }
                // no attribute matched: callers receive an empty string, not null
                return "";
        }
141

    
142

    
143
        /**
         * Finishes the page currently being written, if any: registers a placeholder word id
         * when the page received no word, closes the open elements, appends the collected
         * notes, empties the note list and closes the page writer.
         *
         * @return a copy of the writer's tag stack taken before anything is closed,
         *         or an empty list when no page writer exists yet
         */
        private def closeMultiWriter() {
                if (pagedWriter == null) {
                        return [];
                }

                // copy taken first, before writeEndElements() is called on the writer
                def openTags = pagedWriter.getTagStack().clone();

                if (firstWord) {
                        // no word was met on this page: record and write a placeholder anchor instead
                        pagedWriter.writeCharacters("");
                        this.idxstart.add("${wordTag}_0")
                        pagedWriter.write("<span id=\"${wordTag}_0\"/>");
                }
                pagedWriter.writeEndElements();

                if (notes.size() > 0) {
                        pagedWriter.writeEmptyElement("hr", ["id":"notes", "width":"20%", "align":"left"]);
                        for (int idx = 0 ; idx < notes.size() ; idx++) {
                                String note = notes[idx];
                                int number = idx + 1; // notes are numbered from 1
                                // back-link to the note call, displayed as a superscript number
                                pagedWriter.writeStartElement("a", ["href":"#noteref_"+number, "name":"note_"+number]);
                                pagedWriter.writeStartElement("sup")
                                pagedWriter.writeCharacters(""+number)
                                pagedWriter.writeEndElement() // </sup>
                                pagedWriter.writeEndElement() // </a>
                                pagedWriter.writeCharacters(note)
                                pagedWriter.writeEmptyElement("br")
                        }
                        notes.clear()
                }

                pagedWriter.close();
                return openTags;
        }
181

    
182
        /**
         * Closes the current page (if any) and opens the next page file in the output directory.
         * The new page receives the HTML prologue (head with charset, stylesheets and title,
         * then body and the "txmeditionpage" div) and the content elements that were still
         * open on the previous page are reopened in it.
         *
         * @return true if the page was opened; false if an exception occurred
         *         (its message is printed on standard output)
         */
        private boolean createNextOutput() {
                wordcount = 0;
                try {
                        def tags = closeMultiWriter();
                        // Drop the page scaffolding from the front of the stack, up to and including
                        // the first "div"; what remains must be reopened on the new page.
                        while (tags.size() > 0) {
                                String tag = tags[0]
                                tags.remove(0)
                                if ("div" == tag) {
                                        break;
                                }
                        }

                        File outfile = new File(outdir, txtname+"_"+(++pagecount)+".html")
                        pages.add(outfile);
                        firstWord = true; // waiting for next word

                        pagedWriter = new StaxStackWriter(outfile, "UTF-8");

                        pagedWriter.writeStartDocument("UTF-8", "1.0")
                        pagedWriter.writeStartElement("html");
                        // meta and link are metadata content: they belong inside <head>, not directly under <html>
                        pagedWriter.writeStartElement("head");
                        pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
                        for (String css : cssList) {
                                pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"$css"]);
                        }
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
                        pagedWriter.writeEndElement(); // </title>
                        pagedWriter.writeEndElement() // </head>
                        pagedWriter.writeStartElement("body") //<body>
                        pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div>
                        pagedWriter.writeStartElements(tags);
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
227

    
228
        /**
         * Opens the next page by delegating to createNextOutput().
         *
         * @return true, if successful; false when createNextOutput() reports a failure
         *         or throws (the exception message is printed on standard output)
         */
        private boolean createOutput() {
                boolean opened;
                try {
                        opened = createNextOutput();
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        opened = false;
                }
                return opened;
        }
242

    
243
        /**
         * Gives access to the page files created so far.
         *
         * @return the internal list of page files, in creation order (not a copy)
         */
        public ArrayList<File> getPageFiles() {
                return this.pages;
        }
251

    
252
        /**
         * Gives access to the first-word index of the pages.
         *
         * @return the internal list holding, for each page, the id recorded as its first word (not a copy)
         */
        public ArrayList<String> getIdx() {
                return this.idxstart;
        }
260

    
261
        /**
         * Advances the parser until the end tag of the "teiHeader" element has been read.
         * If no such end tag exists, the parser is left on the end-of-document event.
         */
        private void goToText() {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean headerClosed = event == XMLStreamConstants.END_ELEMENT && "teiHeader".equals(parser.getLocalName());
                        if (headerClosed) {
                                return;
                        }
                        event = parser.next();
                }
        }
271

    
272
        // Text of the notes collected for the current page, in reading order; closeMultiWriter() writes them out and clears the list
        def notes = []
273
        def currentOutOfTextElements = [] // local names of the elements currently open inside an "out of text" element, that element included
274
        // True while currentOutOfTextElements is not empty: character data is then copied to the page
        def writeOutOfTextToEditText = false
275
        /**
         * Paginates the document: skips the teiHeader, opens the first page, then reads the
         * remaining StAX events and translates the TEI elements into HTML on the current page.
         * A new page is opened on each pagination element and whenever the number of words
         * on the page reaches wordmax. Words are written as span elements when their end tag
         * is read; note elements are collected and written by closeMultiWriter().
         *
         * @return true once the end of the document has been reached and the last page closed
         */
        public boolean process() {
                boolean flagNote = false; // true while inside a note element
                String noteContent = "";  // text accumulated for the note being read
                String rend = ""
                goToText();

                String localname = "";
                createNextOutput();

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        rend = "";
                        switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName();
                                        // track the elements opened inside an "out of text" element
                                        if (outOfTextElements.contains(localname)) {
                                                currentOutOfTextElements << localname
                                                writeOutOfTextToEditText = true;
                                        } else if (currentOutOfTextElements.size() > 0) {
                                                currentOutOfTextElements << localname
                                        }

                                        if (localname == paginationElement) {
                                                createNextOutput();
                                                wordcount=0;
                                                pagedWriter.write("\n");
                                                // getAttributeValue() yields "" for a missing attribute:
                                                // only write the page number paragraph when @n really has a value
                                                String pageNumber = getAttributeValue(parser, null,"n")
                                                if (pageNumber != null && pageNumber.length() > 0) {
                                                        pagedWriter.writeElement("p", ["class":"txmeditionpb", "align":"center"], pageNumber)
                                                }
                                        }

                                        rend = getAttributeValue(parser, null, "rend")
                                        if (rend == null) rend = "";
                                        switch (localname) {
                                                case "text":
                                                        // the text attributes are shown as a heading (id) followed by a name/value table
                                                        LinkedHashMap attributes = new LinkedHashMap();
                                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
                                                        }

                                                        pagedWriter.write("\n");
                                                        pagedWriter.writeStartElement("p")
                                                        pagedWriter.writeAttribute("class", rend);
                                                        if (attributes.containsKey("id")) {
                                                                pagedWriter.writeElement("h3", attributes["id"])
                                                        }

                                                        pagedWriter.writeStartElement("table");
                                                        for (String k : attributes.keySet()) {
                                                                if (k == "id") continue;
                                                                if (k == "rend") continue;

                                                                pagedWriter.writeStartElement("tr");
                                                                pagedWriter.writeElement("td", k);
                                                                pagedWriter.writeElement("td", attributes[k]);
                                                                pagedWriter.writeEndElement(); // </tr>
                                                        }
                                                        pagedWriter.writeEndElement() // </table>
                                                        pagedWriter.writeEndElement() // </p>

                                                        pagedWriter.writeCharacters("");
                                                        break;
                                                case "ref":
                                                        pagedWriter.writeStartElement("a")
                                                        pagedWriter.writeAttribute("href", getAttributeValue(parser, null, "target"));

                                                        pagedWriter.writeAttribute("target", "_blank");
                                                        pagedWriter.writeAttribute("class", rend);
                                                        break;
                                                case "head":
                                                        pagedWriter.write("\n");
                                                        pagedWriter.writeStartElement("h2", ["class":rend])
                                                        break;
                                                case "graphic":
                                                        pagedWriter.write("\n");
                                                        // an absent @url comes back as "": no image is written in that case
                                                        String imageUrl = getAttributeValue(parser, null, "url")
                                                        if (imageUrl != null && imageUrl.length() > 0) {
                                                                pagedWriter.writeStartElement("center", ["class":rend])
                                                                pagedWriter.writeEmptyElement("img", ["src":imageUrl, "align":"middle"])
                                                                pagedWriter.writeEndElement() // </center>
                                                        }
                                                        break;
                                                case "table":
                                                        pagedWriter.writeStartElement("table", ["class":rend])
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "row":
                                                        pagedWriter.writeStartElement("tr", ["class":rend])
                                                        break;
                                                case "cell":
                                                        pagedWriter.writeStartElement("td", ["class":rend])
                                                        break;
                                                case "list":
                                                        String type = getAttributeValue(parser, null,"type");
                                                        if ("unordered" == type) {
                                                                pagedWriter.writeStartElement("ul", ["class":rend])
                                                        } else {
                                                                pagedWriter.writeStartElement("ol", ["class":rend])
                                                        }
                                                        break
                                                case "item":
                                                        pagedWriter.writeStartElement("li", ["class":rend])
                                                        break;
                                                case "hi":
                                                case "emph":
                                                        // @rend decides between italic and bold; without a known value
                                                        // "emph" falls back to italic and "hi" to bold
                                                        if ("i".equals(rend) || "italic".equals(rend)) {
                                                                pagedWriter.writeStartElement("i", ["class":rend])
                                                        } else if ("b".equals(rend) || "bold".equals(rend)) {
                                                                pagedWriter.writeStartElement("b", ["class":rend])
                                                        } else {
                                                                if ("emph".equals(localname)) {
                                                                        pagedWriter.writeStartElement("i", ["class":rend])
                                                                } else { // hi
                                                                        pagedWriter.writeStartElement("b", ["class":rend])
                                                                }
                                                        }

                                                        break;
                                                case "p":
                                                        pagedWriter.write("\n");
                                                        pagedWriter.writeStartElement("p", ["class":rend])
                                                        break;
                                                case "lb":
                                                        pagedWriter.writeEmptyElement("br")
                                                        break;
                                                case wordTag:
                                                        wordid = getAttributeValue(parser, null,"id");

                                                        wordcount++;
                                                        if (wordcount >= wordmax) {
                                                                createNextOutput();
                                                        }

                                                        if (firstWord) {
                                                                // first word of the page: remember its id as the page start
                                                                firstWord = false;
                                                                this.idxstart.add(wordid);
                                                        }

                                                        break;
                                                case "ana":
                                                        flaginterp=true;
                                                        // the first character of @type is dropped (presumably a leading "#" - TODO confirm);
                                                        // a missing or empty @type must not raise StringIndexOutOfBoundsException
                                                        String anaType = getAttributeValue(parser, null, "type")
                                                        if (anaType != null && anaType.length() > 0) {
                                                                anaType = anaType.substring(1)
                                                        } else {
                                                                anaType = ""
                                                        }
                                                        interpvalue+=" "+anaType+":"
                                                        break;
                                                case "form":
                                                        wordvalue="";
                                                        interpvalue ="";
                                                        flagform=true;
                                                        break;
                                                default:
                                                        if (noteElements.contains(localname)) {
                                                                flagNote = true;
                                                                noteContent = ""
                                                        }
                                                        break;
                                        }
                                        break;
                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName();
                                        if (currentOutOfTextElements.size() > 0) currentOutOfTextElements.pop()
                                        writeOutOfTextToEditText = currentOutOfTextElements.size() > 0

                                        switch (localname) {
                                                case "p":
                                                        pagedWriter.writeEndElement() // </p>
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "head":
                                                        pagedWriter.writeEndElement() // </h2>
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "list":
                                                        pagedWriter.writeEndElement(); // ul or ol
                                                        pagedWriter.write("\n");
                                                        break
                                                case "item":
                                                        pagedWriter.writeEndElement(); // li
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "hi":
                                                        pagedWriter.writeEndElement(); // b or i
                                                        break;
                                                case "emph":
                                                        pagedWriter.writeEndElement(); // i or b
                                                        break;
                                                case "table":
                                                        pagedWriter.writeEndElement(); // table
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "row":
                                                        pagedWriter.writeEndElement(); // tr
                                                        break;
                                                case "cell":
                                                        pagedWriter.writeEndElement(); // td
                                                        break;
                                                case "ref":
                                                        pagedWriter.writeEndElement() // </a>
                                                        break;
                                                case "form":
                                                        flagform = false
                                                        break;
                                                case "ana":
                                                        flaginterp = false
                                                        break;
                                                case wordTag:
                                                        int l = lastword.length();
                                                        String endOfLastWord = "";
                                                        if (l > 0)
                                                                endOfLastWord = lastword.substring(l-1);

                                                        // the word is glued to the previous one when either word, or the last
                                                        // character of the previous one, is listed as taking no space,
                                                        // or when the word starts with a hyphen
                                                        boolean glued = NoSpaceBefore.contains(wordvalue) ||
                                                                        NoSpaceAfter.contains(lastword) ||
                                                                        wordvalue.startsWith("-") ||
                                                                        NoSpaceAfter.contains(endOfLastWord);
                                                        if (!glued) {
                                                                pagedWriter.writeCharacters(" ");
                                                        }
                                                        pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                                                        pagedWriter.writeCharacters(wordvalue);
                                                        pagedWriter.writeEndElement(); // </span>
                                                        lastword=wordvalue;
                                                        break;
                                                default:
                                                        if (noteElements.contains(localname)) {
                                                                flagNote = false;
                                                                if (noteContent.length() > 0) {
                                                                        // keep the note for the page footer and write its numbered call here
                                                                        notes << noteContent;
                                                                        pagedWriter.writeStartElement("a", ["href":"#note_"+notes.size(), "name":"noteref_"+notes.size(), "title":noteContent]);
                                                                        pagedWriter.writeStartElement("sup")
                                                                        pagedWriter.writeCharacters(""+notes.size())
                                                                        pagedWriter.writeEndElement() // </sup>
                                                                        pagedWriter.writeEndElement() // </a>
                                                                }
                                                        }
                                                        break;
                                        }
                                        break;
                                case XMLStreamConstants.CHARACTERS:
                                        String text = parser.getText();
                                        boolean hasText = text.length() > 0;
                                        if (flagform && hasText) {
                                                wordvalue+=(text);
                                                // a word inside a note also contributes to the note text
                                                if (flagNote)
                                                        noteContent += text.replace("\n", " ");
                                        } else if (flaginterp && hasText) {
                                                interpvalue+=(text);
                                        } else if (flagNote && hasText) {
                                                noteContent += text.replace("\n", " ");
                                        } else if (writeOutOfTextToEditText) {
                                                pagedWriter.writeCharacters(text)
                                        }
                                        break;
                        }
                }
                closeMultiWriter();
                return true;
        }
535
}