Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xtz / XTZDefaultPagerStep.groovy @ 187

History | View | Annotate | Download (14.2 kB)

1
package org.txm.importer.xtz;
2

    
3
import java.io.File;
4
import java.io.OutputStreamWriter;
5
import java.util.ArrayList;
6
import java.util.List;
7

    
8
import javax.xml.stream.*;
9

    
10
import org.txm.importer.StaxStackWriter;
11

    
12
public class XTZDefaultPagerStep {
13

    
14
        // ---- spacing rules ------------------------------------------------

        /** Word forms that are written without a space before them. */
        List<String> NoSpaceBefore
        /** Word forms (or final characters) after which no space is written. */
        List<String> NoSpaceAfter

        // ---- pagination counters and naming -------------------------------

        /** Number of words counted since the current page was opened. */
        int wordcount
        /** Number of pages created so far; also used in the page file name. */
        int pagecount
        /** Word count at which a new page is started (taken from the pager). */
        int wordmax
        /** Corpus name, used in the HTML page title. */
        String basename = ""
        /** Text name, used as prefix of the page file names. */
        String txtname = ""
        /** Directory the HTML pages are written to. */
        File outdir

        // ---- state of the word currently being read -----------------------

        /** Identifier of the word currently being read. */
        String wordid
        /** True until the first word of the current page has been met. */
        boolean firstWord = true
        /** Graphical form accumulated for the current word. */
        String wordvalue = ""
        /** Annotation text accumulated for the current word (shown as tooltip). */
        String interpvalue = ""
        /** Form of the previously written word, used to decide on spacing. */
        String lastword = " "
        /** Word type; not used by the code visible in this class. */
        String wordtype
        /** True while the parser is inside a "form" element. */
        boolean flagform
        /** True while the parser is inside an "ana" element. */
        boolean flaginterp

        // ---- input ---------------------------------------------------------

        /** URL of the source XML file. */
        private def url
        /** Stream opened on the source XML file. */
        private def inputData
        /** StAX factory used to create the parser. */
        private def factory
        /** StAX reader over the source XML file. */
        private XMLStreamReader parser

        // ---- output --------------------------------------------------------

        /** Plain writer; not used by the code visible in this class. */
        OutputStreamWriter writer
        /** Writer of the HTML page currently being produced. */
        StaxStackWriter pagedWriter
        /** Source XML file. */
        File infile
        /** Output file; not assigned by the code visible in this class. */
        File outfile
        /** HTML page files, in creation order. */
        ArrayList<File> pages = new ArrayList<File>()
        /** For each page, the identifier recorded as its starting word. */
        ArrayList<String> idxstart = new ArrayList<String>()

        // ---- configuration -------------------------------------------------

        /** Name of the element that forces a page break. */
        String paginationElement
        /** Stylesheet references linked from every page. */
        def cssList
        /** Name of the element that encodes a word. */
        def wordTag = "w"
        /** Names of the elements whose content is rendered as a note. */
        def noteElements = new HashSet<String>()
        /** Names of the out-of-text elements whose text content is still written to the edition. */
        def outOfTextElements = new HashSet<String>()
        /** Pager that owns this step and supplies its settings. */
        XTZPager pager
92

    
93
        /**
         * Instantiates a new pager step and immediately paginates the given text.
         *
         * The page element, corpus name, output directory, words-per-page limit,
         * word tag and the note / out-of-text element lists are all read from the
         * pager. The constructor calls {@link #process()} itself, so the pages are
         * already written when it returns.
         *
         * @param pager the pager owning this step; supplies the pagination settings
         * @param infile the XML source file to paginate
         * @param txtname the text name, used as prefix of the page file names
         * @param NoSpaceBefore the word forms written without a preceding space
         * @param NoSpaceAfter the word forms after which no space is written
         * @param cssList the stylesheet references to link from every page
         */
        public XTZDefaultPagerStep(XTZPager pager, File infile, String txtname, List<String> NoSpaceBefore,
        List<String> NoSpaceAfter, def cssList) {
                this.pager = pager;
                this.paginationElement = pager.page_element;
                this.cssList = cssList;
                this.basename = pager.corpusname;
                this.txtname = txtname;
                this.outdir = pager.outputDirectory;
                this.wordmax = pager.wordsPerPage;
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;
                this.url = infile.toURI().toURL();
                this.infile = infile;
                this.wordTag= pager.wordTag;
                outdir.mkdirs()

                inputData = new BufferedInputStream(url.openStream());
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);

                        // comma separated element names; surrounding blanks are dropped so
                        // that a value such as "note, foot" matches both elements
                        String notesListString = pager.getImportModule().getParameters().getNoteElement().getTextContent()
                        if (notesListString != null) {
                                for (String name : notesListString.split(",")) {
                                        name = name.trim()
                                        if (name.length() > 0) noteElements << name
                                }
                        }

                        String elems = pager.getImportModule().getParameters().getOutSideTextTagsAndKeepContentElement().getTextContent()
                        if (elems != null) {
                                for (String name : elems.split(",")) {
                                        name = name.trim()
                                        if (name.length() > 0) outOfTextElements << name
                                }
                        }

                        process();
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream,
                        // so both have to be released explicitly
                        try {
                                if (parser != null) parser.close()
                        } finally {
                                inputData.close()
                        }
                }
        }
131

    
132
        /**
         * Finishes the page currently being written, if any.
         *
         * A page on which no word was met gets a placeholder anchor
         * ("&lt;wordTag&gt;_0") which is also recorded as its start index. All open
         * elements are then closed, the notes collected for the page are appended
         * and the note list is cleared.
         *
         * @return a copy of the writer's tag stack taken before the elements were
         *         closed, or an empty list when no page is open
         */
        private def closeMultiWriter() {
                if (pagedWriter == null) {
                        return [];
                }

                // snapshot taken before closing, so that the caller can re-open the elements
                def tags = pagedWriter.getTagStack().clone();

                if (firstWord) { // there was no words
                        // a real String: idxstart is an ArrayList<String> and a GString
                        // stored in it would break typed (Java) consumers of getIdx()
                        String emptyPageId = "${wordTag}_0".toString();
                        pagedWriter.writeCharacters("");
                        this.idxstart.add(emptyPageId)
                        pagedWriter.write("<span id=\"" + emptyPageId + "\"/>");
                }
                pagedWriter.writeEndElements();

                // NOTE(review): writeEndElements() presumably closes every open element,
                // which would put the notes below after the closing html tag - confirm
                if (notes.size() > 0) {
                        pagedWriter.writeEmptyElement("hr", ["id":"notes", "width":"20%", "align":"left"]);
                        int i = 1;
                        for (String note : notes) {
                                // back-link to the call site written by process() ("noteref_<n>")
                                pagedWriter.writeStartElement("a", ["href":"#noteref_"+i, "name":"note_"+i]);
                                pagedWriter.writeStartElement("sup")
                                pagedWriter.writeCharacters(""+i)
                                pagedWriter.writeEndElement() // </sup>
                                pagedWriter.writeEndElement() // </a>
                                pagedWriter.writeCharacters(note)
                                pagedWriter.writeEmptyElement("br")
                                i++;
                        }
                        notes.clear()
                }

                pagedWriter.close();
                return tags;
        }
170

    
171
        /**
         * Closes the current page (if any) and opens the next one.
         *
         * The new page file is named "&lt;txtname&gt;_&lt;pagecount&gt;.html" in the
         * output directory. Its HTML prolog is written, then the elements that
         * were open inside the body of the previous page are re-opened so that the
         * content can continue across the page break.
         *
         * @return true, if successful; false when an exception was raised (its
         *         message is printed on the standard output)
         */
        private boolean createNextOutput() {
                wordcount = 0;
                try {
                        // keep only the elements that were open below "body": everything up to
                        // and including the first "body" entry is re-created by the prolog
                        def tags = closeMultiWriter();
                        while (!tags.isEmpty()) {
                                String tag = tags.remove(0)
                                if ("body" == tag) {
                                        break;
                                }
                        }
                        // NOTE(review): the remaining stack presumably still starts with the
                        // page "div" opened below, which would then be opened twice - confirm

                        File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html")
                        pages.add(pageFile);
                        firstWord = true; // waiting for next word

                        pagedWriter = new StaxStackWriter(pageFile, "UTF-8");

                        pagedWriter.writeStartDocument("UTF-8", "1.0")
                        pagedWriter.writeStartElement("html");
                        // meta and link elements are metadata content: they belong inside <head>
                        pagedWriter.writeStartElement("head");
                        pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
                        for (String css : cssList) {
                                pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":css]);
                        }
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
                        pagedWriter.writeEndElement(); // </title>
                        pagedWriter.writeEndElement() // </head>
                        pagedWriter.writeStartElement("body") //<body>
                        pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div>
                        pagedWriter.writeStartElements(tags);
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
216

    
217
        /**
         * Opens an output page by delegating to {@link #createNextOutput()}.
         *
         * @return the result of createNextOutput(), or false when it raised an
         *         exception (whose message is printed on the standard output)
         */
        private boolean createOutput() {
                boolean opened
                try {
                        opened = createNextOutput()
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage())
                        opened = false
                }
                return opened
        }
231

    
232
        /**
         * Returns the HTML page files created so far, in creation order.
         *
         * @return the live list of page files (not a copy)
         */
        public ArrayList<File> getPageFiles() {
                return this.pages
        }
240

    
241
        /**
         * Returns the identifiers recorded as the starting word of each page.
         *
         * @return the live list of page start identifiers (not a copy)
         */
        public ArrayList<String> getIdx() {
                return this.idxstart
        }
249

    
250
        /**
         * Advances the parser just past the end tag of the "teiHeader" element.
         * When no such end tag is met, the parser is left at the end of the document.
         */
        private void goToText() {
                int event = parser.next()
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean headerClosed = event == XMLStreamConstants.END_ELEMENT && "teiHeader".equals(parser.getLocalName())
                        if (headerClosed) {
                                return
                        }
                        event = parser.next()
                }
        }
260

    
261
        /** Texts of the notes met on the current page; written out and cleared when the page is closed. */
        def notes = new ArrayList()
        /** Stack of the elements opened since (and including) an out-of-text element whose text is kept. */
        def currentOutOfTextElements = new ArrayList()
        /** True while character data must be copied to the edition because an out-of-text element is open. */
        def writeOutOfTextToEditText = false
264
        /**
         * Reads the source text and writes it out as a sequence of HTML pages.
         *
         * The TEI header is skipped, a first page is opened, then every remaining
         * StAX event is translated: structural elements (text, head, p, list,
         * table, ref, graphic, hi, emph, lb...) become their HTML counterpart, each
         * word becomes a "span" carrying its identifier and its annotations as
         * tooltip, and note elements become numbered references whose text is
         * appended when the page is closed. A new page is started on every
         * pagination element and whenever the word count reaches the limit.
         *
         * @return true once the whole document has been read and the last page closed
         */
        public boolean process() {
                boolean flagNote = false;
                String noteContent = "";
                String rend = ""
                goToText();

                String localname = "";
                createNextOutput();

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        rend = "";
                        switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName();
                                        // track the nesting below an out-of-text element: only the depth
                                        // of this stack is used, to know when that element is left
                                        if (outOfTextElements.contains(localname)) {
                                                currentOutOfTextElements << localname
                                                writeOutOfTextToEditText = true;
                                        } else if (currentOutOfTextElements.size() > 0) {
                                                currentOutOfTextElements << localname
                                        }

                                        rend = parser.getAttributeValue(null, "rend")
                                        if (rend == null) rend = "";
                                        switch (localname) {
                                                case "text":
                                                        // the text metadata are shown as a title followed by a table
                                                        LinkedHashMap attributes = new LinkedHashMap();
                                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
                                                        }
                                                        pagedWriter.write("\n");
                                                        pagedWriter.writeStartElement("p")
                                                        pagedWriter.writeAttribute("class", rend);
                                                        if (attributes.containsKey("id")) {
                                                                pagedWriter.writeElement("h3", attributes["id"])
                                                        }

                                                        pagedWriter.writeStartElement("table");
                                                        for (String k : attributes.keySet()) {
                                                                if (k == "id") continue;
                                                                if (k == "rend") continue;

                                                                pagedWriter.writeStartElement("tr");
                                                                pagedWriter.writeElement("td", k);
                                                                pagedWriter.writeElement("td", attributes[k]);
                                                                pagedWriter.writeEndElement();
                                                        }
                                                        pagedWriter.writeEndElement() // table
                                                        pagedWriter.writeEndElement() // p

                                                        pagedWriter.writeCharacters("");
                                                        break;
                                                case "ref":
                                                        pagedWriter.writeStartElement("a")
                                                        pagedWriter.writeAttribute("href", parser.getAttributeValue(null, "target"));

                                                        pagedWriter.writeAttribute("target", "_blank");
                                                        pagedWriter.writeAttribute("class", rend);
                                                        break;
                                                case "head":
                                                        pagedWriter.write("\n");
                                                        pagedWriter.writeStartElement("h2", ["class":rend])
                                                        break;
                                                case "graphic":
                                                        pagedWriter.write("\n");
                                                        String imageUrl = parser.getAttributeValue(null, "url")
                                                        if (imageUrl != null) {
                                                                pagedWriter.writeStartElement("center", ["class":rend])
                                                                pagedWriter.writeEmptyElement("img", ["src":imageUrl, "align":"middle"])
                                                                pagedWriter.writeEndElement() // </center>
                                                        }
                                                        break;
                                                case "table":
                                                        pagedWriter.writeStartElement("table", ["class":rend])
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "row":
                                                        pagedWriter.writeStartElement("tr", ["class":rend])
                                                        break;
                                                case "cell":
                                                        pagedWriter.writeStartElement("td", ["class":rend])
                                                        break;
                                                case "list":
                                                        String type = parser.getAttributeValue(null,"type");
                                                        if ("unordered" == type) {
                                                                pagedWriter.writeStartElement("ul", ["class":rend])
                                                        } else {
                                                                pagedWriter.writeStartElement("ol", ["class":rend])
                                                        }
                                                        break
                                                case "item":
                                                        pagedWriter.writeStartElement("li", ["class":rend])
                                                        break;
                                                case "hi":
                                                case "emph":
                                                        // an explicit rend wins; otherwise emph is italic and hi is bold
                                                        if ("i".equals(rend) || "italic".equals(rend)) {
                                                                pagedWriter.writeStartElement("i", ["class":rend])
                                                        } else if ("b".equals(rend) || "bold".equals(rend)) {
                                                                pagedWriter.writeStartElement("b", ["class":rend])
                                                        } else if ("emph".equals(localname)) {
                                                                pagedWriter.writeStartElement("i", ["class":rend])
                                                        } else { // hi
                                                                pagedWriter.writeStartElement("b", ["class":rend])
                                                        }
                                                        break;
                                                case "p":
                                                        pagedWriter.write("\n");
                                                        pagedWriter.writeStartElement("p", ["class":rend])
                                                        break;
                                                case paginationElement:
                                                        createNextOutput();
                                                        wordcount=0;
                                                        pagedWriter.write("\n");
                                                        if (parser.getAttributeValue(null,"n") != null) {
                                                                pagedWriter.writeElement("p", ["class":"txmeditionpb", "align":"center"], parser.getAttributeValue(null,"n"))
                                                        }
                                                        break;
                                                case "lb":
                                                        pagedWriter.writeEmptyElement("br")
                                                        break;
                                                case wordTag:
                                                        wordid = parser.getAttributeValue(null,"id");
                                                        wordcount++;
                                                        if (wordcount >= wordmax) {
                                                                createNextOutput();
                                                        }
                                                        // registered AFTER the possible page break: the word is written
                                                        // when its end tag is met, i.e. on the new page, so it is the
                                                        // new page that starts with this identifier
                                                        if (firstWord) {
                                                                firstWord = false;
                                                                this.idxstart.add(wordid);
                                                        }
                                                        break;
                                                case "ana":
                                                        flaginterp=true;
                                                        // the first character of @type is dropped (presumably a leading "#");
                                                        // a missing or empty @type yields an empty annotation name
                                                        String anaType = parser.getAttributeValue(null,"type")
                                                        String anaName = (anaType != null && anaType.length() > 0) ? anaType.substring(1) : ""
                                                        interpvalue+=" "+anaName+":"
                                                        break;
                                                case "form":
                                                        wordvalue="";
                                                        interpvalue ="";
                                                        flagform=true;
                                                        break;
                                                default:
                                                        if (noteElements.contains(localname)) {
                                                                flagNote = true;
                                                                noteContent = ""
                                                        }
                                                        break;
                                        }
                                        break;
                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName();
                                        if (currentOutOfTextElements.size() > 0) currentOutOfTextElements.pop()
                                        writeOutOfTextToEditText = currentOutOfTextElements.size() > 0

                                        switch (localname) {
                                                case "p":
                                                        pagedWriter.writeEndElement() // </p>
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "head":
                                                        pagedWriter.writeEndElement() // </h2>
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "list":
                                                        pagedWriter.writeEndElement(); // ul or ol
                                                        pagedWriter.write("\n");
                                                        break
                                                case "item":
                                                        pagedWriter.writeEndElement(); // li
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "hi":
                                                case "emph":
                                                        pagedWriter.writeEndElement(); // b or i
                                                        break;
                                                case "table":
                                                        pagedWriter.writeEndElement(); // table
                                                        pagedWriter.write("\n");
                                                        break;
                                                case "row":
                                                        pagedWriter.writeEndElement(); // tr
                                                        break;
                                                case "cell":
                                                        pagedWriter.writeEndElement(); // td
                                                        break;
                                                case "ref":
                                                        pagedWriter.writeEndElement() // </a>
                                                        break;
                                                case "form":
                                                        flagform = false
                                                        break;
                                                case "ana":
                                                        flaginterp = false
                                                        break;
                                                case wordTag:
                                                        int l = lastword.length();
                                                        String endOfLastWord = "";
                                                        if (l > 0) {
                                                                endOfLastWord = lastword.substring(l-1);
                                                        }

                                                        // no separating space when either word forbids it
                                                        boolean glued = NoSpaceBefore.contains(wordvalue) ||
                                                                        NoSpaceAfter.contains(lastword) ||
                                                                        wordvalue.startsWith("-") ||
                                                                        NoSpaceAfter.contains(endOfLastWord)
                                                        if (!glued) {
                                                                pagedWriter.writeCharacters(" ");
                                                        }
                                                        pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                                                        pagedWriter.writeCharacters(wordvalue);
                                                        pagedWriter.writeEndElement();
                                                        lastword=wordvalue;
                                                        break;
                                                default:
                                                        if (noteElements.contains(localname)) {
                                                                flagNote = false;
                                                                if (noteContent.length() > 0) {
                                                                        // the note text is kept for the end of the page; a numbered
                                                                        // call pointing to it is written in place
                                                                        notes << noteContent;
                                                                        pagedWriter.writeStartElement("a", ["href":"#note_"+notes.size(), "name":"noteref_"+notes.size(), "title":noteContent]);
                                                                        pagedWriter.writeStartElement("sup")
                                                                        pagedWriter.writeCharacters(""+notes.size())
                                                                        pagedWriter.writeEndElement() // </sup>
                                                                        pagedWriter.writeEndElement() // </a>
                                                                }
                                                        }
                                                        break;
                                        }
                                        break;
                                case XMLStreamConstants.CHARACTERS:
                                        String text = parser.getText()
                                        boolean hasText = text.length() > 0
                                        if (flagform && hasText) {
                                                wordvalue += text;
                                                // a word inside a note also contributes to the note text
                                                if (flagNote) {
                                                        noteContent += text.replace("\n", " ");
                                                }
                                        } else if (flaginterp && hasText) {
                                                interpvalue += text;
                                        } else if (flagNote && hasText) {
                                                noteContent += text.replace("\n", " ");
                                        } else if (writeOutOfTextToEditText) {
                                                pagedWriter.writeCharacters(text)
                                        }
                                        break;
                        }
                }
                closeMultiWriter();
                return true;
        }
519
}