Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZDefaultPagerStep.groovy @ 1000

History | View | Annotate | Download (14.8 kB)

1
package org.txm.scripts.importer.xtz;
2

    
3
import java.io.File;
4
import java.io.OutputStreamWriter;
5
import java.util.ArrayList;
6
import java.util.List;
7

    
8
import javax.xml.stream.*;
9

    
10
import org.txm.scripts.importer.StaxStackWriter;
11
import org.txm.importer.xtz.*
12

    
13
/**
 * Builds the paginated HTML edition of one XML text file.
 *
 * The source file is read with a StAX parser. Everything up to the end of the
 * "teiHeader" element is skipped, then the remaining events are translated to
 * HTML and written to a sequence of page files named
 * {@code <txtname>_<pagenumber>.html} in the pager output directory.
 * A new page is started each time the pagination element is met or when the
 * number of words written on the current page reaches {@code wordmax}.
 *
 * The whole conversion is run by the constructor; the results are then read
 * with {@link #getPageFiles()} and {@link #getIdx()}.
 */
public class XTZDefaultPagerStep {

    /** Word forms before which no space is written. */
    List<String> NoSpaceBefore;

    /** Word forms (or last characters of word forms) after which no space is written. */
    List<String> NoSpaceAfter;

    /** Number of words met since the current page was opened. */
    int wordcount = 0;

    /** Number of pages created so far; also the number of the current page. */
    int pagecount = 0;

    /** Word count at which a new page is started. */
    int wordmax = 0;

    /** Corpus name, used in the title of each page. */
    String basename = "";

    /** Text name, used as prefix of the page file names. */
    String txtname = "";

    /** Directory receiving the page files. */
    File outdir;

    /** Identifier of the word being read. */
    String wordid;

    /** True until the first word of the current page has been met. */
    boolean firstWord = true;

    /** Form of the word being read. */
    String wordvalue = "";

    /** Annotations of the word being read, written in the "title" attribute of its span. */
    String interpvalue = "";

    /** Form of the previously written word. */
    String lastword = " ";

    /** The wordtype. */
    String wordtype;

    /** True while the parser is inside a "form" element. */
    boolean flagform = false;

    /** True while the parser is inside an "ana" element. */
    boolean flaginterp = false;

    /** URL of the input file. */
    private def url;

    /** Stream opened on the input file. */
    private def inputData;

    /** StAX factory used to create the parser. */
    private def factory;

    /** StAX parser reading the input file. */
    private XMLStreamReader parser;

    /** The writer. */
    OutputStreamWriter writer;

    /** Writer of the current page, null until the first page is created. */
    StaxStackWriter pagedWriter = null;

    /** The input file. */
    File infile;

    /** The outfile. */
    File outfile;

    /** The page files, in creation order. */
    //TODO enhance this to store the page name/id as well
    ArrayList<File> pages = new ArrayList<File>();

    /** Identifier of the first word of each page ("<wordTag>_0" for a page without word). */
    ArrayList<String> idxstart = new ArrayList<String>();

    /** Name of the element that starts a new page. */
    String paginationElement;

    /** Stylesheet references linked from each page. */
    def cssList;

    /** Name of the element encoding a word. */
    def wordTag = "w";

    /** Names of the elements whose text content is rendered as a note. */
    def noteElements = new HashSet<String>();

    /** Names of the elements whose text content is copied as is to the page. */
    def outOfTextElements = new HashSet<String>();

    /** The pager that configured this step. */
    XTZPager pager;

    /** Text of the notes met on the current page, written at the end of the page. */
    def notes = []

    /** Stack of the currently opened elements located inside an "out of text" element. */
    def currentOutOfTextElements = []

    /** True while character data must be copied as is to the page. */
    def writeOutOfTextToEditText = false

    /**
     * Instantiates a new pager step and immediately runs {@link #process()}.
     *
     * @param pager the pager providing the page element, corpus name, output directory,
     *              words per page, word tag and import parameters
     * @param infile the XML file to paginate
     * @param txtname the text name, used as prefix of the page file names
     * @param NoSpaceBefore the word forms before which no space is written
     * @param NoSpaceAfter the word forms after which no space is written
     * @param cssList the stylesheet references to link from each page
     */
    public XTZDefaultPagerStep(XTZPager pager, File infile, String txtname, List<String> NoSpaceBefore,
    List<String> NoSpaceAfter, def cssList) {
        this.pager = pager;
        this.paginationElement = pager.page_element;
        this.cssList = cssList;
        this.basename = pager.corpusname;
        this.txtname = txtname;
        this.outdir = pager.outputDirectory;
        this.wordmax = pager.wordsPerPage;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;
        this.wordTag = pager.wordTag;
        outdir.mkdirs()

        inputData = new BufferedInputStream(url.openStream());
        factory = XMLInputFactory.newInstance();
        parser = factory.createXMLStreamReader(inputData);

        String notesListString = pager.getImportModule().getParameters().getNoteElement().getTextContent()
        addElementNames(notesListString, noteElements)

        String elems = pager.getImportModule().getParameters().getOutSideTextTagsAndKeepContentElement().getTextContent()
        addElementNames(elems, outOfTextElements)

        process();
    }

    /**
     * Adds the element names of a comma separated list to a set.
     * Names are trimmed and empty entries are skipped, so "note, ref" and "note,ref" are equivalent.
     *
     * @param commaSeparatedNames the list to split, may be null
     * @param target the set receiving the names
     */
    private static void addElementNames(String commaSeparatedNames, Set<String> target) {
        if (commaSeparatedNames == null) return;
        for (String elementName : commaSeparatedNames.split(",")) {
            elementName = elementName.trim();
            if (elementName.length() > 0) target << elementName;
        }
    }

    /**
     * Gets the value of an attribute of the element the parser is positioned on.
     * Attributes are matched on their local name only.
     *
     * @param parser the parser positioned on a start element
     * @param ns ignored, kept for compatibility with existing callers
     * @param name the local name of the attribute
     * @return the attribute value, or "" (never null) if the element has no such attribute
     */
    public String getAttributeValue(def parser, String ns, String name) {
        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
            if (name == parser.getAttributeLocalName(i)) {
                return parser.getAttributeValue(i).toString()
            }
        }
        return "";
    }

    /**
     * Finishes the current page: closes its opened elements, writes its notes and closes its writer.
     *
     * @return a copy of the stack of the tags that were opened when the page was closed,
     *         or an empty list if no page was opened
     */
    private def closeMultiWriter() {
        if (pagedWriter == null) {
            return [];
        }

        // copy the stack before writeEndElements() is called on the writer
        def tags = pagedWriter.getTagStack().clone();

        if (firstWord) { // there was no word on this page: register a placeholder id
            // typed as String so that a java.lang.String, not a GString, is stored in idxstart
            String emptyPageId = "${wordTag}_0"
            pagedWriter.writeCharacters("");
            this.idxstart.add(emptyPageId)
            pagedWriter.write("<span id=\"${wordTag}_0\"/>");
        }
        pagedWriter.writeEndElements();

        // NOTE(review): the notes are written after writeEndElements(); if that call also
        // closes "body" and "html" the notes end up after the root element -- confirm
        if (notes.size() > 0) {
            pagedWriter.writeEmptyElement("hr", ["id":"notes", "width":"20%", "align":"left"]);
            int i = 1;
            for (String note : notes) {
                pagedWriter.writeStartElement("a", ["href":"#noteref_"+i, "name":"note_"+i]);
                pagedWriter.writeStartElement("sup")
                pagedWriter.writeCharacters(""+i)
                pagedWriter.writeEndElement() // </sup>
                pagedWriter.writeEndElement() // </a>
                pagedWriter.writeCharacters(note)
                pagedWriter.writeEmptyElement("br")
                i++;
            }
            notes.clear()
        }

        pagedWriter.close();
        return tags;
    }

    /**
     * Closes the current page, if any, and opens the next one.
     * The elements that were opened in the closed page are re-opened in the new page.
     *
     * @return true if the new page was created, false if an exception occurred
     */
    private boolean createNextOutput() {
        wordcount = 0;
        try {
            def tags = closeMultiWriter();

            // drop the leading tags up to and including the first "div": they are the page
            // skeleton, which is written again below
            while (tags.size() > 0) {
                String tag = tags.remove(0)
                if ("div" == tag) break;
            }

            File outfile = new File(outdir, txtname+"_"+(++pagecount)+".html")
            pages.add(outfile);
            firstWord = true; // waiting for next word

            pagedWriter = new StaxStackWriter(outfile, "UTF-8");

            pagedWriter.writeStartDocument("UTF-8", "1.0")
            pagedWriter.writeStartElement("html");
            // NOTE(review): "meta" and "link" are written before "head" is opened; kept as is
            // because readers of the generated pages may rely on this layout -- confirm
            pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
            for (String css : cssList) {
                pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"$css"]);
            }
            pagedWriter.writeStartElement("head");
            pagedWriter.writeStartElement("title")
            pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
            pagedWriter.writeEndElement(); // </title>
            pagedWriter.writeEndElement() // </head>
            pagedWriter.writeStartElement("body") //<body>
            pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div>
            pagedWriter.writeStartElements(tags);
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Creates the output: same as {@link #createNextOutput()}, which already reports
     * exceptions by returning false.
     *
     * @return true, if successful
     */
    private boolean createOutput() {
        return createNextOutput();
    }

    /**
     * Gets the page files.
     *
     * @return the page files, in creation order
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the identifiers of the first word of each page.
     *
     * @return the first word identifiers, one per page
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Moves the parser just after the end tag of the "teiHeader" element.
     * If there is no such element the parser is left on the end of the document.
     */
    private void goToText() {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.END_ELEMENT && "teiHeader" == parser.getLocalName()) {
                return;
            }
        }
    }

    /**
     * Releases the parser and the stream opened on the input file.
     * Closing the parser does not close its underlying stream, so both are closed.
     * Failures are only reported so that they cannot hide an exception raised by the conversion.
     */
    private void closeInput() {
        try {
            if (parser != null) parser.close();
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
        }
        try {
            if (inputData != null) inputData.close();
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
        }
    }

    /**
     * Reads the input file and writes the HTML pages.
     * The parser and the input stream are released when the method ends, whether it
     * succeeds or not; the method is therefore meant to be run once per instance.
     *
     * @return true when the whole file has been converted
     */
    public boolean process() {
        boolean flagNote = false; // true while inside a note element
        String noteContent = "";
        String rend = ""
        try {
            goToText();

            String localname = "";
            createNextOutput();

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                rend = "";
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName();
                        if (outOfTextElements.contains(localname)) {
                            currentOutOfTextElements << localname
                            writeOutOfTextToEditText = true;
                        } else if (currentOutOfTextElements.size() > 0) {
                            currentOutOfTextElements << localname
                        }

                        if (localname == paginationElement) {
                            createNextOutput(); // also resets wordcount
                            pagedWriter.write("\n");
                            // getAttributeValue() returns "" for a missing attribute:
                            // only write the page number paragraph when there is a number
                            String pageNumber = getAttributeValue(parser, null, "n")
                            if (pageNumber.length() > 0) {
                                pagedWriter.writeElement("p", ["class":"txmeditionpb", "align":"center"], pageNumber)
                            }
                        }

                        rend = getAttributeValue(parser, null, "rend") // "" when missing
                        switch (localname) {
                            case "text":
                                LinkedHashMap attributes = new LinkedHashMap();
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                    attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
                                }

                                pagedWriter.write("\n");
                                pagedWriter.writeStartElement("p")
                                pagedWriter.writeAttribute("class", rend);
                                if (attributes.containsKey("id")) {
                                    pagedWriter.writeElement("h3", attributes["id"])
                                }

                                // the other attributes are listed in a name/value table
                                pagedWriter.writeStartElement("table");
                                for (String k : attributes.keySet()) {
                                    if (k == "id") continue;
                                    if (k == "rend") continue;

                                    pagedWriter.writeStartElement("tr");
                                    pagedWriter.writeElement("td", k);
                                    pagedWriter.writeElement("td", attributes[k]);
                                    pagedWriter.writeEndElement();
                                }
                                pagedWriter.writeEndElement() // table
                                pagedWriter.writeEndElement() // p

                                pagedWriter.writeCharacters("");
                                break;
                            case "ref":
                                pagedWriter.writeStartElement("a")
                                pagedWriter.writeAttribute("href", getAttributeValue(parser, null, "target"));

                                pagedWriter.writeAttribute("target", "_blank");
                                pagedWriter.writeAttribute("class", rend);
                                break;
                            case "head":
                                pagedWriter.write("\n");
                                pagedWriter.writeStartElement("h2", ["class":rend])
                                break;
                            case "graphic":
                                pagedWriter.write("\n");
                                String graphicUrl = getAttributeValue(parser, null, "url")
                                if (graphicUrl.length() > 0) { // no image without "url" attribute
                                    // TEI <graphic rend="left-image" url="image.png"/> -> <center class="left-image"><img src="image.png"/></center> + <corpus>.css with a ".left-image" rule
                                    pagedWriter.writeStartElement("center", ["class":rend])
                                    pagedWriter.writeEmptyElement("img", ["src":graphicUrl, "align":"middle"])
                                    pagedWriter.writeEndElement() // </center>
                                }
                                break;
                            case "table":
                                pagedWriter.writeStartElement("table", ["class":rend])
                                pagedWriter.write("\n");
                                break;
                            case "row":
                                pagedWriter.writeStartElement("tr", ["class":rend])
                                break;
                            case "cell":
                                pagedWriter.writeStartElement("td", ["class":rend])
                                break;
                            case "list":
                                String type = getAttributeValue(parser, null,"type");
                                if ("unordered" == type) {
                                    pagedWriter.writeStartElement("ul", ["class":rend])
                                } else {
                                    pagedWriter.writeStartElement("ol", ["class":rend])
                                }
                                break
                            case "item":
                                pagedWriter.writeStartElement("li", ["class":rend])
                                break;
                            case "hi":
                            case "emph":
                                if ("i".equals(rend) || "italic".equals(rend)) {
                                    pagedWriter.writeStartElement("i", ["class":rend])
                                } else if ("b".equals(rend) || "bold".equals(rend)) {
                                    pagedWriter.writeStartElement("b", ["class":rend])
                                } else if ("emph".equals(localname)) { // default rendering of emph
                                    pagedWriter.writeStartElement("i", ["class":rend])
                                } else { // default rendering of hi
                                    pagedWriter.writeStartElement("b", ["class":rend])
                                }
                                break;
                            case "p":
                                pagedWriter.write("\n");
                                pagedWriter.writeStartElement("p", ["class":rend])
                                break;
                            case "lb":
                                pagedWriter.writeEmptyElement("br")
                                break;
                            case wordTag:
                                wordid = getAttributeValue(parser, null,"id");

                                wordcount++;
                                if (wordcount >= wordmax) {
                                    createNextOutput();
                                }

                                if (firstWord) {
                                    firstWord = false;
                                    this.idxstart.add(wordid);
                                }
                                break;
                            case "ana":
                                flaginterp = true;
                                String anaType = getAttributeValue(parser, null, "type")
                                // the first character of the type is dropped (presumably the "#" of a
                                // pointer -- confirm); a missing type no longer raises an exception
                                if (anaType.length() > 0) anaType = anaType.substring(1)
                                interpvalue += " "+anaType+":"
                                break;
                            case "form":
                                wordvalue = "";
                                interpvalue = "";
                                flagform = true;
                                break;
                            default:
                                if (noteElements.contains(localname)) {
                                    flagNote = true;
                                    noteContent = ""
                                }
                                break;
                        }
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        localname = parser.getLocalName();
                        if (currentOutOfTextElements.size() > 0) currentOutOfTextElements.pop()
                        writeOutOfTextToEditText = currentOutOfTextElements.size() > 0

                        switch (localname) {
                            case "p":
                                pagedWriter.writeEndElement() // </p>
                                pagedWriter.write("\n");
                                break;
                            case "head":
                                pagedWriter.writeEndElement() // </h2>
                                pagedWriter.write("\n");
                                break;
                            case "list":
                                pagedWriter.writeEndElement(); // ul or ol
                                pagedWriter.write("\n");
                                break
                            case "item":
                                pagedWriter.writeEndElement(); // li
                                pagedWriter.write("\n");
                                break;
                            case "hi":
                                pagedWriter.writeEndElement(); // b or i
                                break;
                            case "emph":
                                pagedWriter.writeEndElement(); // i or b
                                break;
                            case "table":
                                pagedWriter.writeEndElement(); // table
                                pagedWriter.write("\n");
                                break;
                            case "row":
                                pagedWriter.writeEndElement(); // tr
                                break;
                            case "cell":
                                pagedWriter.writeEndElement(); // td
                                break;
                            case "ref":
                                pagedWriter.writeEndElement() // </a>
                                break;
                            case "form":
                                flagform = false
                                break;
                            case "ana":
                                flaginterp = false
                                break;
                            case wordTag:
                                int l = lastword.length();
                                String endOfLastWord = "";
                                if (l > 0) endOfLastWord = lastword.substring(l - 1);

                                // the word is separated from the previous one by a space unless
                                // one of the two forbids it
                                boolean noSpace = NoSpaceBefore.contains(wordvalue) ||
                                        NoSpaceAfter.contains(lastword) ||
                                        wordvalue.startsWith("-") ||
                                        NoSpaceAfter.contains(endOfLastWord)
                                if (!noSpace) {
                                    pagedWriter.writeCharacters(" ");
                                }
                                pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                                pagedWriter.writeCharacters(wordvalue);
                                pagedWriter.writeEndElement();
                                lastword = wordvalue;
                                break;
                            default:
                                if (noteElements.contains(localname)) {
                                    flagNote = false;
                                    if (noteContent.length() > 0) {
                                        // the note text is kept for the end of the page and a
                                        // numbered link to it is written in place
                                        notes << noteContent;
                                        pagedWriter.writeStartElement("a", ["href":"#note_"+notes.size(), "name":"noteref_"+notes.size(), "title":noteContent]);
                                        pagedWriter.writeStartElement("sup")
                                        pagedWriter.writeCharacters(""+notes.size())
                                        pagedWriter.writeEndElement() // </sup>
                                        pagedWriter.writeEndElement() // </a>
                                    }
                                }
                                break;
                        }
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        String text = parser.getText()
                        boolean hasText = text.length() > 0
                        if (flagform && hasText) {
                            wordvalue += text;
                            // a word located inside a note also contributes to the note text
                            if (flagNote) {
                                noteContent += text.replace("\n", " ");
                            }
                        } else if (flaginterp && hasText) {
                            interpvalue += text;
                        } else if (flagNote && hasText) {
                            noteContent += text.replace("\n", " ");
                        } else if (writeOutOfTextToEditText) {
                            pagedWriter.writeCharacters(text)
                        }
                        break;
                }
            }
            closeMultiWriter();
            return true;
        } finally {
            closeInput();
        }
    }
}