Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / doc / pager.groovy @ 1688

History | View | Annotate | Download (13 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
25
// $LastChangedRevision: 2174 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.doc;
29

    
30
import org.txm.scripts.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.scripts.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds the paginated HTML edition of one XML-TEI text of the "doc" import module.
 *
 * The source file is read with a StAX parser. Everything up to the end of the
 * {@code teiHeader} element is skipped, then TEI elements are translated to HTML
 * and written to page files named {@code <txtname>_<pageNumber>.html} in the
 * output directory. A new page is started when an element named like
 * {@code editionPage} opens, or when a {@code lg}, {@code p} or {@code q} element
 * closes and at least {@code wordmax} words were written on the current page.
 *
 * The whole conversion runs from the constructor; afterwards the results are
 * available through {@link #getPageFiles()} and {@link #getIdx()}.
 *
 * @author mdecorde
 */
class pager {
    /** Word forms that are written without a space before them. */
    List<String> NoSpaceBefore;

    /** Word forms (or last characters of a word) after which no space is written. */
    List<String> NoSpaceAfter;

    /** Number of words seen since the current page was opened. */
    int wordcount = 0;

    /** Number of the current page (1-based once the first page exists). */
    int pagecount = 0;

    /** Word count from which a closing lg/p/q element triggers a new page. */
    int wordmax = 0;

    /** Name written (upper-cased) in the title of every page. */
    String basename = "";

    /** Text name, used as prefix of the page files and for the text specific CSS link. */
    String txtname = "";

    /** Directory receiving the page files. */
    File outdir;

    /** Value of the "id" attribute of the last opened w element. */
    String wordid;

    /** True until the first word of the current page has been seen. */
    boolean firstWord = true;

    /** Form of the current word, accumulated from the text of its form element. */
    String wordvalue = "";

    /** Annotations of the current word, shown as title of the word span. */
    String interpvalue = "";

    /** Form of the previously written word. */
    String lastword = " ";

    /** The wordtype (not used by this class). */
    String wordtype;

    /** True while the parser is inside a form element. */
    boolean flagform = false;

    /** True while the parser is inside an ana element. */
    boolean flaginterp = false;

    /** URL of the source file. */
    private def url;

    /** Stream opened on the source file. */
    private def inputData;

    /** StAX factory used to create the parser. */
    private def factory;

    /** StAX parser reading the source file. */
    private XMLStreamReader parser;

    /** The writer (not used by this class). */
    OutputStreamWriter writer;

    /** The multiwriter (not used by this class). */
    OutputStreamWriter multiwriter = null;

    /** Writer of the current page file. */
    StaxStackWriter pagedWriter = null;

    /** The source file. */
    File infile;

    /** The outfile (never assigned by this class). */
    File outfile;

    /** Page files created so far, in creation order. */
    ArrayList<File> pages = new ArrayList<File>();

    /** For each page, the id of its first word ("w_0" when the page has no word). */
    ArrayList<String> idxstart = new ArrayList<String>();

    /** Local name of the element that starts a new page. */
    String editionPage;

    /** "rend" value of the last opened figure element, applied to the next graphic. */
    private String figurerend = "";

    /**
     * Instantiates a new pager and immediately builds all the pages.
     *
     * @param infile the XML-TEI source file
     * @param outdir the directory receiving the page files
     * @param txtname the text name, used as prefix of the page file names
     * @param NoSpaceBefore the word forms written without a space before them
     * @param NoSpaceAfter the word forms after which no space is written
     * @param max the word count from which a closing lg/p/q starts a new page
     * @param basename the name shown in the page titles
     * @param editionPage the local name of the element that starts a new page
     */
    pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
            List<String> NoSpaceAfter, int max, String basename, String editionPage) {
        this.editionPage = editionPage;
        this.basename = basename;
        this.txtname = txtname;
        this.outdir = outdir;
        this.wordmax = max;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;

        inputData = url.openStream();
        factory = XMLInputFactory.newInstance();
        parser = factory.createXMLStreamReader(inputData);
        process();
    }

    /**
     * Finishes and closes the current page, if there is one.
     *
     * When the page received no word, the placeholder id "w_0" is registered as
     * its first word and an empty span carrying that id is written.
     *
     * @return a copy of the writer's tag stack taken before the page was finished,
     *         or an empty list when no page was open
     */
    private def closeMultiWriter() {
        if (pagedWriter == null) {
            return [];
        }

        // copy first: the stack is needed by the caller to reopen the elements on the next page
        def tags = pagedWriter.getTagStack().clone();
        if (firstWord) { // there was no word on this page
            this.idxstart.add("w_0");
            pagedWriter.write("<span id=\"w_0\"/>");
        }

        pagedWriter.writeEndElements();
        pagedWriter.close();
        return tags;
    }

    /**
     * Closes the current page and opens the next one.
     *
     * The elements that were open below "body" on the previous page are reopened
     * on the new page. The word counter is reset.
     *
     * @return true, if successful
     */
    private boolean createNextOutput() {
        wordcount = 0;
        try {
            def tags = closeMultiWriter();
            // drop every tag up to and including "body": the page skeleton is rewritten below
            while (!tags.isEmpty()) {
                String tag = tags.remove(0);
                if ("body" == tag) {
                    break;
                }
            }

            File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html");
            pages.add(pageFile);
            firstWord = true; // waiting for next word

            pagedWriter = new StaxStackWriter(pageFile, "UTF-8");
            pagedWriter.writeStartDocument("UTF-8", "1.0");
            pagedWriter.writeStartElement("html");
            // NOTE(review): meta is written before head, as it always was; kept to leave the output unchanged
            pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
            pagedWriter.writeStartElement("head");
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "href":txtname+".css"]);
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "href":"doc.css"]);
            pagedWriter.writeStartElement("title");
            pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount);
            pagedWriter.writeEndElement(); // </title>
            pagedWriter.writeEndElement(); // </head>
            pagedWriter.writeStartElement("body"); // <body>
            pagedWriter.writeStartElements(tags);
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            org.txm.utils.logger.Log.printStackTrace(e);
            return false;
        }
    }

    /**
     * Creates the output; same as {@link #createNextOutput()}.
     *
     * @return true, if successful
     */
    private boolean createOutput() {
        // createNextOutput() already catches and reports every Exception
        return createNextOutput();
    }

    /**
     * Gets the page files.
     *
     * @return the page files, in creation order
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the idx.
     *
     * @return the id of the first word of each page
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Moves the parser just after the end tag of the teiHeader element.
     * Prints a warning when the end of the document is reached without finding it.
     */
    void gotoText() {
        while (parser.hasNext()) {
            int event = parser.next();
            if (event == XMLStreamConstants.END_ELEMENT && parser.getLocalName() == "teiHeader") {
                return;
            }
        }
        println "Warning: no teiHeader end tag found in ${infile}"
    }

    /**
     * Reads the source file and writes all the pages.
     *
     * Errors are reported on the console and through the TXM log; they are not
     * rethrown. The parser and the input stream are closed in every case.
     */
    void process() {
        try {
            createNextOutput();
            gotoText();

            // hasNext() is false once END_DOCUMENT was read, also when gotoText() consumed the whole file
            while (parser.hasNext()) {
                switch (parser.next()) {
                    case XMLStreamConstants.START_ELEMENT:
                        handleStartElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        handleEndElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        handleCharacters(parser.getText());
                        break;
                }
            }
            closeMultiWriter();
        } catch (Exception e) {
            println "Error while processing ${infile} at "+parser.getLocation().getLineNumber()+": "+e
            org.txm.utils.logger.Log.printStackTrace(e);
            // pagedWriter is null when the very first page could not be created
            if (pagedWriter != null) pagedWriter.close();
        } finally {
            try {
                if (parser != null) parser.close();
            } catch (Exception ignored) {
                // nothing more can be done with a parser that fails to close
            }
            try {
                if (inputData != null) inputData.close();
            } catch (Exception ignored) {
                // nothing more can be done with a stream that fails to close
            }
        }
    }

    /** Returns the "rend" attribute of the current element, or "normal" when it is absent. */
    private String rendOrNormal() {
        String rend = parser.getAttributeValue(null, "rend");
        return rend == null ? "normal" : rend;
    }

    /** Translates the start tag the parser is positioned on; starts a new page first when needed. */
    private void handleStartElement(String localname) {
        if (localname == editionPage) {
            createNextOutput(); // also resets wordcount
            String pageNumber = parser.getAttributeValue(null, "n");
            if (pageNumber != null) {
                pagedWriter.writeStartElement("p", ["style":"color:red", "align":"center"]);
                pagedWriter.writeCharacters("- "+pageNumber+" -");
                pagedWriter.writeEndElement(); // p
            }
        }

        switch (localname) {
            case "textunit":
                String unitId = parser.getAttributeValue(null, "id");
                if (unitId != null) {
                    pagedWriter.writeStartElement("h3");
                    pagedWriter.writeCharacters(unitId);
                    pagedWriter.writeEndElement(); // h3
                }
                // one table row per attribute: name, value
                pagedWriter.writeStartElement("table");
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                    pagedWriter.writeStartElement("tr");
                    pagedWriter.writeStartElement("td");
                    pagedWriter.writeCharacters(parser.getAttributeLocalName(i));
                    pagedWriter.writeEndElement(); // td
                    pagedWriter.writeStartElement("td");
                    pagedWriter.writeCharacters(parser.getAttributeValue(i));
                    pagedWriter.writeEndElement(); // td
                    pagedWriter.writeEndElement(); // tr
                }
                pagedWriter.writeEndElement(); // table
                pagedWriter.writeEmptyElement("br");
                break;
            case "head":
                pagedWriter.writeStartElement("h2", ["class":rendOrNormal()]);
                break;
            case "figure":
                figurerend = parser.getAttributeValue(null, "rend");
                break;
            case "graphic":
                String src = parser.getAttributeValue(null, "url");
                if (src != null) { // test for null BEFORE stripping the leading "../"
                    if (src.startsWith("..//")) src = src.substring(4);
                    else if (src.startsWith("../")) src = src.substring(3);

                    pagedWriter.writeStartElement("div");
                    pagedWriter.writeEmptyElement("img", ["class":figurerend, "src":src]);
                    pagedWriter.writeEndElement(); // div
                }
                figurerend = "";
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.writeStartElement("p", ["class":rendOrNormal()]);
                break;
            case "table":
                pagedWriter.writeStartElement("table", ["class":parser.getAttributeValue(null, "rend")]);
                break;
            case "row":
                pagedWriter.writeStartElement("tr");
                break;
            case "cell":
                pagedWriter.writeStartElement("td");
                break;
            case "lb":
            case "br":
                pagedWriter.writeEmptyElement("br");
                break;
            case "list":
                String listType = parser.getAttributeValue(null, "type");
                pagedWriter.writeStartElement("unordered" == listType ? "ul" : "ol");
                break;
            case "item":
                pagedWriter.writeStartElement("li");
                break;
            case "hi":
                pagedWriter.writeStartElement("b");
                break;
            case "emph":
                pagedWriter.writeStartElement("i");
                break;
            case "w":
                wordid = parser.getAttributeValue(null, "id");
                // a word without form element must not reuse the values of the previous word
                wordvalue = "";
                interpvalue = "";
                if (firstWord) {
                    firstWord = false;
                    this.idxstart.add(wordid);
                }
                wordcount++;
                break;
            case "ana":
                flaginterp = true;
                String anaType = parser.getAttributeValue(null, "type");
                // the first character of the type is dropped; a missing or empty type gives an empty name
                String anaName = anaType ? anaType.substring(1) : "";
                interpvalue += " "+anaName+":";
                break;
            case "form":
                wordvalue = "";
                interpvalue = "";
                flagform = true;
                break;
        }
    }

    /** Translates the end tag the parser is positioned on; a closing w element writes the word. */
    private void handleEndElement(String localname) {
        switch (localname) {
            case "head":
                pagedWriter.writeEndElement(); // h2
                pagedWriter.writeCharacters("\t");
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.writeEndElement(); // p
                pagedWriter.writeCharacters("\t");
                if (wordcount >= wordmax) {
                    createNextOutput();
                }
                break;
            case "list":
                pagedWriter.writeEndElement(); // ul or ol
                pagedWriter.writeCharacters("\t");
                break;
            case "item":
                pagedWriter.writeEndElement(); // li
                pagedWriter.writeCharacters("\t");
                break;
            case "hi":
                pagedWriter.writeEndElement(); // b
                break;
            case "emph":
                pagedWriter.writeEndElement(); // i
                break;
            case "table":
                pagedWriter.writeEndElement(); // table
                pagedWriter.writeCharacters("\t");
                break;
            case "row":
                pagedWriter.writeEndElement(); // tr
                pagedWriter.writeCharacters("\t");
                break;
            case "cell":
                pagedWriter.writeEndElement(); // td
                break;
            case "form":
                flagform = false;
                break;
            case "ana":
                flaginterp = false;
                break;
            case "w":
                writeWord();
                break;
        }
    }

    /** Appends parsed text to the word form and/or to the annotations, depending on the open element. */
    private void handleCharacters(String text) {
        if (flagform) {
            wordvalue += text;
        }
        if (flaginterp) {
            interpvalue += text;
        }
    }

    /** Writes the current word as a span, preceded by a space unless the no-space lists forbid it. */
    private void writeWord() {
        String endOfLastWord = lastword.isEmpty() ? "" : lastword.substring(lastword.length() - 1);

        boolean glued = NoSpaceBefore.contains(wordvalue) ||
                NoSpaceAfter.contains(lastword) ||
                wordvalue.startsWith("-") ||
                NoSpaceAfter.contains(endOfLastWord);
        if (!glued) {
            pagedWriter.writeCharacters(" ");
        }

        pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
        pagedWriter.writeCharacters(wordvalue);
        pagedWriter.writeEndElement(); // span
        lastword = wordvalue;
    }
}