Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / doc / pager.groovy @ 187

History | View | Annotate | Download (12.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
25
// $LastChangedRevision: 2174 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.doc;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/** 
46
 * Build edition from xml-tei-odt
47
 * @author mdecorde
48
 * 
49
 */
50
class pager {
51
        // ---- spacing rules -------------------------------------------------

        /** Word forms that are written without a space in front of them. */
        List<String> NoSpaceBefore;

        /** Word forms (or last characters of the previous word) after which no space is written. */
        List<String> NoSpaceAfter;

        // ---- pagination state ----------------------------------------------

        /** Number of w elements met since the current page was opened. */
        int wordcount = 0;

        /** Number of pages created so far; also used to number the page files. */
        int pagecount = 0;

        /** Word count from which a new page is started at the end of a p, lg or q element. */
        int wordmax = 0;

        // ---- naming and output location ------------------------------------

        /** Name shown, upper-cased, in the title of every page. */
        String basename = "";
        /** Prefix of the page file names and name of the text specific stylesheet. */
        String txtname = "";
        /** Directory in which the page files are created. */
        File outdir;

        // ---- state of the word being read ----------------------------------

        /** Value of the id attribute of the last w element opened. */
        String wordid;

        /** True until the first w element of the current page has been met. */
        boolean firstWord = true;

        /** Text accumulated inside the current form element. */
        String wordvalue;

        /** Text accumulated from the ana elements of the current word. */
        String interpvalue;

        /** Form of the previously written word, used to decide whether a space is needed. */
        String lastword = " ";

        /** Not used in the visible code. */
        String wordtype;

        /** True while the parser is inside a form element. */
        boolean flagform = false;

        /** True while the parser is inside an ana element. */
        boolean flaginterp = false;

        // ---- input ---------------------------------------------------------

        /** URL built from the input file. */
        private def url;

        /** Stream opened on the input URL. */
        private def inputData;

        /** StAX input factory used to create the parser. */
        private def factory;

        /** StAX reader over the input stream. */
        private XMLStreamReader parser;

        // ---- output --------------------------------------------------------

        /** Not used in the visible code (only referenced by commented-out statements). */
        OutputStreamWriter writer;

        /** Not used in the visible code (only referenced by commented-out statements). */
        OutputStreamWriter multiwriter = null;

        /** Writer of the page currently being built; null before the first page is created. */
        StaxStackWriter pagedWriter = null;

        /** The XML input file. */
        File infile;

        /** Never assigned in the visible code. */
        File outfile;

        /** Page files created so far, in creation order. */
        ArrayList<File> pages = new ArrayList<File>();

        /** Id of the first word of each page ("w_0" for a page that holds no word). */
        ArrayList<String> idxstart = new ArrayList<String>();
        /** Local name of the element whose start tag forces a new page. */
        String editionPage;
127

    
128
        /**
         * Instantiates a new pager and immediately paginates the input file:
         * the constructor opens the file, creates the StAX reader and calls
         * {@link #process()}.
         *
         * @param infile the XML file to read
         * @param outdir the directory in which the HTML pages are created
         * @param txtname the prefix of the page file names, also used as the name of the text stylesheet
         * @param NoSpaceBefore the word forms that must not be preceded by a space
         * @param NoSpaceAfter the word forms (or final characters) that must not be followed by a space
         * @param max the word count from which a new page is started at the end of a p, lg or q element
         * @param basename the name written, upper-cased, in the title of each page
         * @param editionPage the local name of the element whose start tag forces a new page
         */
        pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
        List<String> NoSpaceAfter, int max, String basename, String editionPage) {
                this.editionPage = editionPage;
                this.basename = basename;
                this.txtname = txtname;
                this.outdir = outdir;
                this.wordmax = max;
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;
                this.url = infile.toURI().toURL();
                this.infile = infile;

                inputData = url.openStream();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (Exception e) {
                        // the reader could not be created: release the stream before propagating the failure
                        inputData.close();
                        throw e;
                }
                process();
        }
155

    
156
        /**
         * Finishes the page currently being written, if any.
         *
         * When no word was met on the page, "w_0" is registered as its first
         * word id and an empty span carrying that id is written, so that every
         * page has an entry in the index.
         *
         * @return a copy of the writer's tag stack taken before the end tags
         *         are written, or an empty list when no page is open
         */
        private def closeMultiWriter()
        {
                if (pagedWriter == null) {
                        return [];
                }

                // copy taken first: the caller needs the tags that were open on this page
                def openTags = pagedWriter.getTagStack().clone();

                if (firstWord) { // the page holds no word
                        idxstart.add("w_0")
                        pagedWriter.write("<span id=\"w_0\"/>");
                }

                pagedWriter.writeEndElements();
                pagedWriter.close();
                return openTags;
        }
176

    
177
        /**
         * Closes the current page, if any, and opens the next one.
         *
         * The new page file is named txtname_N.html (N being the new page
         * count) and is registered in the page list. The HTML prologue is
         * written, then the elements that were still open inside body on the
         * previous page are re-opened so that the content can continue.
         * The word counter is reset even when the creation fails.
         *
         * @return true, if successful; false when an exception was raised
         *         (its message is printed on the standard output)
         */
        private boolean createNextOutput()
        {
                wordcount = 0;
                try {
                        def tags = closeMultiWriter()
                        // discard the leading tags up to and including "body" (all of them
                        // when there is no "body"): they are written again by the prologue below
                        while (tags.size() > 0) {
                                String dropped = tags.remove(0)
                                if ("body" == dropped) {
                                        break;
                                }
                        }

                        File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html")
                        pages.add(pageFile)
                        firstWord = true // waiting for next word

                        pagedWriter = new StaxStackWriter(pageFile , "UTF-8");
                        pagedWriter.writeStartDocument("UTF-8", "1.0")
                        pagedWriter.writeStartElement("html")
                        pagedWriter.writeStartElement("head")
                        // metadata belongs inside <head>, not directly under <html>
                        pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"])
                        pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "href":txtname+".css"])
                        pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "href":"doc.css"])
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
                        pagedWriter.writeEndElement() // </title>
                        pagedWriter.writeEndElement() // </head>
                        pagedWriter.writeStartElement("body") //<body>
                        pagedWriter.writeStartElements(tags)
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
219

    
220
        /**
         * Opens the next page by delegating to {@link #createNextOutput()}.
         *
         * Any exception escaping the delegate is reported on the standard
         * output and turned into a false result.
         *
         * @return true, if successful
         */
        private boolean createOutput() {
                boolean created;
                try {
                        created = createNextOutput();
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        created = false;
                }
                return created;
        }
234

    
235
        /**
         * Gives access to the HTML page files created so far.
         *
         * @return the internal list of page files, in creation order
         */
        public ArrayList<File> getPageFiles() {
                return this.pages;
        }
243

    
244
        /**
         * Gives access to the page index.
         *
         * @return the internal list holding the id of the first word of each
         *         page ("w_0" for a page without any word)
         */
        public ArrayList<String> getIdx() {
                return this.idxstart;
        }
252

    
253
        /**
         * Advances the parser until the end tag of the teiHeader element has
         * been read, or until the end of the document when there is none.
         */
        void gotoText() {
                int event = parser.next();
                while (event != XMLStreamConstants.END_DOCUMENT) {
                        boolean headerClosed = event == XMLStreamConstants.END_ELEMENT && parser.getLocalName() == "teiHeader";
                        if (headerClosed) {
                                return;
                        }
                        event = parser.next();
                }
        }
262
        /**
         * Reads the document after its teiHeader and writes it as HTML pages.
         *
         * Start tags are mapped as follows: textunit to an optional h3 (its
         * id) followed by a table of its attributes, head to h2, graphic to
         * an img wrapped in a div, lg/p/q to p, table/row/cell to table/tr/td,
         * lb/br to br, list to ul or ol, item to li, hi to b and emph to i.
         * The words (w elements) are written when their end tag is met, as a
         * span whose title is the text collected from the ana elements and
         * whose content is the text collected from the form element.
         *
         * A new page is started when the editionPage element is met, and
         * after a closing lg, p or q once wordmax words have been counted.
         *
         * Exceptions are logged and the current page writer is closed. The
         * parser and the input stream are released in every case, so this
         * method can be run only once per instance.
         */
        void process() {
                try {
                        String localname = "";
                        String figurerend = ""
                        createNextOutput();

                        gotoText();
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName();
                                                switch (localname) {
                                                        case "textunit":
                                                                if (parser.getAttributeValue(null,"id") != null) {
                                                                        pagedWriter.writeStartElement("h3")
                                                                        pagedWriter.writeCharacters(parser.getAttributeValue(null,"id"))
                                                                        pagedWriter.writeEndElement() // h3
                                                                }
                                                                // one table row per attribute: name, value
                                                                pagedWriter.writeStartElement("table")
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        pagedWriter.writeStartElement("tr")
                                                                        pagedWriter.writeStartElement("td")
                                                                        pagedWriter.writeCharacters(parser.getAttributeLocalName(i))
                                                                        pagedWriter.writeEndElement() // td
                                                                        pagedWriter.writeStartElement("td")
                                                                        pagedWriter.writeCharacters(parser.getAttributeValue(i))
                                                                        pagedWriter.writeEndElement() // td
                                                                        pagedWriter.writeEndElement() // tr
                                                                }
                                                                pagedWriter.writeEndElement() // table
                                                                pagedWriter.writeEmptyElement("br")
                                                                break;
                                                        case "head":
                                                                String headRend = parser.getAttributeValue(null, "rend")
                                                                if (headRend == null) headRend = "normal"
                                                                pagedWriter.writeStartElement("h2", ["class":headRend])
                                                                break;
                                                        case "figure":
                                                                // kept for the graphic element that follows
                                                                figurerend = parser.getAttributeValue(null, "rend")
                                                                break;
                                                        case "graphic":
                                                                String src = parser.getAttributeValue(null, "url")
                                                                if (src != null) { // a graphic without url is skipped
                                                                        // strip a leading "..//" or "../" from the image path
                                                                        if (src.startsWith("..//")) src = src.substring(4);
                                                                        else if (src.startsWith("../")) src = src.substring(3);

                                                                        pagedWriter.writeStartElement("div")
                                                                        pagedWriter.writeEmptyElement("img", ["class":figurerend, "src":src])
                                                                        pagedWriter.writeEndElement() // div
                                                                }
                                                                figurerend = "";
                                                                break;
                                                        case "lg":
                                                        case "p":
                                                        case "q":
                                                                String blockRend = parser.getAttributeValue(null, "rend")
                                                                if (blockRend == null) blockRend = "normal"
                                                                pagedWriter.writeStartElement("p", ["class":blockRend])
                                                                break;
                                                        //case "pb":
                                                        case editionPage:
                                                                createNextOutput(); // also resets wordcount
                                                                if (parser.getAttributeValue(null,"n") != null) {
                                                                        pagedWriter.writeStartElement("p", ["style":"color:red", "align":"center"])
                                                                        pagedWriter.writeCharacters("- "+parser.getAttributeValue(null,"n")+" -")
                                                                        pagedWriter.writeEndElement() // p
                                                                }
                                                                break;
                                                        case "table":
                                                                pagedWriter.writeStartElement("table", ["class":parser.getAttributeValue(null, "rend")])
                                                                break;
                                                        case "row":
                                                                pagedWriter.writeStartElement("tr")
                                                                break;
                                                        case "cell":
                                                                pagedWriter.writeStartElement("td")
                                                                break;
                                                        case "lb":
                                                        case "br":
                                                                pagedWriter.writeEmptyElement("br")
                                                                break;
                                                        case "w":
                                                                wordid = parser.getAttributeValue(null,"id");

                                                                if (firstWord) { // first word of the page: register it in the index
                                                                        firstWord = false;
                                                                        this.idxstart.add(wordid);
                                                                }
                                                                wordcount++;
                                                                break;
                                                        case "list":
                                                                String listType = parser.getAttributeValue(null,"type");
                                                                if ("unordered" == listType) {
                                                                        pagedWriter.writeStartElement("ul")
                                                                } else {
                                                                        pagedWriter.writeStartElement("ol")
                                                                }
                                                                break
                                                        case "item":
                                                                pagedWriter.writeStartElement("li")
                                                                break;
                                                        case "hi":
                                                                pagedWriter.writeStartElement("b")
                                                                break;
                                                        case "emph":
                                                                pagedWriter.writeStartElement("i")
                                                                break;
                                                        case "ana":
                                                                flaginterp=true;
                                                                // the first character of the type is dropped (presumably a leading "#" - to confirm)
                                                                interpvalue+=" "+parser.getAttributeValue(null,"type").substring(1)+":"
                                                                break;

                                                        case "form":
                                                                wordvalue="";
                                                                interpvalue ="";
                                                                flagform=true;
                                                }
                                                break;
                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName();
                                                switch(localname)
                                                {
                                                        case "head":
                                                                pagedWriter.writeEndElement(); // h2
                                                                break;
                                                        case "lg":
                                                        case "p":
                                                        case "q":
                                                                pagedWriter.writeEndElement(); // p
                                                                // pages are only cut between paragraphs
                                                                if (wordcount >= wordmax) {
                                                                        createNextOutput();
                                                                }
                                                                break;
                                                        case "list":
                                                                pagedWriter.writeEndElement(); // ul or ol
                                                                break
                                                        case "item":
                                                                pagedWriter.writeEndElement(); // li
                                                                break;
                                                        case "hi":
                                                                pagedWriter.writeEndElement(); // b
                                                                break;
                                                        case "emph":
                                                                pagedWriter.writeEndElement(); // i
                                                                break;
                                                        case "table":
                                                                pagedWriter.writeEndElement(); // table
                                                                break;
                                                        case "row":
                                                                pagedWriter.writeEndElement(); // tr
                                                                break;
                                                        case "cell":
                                                                pagedWriter.writeEndElement(); // td
                                                                break;
                                                        case "form":
                                                                flagform = false
                                                                break;
                                                        case "ana":
                                                                flaginterp = false
                                                                break;
                                                        case "w":
                                                                String endOfLastWord = "";
                                                                int l = lastword.length();
                                                                if (l > 0) {
                                                                        endOfLastWord = lastword.substring(l-1);
                                                                }

                                                                // no space before this word when one of the spacing rules applies
                                                                boolean glued = NoSpaceBefore.contains(wordvalue) ||
                                                                        NoSpaceAfter.contains(lastword) ||
                                                                        wordvalue.startsWith("-") ||
                                                                        NoSpaceAfter.contains(endOfLastWord);
                                                                if (!glued) {
                                                                        pagedWriter.writeCharacters(" ");
                                                                }
                                                                pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                                                                pagedWriter.writeCharacters(wordvalue);
                                                                pagedWriter.writeEndElement() // span
                                                                lastword=wordvalue;
                                                                break;
                                                }
                                                break;
                                        case XMLStreamConstants.CHARACTERS:
                                                // text is only kept inside form (the word) and ana (its annotations)
                                                if (flagform && parser.getText().length() > 0) {
                                                        wordvalue+=(parser.getText());
                                                }
                                                if (flaginterp && parser.getText().length() > 0) {
                                                        interpvalue+=(parser.getText());
                                                }
                                                break;
                                }
                        }
                        closeMultiWriter();
                } catch(Exception e) {
                        org.txm.utils.logger.Log.printStackTrace(e);
                        // no page may have been opened when the failure occurred
                        if (pagedWriter != null) {
                                pagedWriter.close();
                        }
                } finally {
                        // closing the StAX reader does not close its input, so both are released;
                        // failures are ignored because nothing is left to read at this point
                        try {
                                parser?.close();
                        } catch (Exception ignored) { }
                        try {
                                inputData?.close();
                        } catch (Exception ignored) { }
                }
        }
472
}