Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / doc / pager.groovy @ 1000

History | View | Annotate | Download (12.3 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2012-04-24 14:30:47 +0200 (mar., 24 avr. 2012) $
// $LastChangedRevision: 2174 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.importer.doc;
29

    
30
import org.txm.scripts.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.scripts.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds the paginated HTML edition of one XML-TEI text.
 *
 * All the work happens in the constructor: the input file is streamed with
 * StAX, everything up to the end of the {@code teiHeader} element is skipped,
 * and the remaining elements are converted to HTML pages named
 * {@code <txtname>_<page number>.html} in the output directory. A new page is
 * started when the configured page element is met, or after a closing
 * {@code lg}/{@code p}/{@code q} once the page holds at least {@code wordmax}
 * words. The created files and the id of the first word of each page are then
 * available through {@link #getPageFiles()} and {@link #getIdx()}.
 *
 * @author mdecorde
 */
class pager {

	/** Word forms that are written without a separating space before them. */
	List<String> NoSpaceBefore;

	/** Word forms (or last characters of a form) after which no space is written. */
	List<String> NoSpaceAfter;

	/** Number of {@code w} elements met since the current page was opened. */
	int wordcount = 0;

	/** Number of pages created so far; also the number of the current page. */
	int pagecount = 0;

	/** Word count from which a closing lg/p/q element triggers a new page. */
	int wordmax = 0;

	/** Name shown (upper-cased) in the title of every page. */
	String basename = "";

	/** Prefix of the page file names and name of the text specific CSS file. */
	String txtname = "";

	/** Directory receiving the page files. */
	File outdir;

	/** Value of the {@code id} attribute of the word being read. */
	String wordid;

	/** True until the first word of the current page has been met. */
	boolean firstWord = true;

	/** Text accumulated from the {@code form} element of the current word. */
	String wordvalue;

	/** Annotations accumulated from the {@code ana} elements of the current word. */
	String interpvalue;

	/** Form of the last word written; used to decide whether a space is needed. */
	String lastword = " ";

	/** Not used by this class; kept because it is part of the class properties. */
	String wordtype;

	/** True while the parser is inside a {@code form} element. */
	boolean flagform = false;

	/** True while the parser is inside an {@code ana} element. */
	boolean flaginterp = false;

	/** URL of the input file. */
	private def url;

	/** Stream opened on the input file; closed once processing is over. */
	private def inputData;

	/** StAX factory used to create the parser. */
	private def factory;

	/** StAX reader positioned on the input file. */
	private XMLStreamReader parser;

	/** Not used by this class; kept because it is part of the class properties. */
	OutputStreamWriter writer;

	/** Not used by this class; kept because it is part of the class properties. */
	OutputStreamWriter multiwriter = null;

	/** Writer of the page currently being produced, null before the first page. */
	StaxStackWriter pagedWriter = null;

	/** The XML-TEI file being paginated. */
	File infile;

	/** Never assigned by this class; kept because it is part of the class properties. */
	File outfile;

	/** Page files, in creation order. */
	ArrayList<File> pages = new ArrayList<File>();

	/** For each page, the id of its first word ("w_0" when the page has no word). */
	ArrayList<String> idxstart = new ArrayList<String>();

	/** Local name of the element that forces a new page. */
	String editionPage;

	/** Value of the {@code rend} attribute of the last {@code figure} met, applied to its graphic. */
	private String figurerend = "";

	/**
	 * Instantiates a new pager and immediately builds all the pages.
	 *
	 * @param infile the XML-TEI file to paginate
	 * @param outdir the directory receiving the HTML pages
	 * @param txtname the prefix of the page file names and of the text CSS file
	 * @param NoSpaceBefore the forms that must not be preceded by a space
	 * @param NoSpaceAfter the forms (or final characters) that must not be followed by a space
	 * @param max the number of words from which a closing lg/p/q starts a new page
	 * @param basename the name shown in the page titles
	 * @param editionPage the local name of the element that forces a new page
	 */
	pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
			List<String> NoSpaceAfter, int max, String basename, String editionPage) {
		this.editionPage = editionPage;
		this.basename = basename;
		this.txtname = txtname;
		this.outdir = outdir;
		this.wordmax = max;
		this.NoSpaceBefore = NoSpaceBefore;
		this.NoSpaceAfter = NoSpaceAfter;
		this.url = infile.toURI().toURL();
		this.infile = infile;

		inputData = url.openStream();
		try {
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
			process();
		} finally {
			// the reader and the stream are only needed while process() runs
			closeInput();
		}
	}

	/**
	 * Releases the StAX reader and the stream opened on the input file.
	 * Closing the reader does not close the underlying stream, so both are
	 * closed; a failure on one does not prevent closing the other.
	 */
	private void closeInput() {
		try {
			if (parser != null) parser.close();
		} catch (Exception e) {
			org.txm.utils.logger.Log.printStackTrace(e);
		}
		try {
			if (inputData != null) inputData.close();
		} catch (Exception e) {
			org.txm.utils.logger.Log.printStackTrace(e);
		}
	}

	/**
	 * Finishes the current page, if any.
	 *
	 * When the page contains no word, a "w_0" entry is recorded in the index
	 * and an empty anchor span is written so that the page still has a start.
	 *
	 * @return a copy of the writer's tag stack taken before closing
	 * (presumably the elements still open at that point), or an empty list
	 * when no page was open
	 */
	private def closeMultiWriter() {
		if (pagedWriter == null) {
			return [];
		}

		def tags = pagedWriter.getTagStack().clone();
		if (firstWord) { // there was no word in this page
			this.idxstart.add("w_0")
			pagedWriter.write("<span id=\"w_0\"/>");
		}

		pagedWriter.writeEndElements();
		pagedWriter.close();
		return tags;
	}

	/**
	 * Closes the current page and opens the next one.
	 *
	 * The tags returned by {@link #closeMultiWriter()} are written again at
	 * the top of the new page, except the leading ones up to and including
	 * "body", which the page header creates itself.
	 *
	 * @return true, if successful
	 */
	private boolean createNextOutput() {
		wordcount = 0;
		try {
			def tags = closeMultiWriter()
			// drop the leading tags up to and including "body"
			while (!tags.isEmpty()) {
				String tag = tags.remove(0)
				if ("body" == tag) break
			}

			File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html")
			pages.add(pageFile)
			firstWord = true // waiting for next word

			pagedWriter = new StaxStackWriter(pageFile, "UTF-8");
			pagedWriter.writeStartDocument("UTF-8", "1.0")
			pagedWriter.writeStartElement("html")
			pagedWriter.writeStartElement("head")
			// meta belongs inside <head>, not directly under <html>
			pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"])
			pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "href":txtname+".css"])
			pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "href":"doc.css"])
			pagedWriter.writeStartElement("title")
			pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
			pagedWriter.writeEndElement() // </title>
			pagedWriter.writeEndElement() // </head>
			pagedWriter.writeStartElement("body") //<body>
			pagedWriter.writeStartElements(tags)
			return true;
		} catch (Exception e) {
			org.txm.utils.logger.Log.printStackTrace(e);
			return false;
		}
	}

	/**
	 * Creates the output; plain alias of {@link #createNextOutput()}, which
	 * already reports and absorbs its own failures.
	 *
	 * @return true, if successful
	 */
	private boolean createOutput() {
		return createNextOutput();
	}

	/**
	 * Gets the page files.
	 *
	 * @return the page files, in creation order
	 */
	public ArrayList<File> getPageFiles() {
		return pages;
	}

	/**
	 * Gets the idx.
	 *
	 * @return for each page, the id of its first word
	 */
	public ArrayList<String> getIdx() {
		return idxstart;
	}

	/**
	 * Moves the parser just after the closing tag of the {@code teiHeader}
	 * element. When the document has no such element the parser ends up at
	 * the end of the document.
	 */
	void gotoText() {
		while (parser.hasNext()) {
			if (parser.next() == XMLStreamConstants.END_ELEMENT
					&& "teiHeader" == parser.getLocalName()) {
				return;
			}
		}
	}

	/**
	 * Reads the text after the TEI header and writes the HTML pages.
	 * Any failure is logged and the page being written is closed.
	 */
	void process() {
		try {
			if (!createNextOutput()) {
				// nothing can be written without a first page
				throw new IllegalStateException("Could not create the first edition page of "+infile);
			}

			gotoText();
			while (parser.hasNext()) {
				switch (parser.next()) {
					case XMLStreamConstants.START_ELEMENT:
						handleStartElement(parser.getLocalName());
						break;
					case XMLStreamConstants.END_ELEMENT:
						handleEndElement(parser.getLocalName());
						break;
					case XMLStreamConstants.CHARACTERS:
						appendText();
						break;
				}
			}
			closeMultiWriter();
		} catch (Exception e) {
			org.txm.utils.logger.Log.printStackTrace(e);
			if (pagedWriter != null) pagedWriter.close();
		}
	}

	/** Writes the HTML counterpart of the opening tag the parser is positioned on. */
	private void handleStartElement(String localname) {
		if (localname == editionPage) {
			createNextOutput();
			String pageNumber = parser.getAttributeValue(null, "n");
			if (pageNumber != null) {
				pagedWriter.writeStartElement("p", ["style":"color:red", "align":"center"])
				pagedWriter.writeCharacters("- "+pageNumber+" -")
				pagedWriter.writeEndElement() // p
			}
		}

		switch (localname) {
			case "textunit":
				writeTextUnitHeader();
				break;
			case "head":
				pagedWriter.writeStartElement("h2", ["class":rendOrNormal()])
				break;
			case "figure":
				// remembered for the graphic contained in the figure
				figurerend = parser.getAttributeValue(null, "rend")
				break;
			case "graphic":
				String src = parser.getAttributeValue(null, "url")
				if (src != null) {
					// image paths are made relative to the page directory
					if (src.startsWith("..//")) src = src.substring(4);
					else if (src.startsWith("../")) src = src.substring(3);

					pagedWriter.writeStartElement("div")
					pagedWriter.writeEmptyElement("img", ["class":figurerend, "src":src])
					pagedWriter.writeEndElement() // div
				}
				figurerend = "";
				break;
			case "lg":
			case "p":
			case "q":
				pagedWriter.writeStartElement("p", ["class":rendOrNormal()])
				break;
			case "table":
				pagedWriter.writeStartElement("table", ["class":parser.getAttributeValue(null, "rend")])
				break;
			case "row":
				pagedWriter.writeStartElement("tr")
				break;
			case "cell":
				pagedWriter.writeStartElement("td")
				break;
			case "lb":
			case "br":
				pagedWriter.writeEmptyElement("br")
				break;
			case "w":
				wordid = parser.getAttributeValue(null, "id");
				// start from clean buffers so a word without form/ana
				// neither reuses the previous word nor hits a null value
				wordvalue = "";
				interpvalue = "";
				if (firstWord) {
					firstWord = false;
					this.idxstart.add(wordid);
				}
				wordcount++;
				break;
			case "list":
				String listType = parser.getAttributeValue(null, "type");
				pagedWriter.writeStartElement("unordered" == listType ? "ul" : "ol")
				break;
			case "item":
				pagedWriter.writeStartElement("li")
				break;
			case "hi":
				pagedWriter.writeStartElement("b")
				break;
			case "emph":
				pagedWriter.writeStartElement("i")
				break;
			case "ana":
				flaginterp = true;
				String anaType = parser.getAttributeValue(null, "type")
				// the first character of the type is dropped; a missing or
				// empty type yields an empty property name
				String property = (anaType != null && anaType.length() > 0) ? anaType.substring(1) : ""
				interpvalue += " "+property+":"
				break;
			case "form":
				wordvalue = "";
				interpvalue = "";
				flagform = true;
				break;
		}
	}

	/** Writes the HTML counterpart of the closing tag the parser is positioned on. */
	private void handleEndElement(String localname) {
		switch (localname) {
			case "lg":
			case "p":
			case "q":
				pagedWriter.writeEndElement(); // p
				// pages are only cut between paragraphs
				if (wordcount >= wordmax) {
					createNextOutput();
				}
				break;
			case "head":  // h2
			case "list":  // ul or ol
			case "item":  // li
			case "hi":    // b
			case "emph":  // i
			case "table": // table
			case "row":   // tr
			case "cell":  // td
				pagedWriter.writeEndElement();
				break;
			case "form":
				flagform = false
				break;
			case "ana":
				flaginterp = false
				break;
			case "w":
				writeWord();
				break;
		}
	}

	/** Appends the current text event to the form and/or annotation buffer being filled. */
	private void appendText() {
		if (flagform) {
			if (parser.getText().length() > 0)
				wordvalue += parser.getText();
		}
		if (flaginterp) {
			if (parser.getText().length() > 0)
				interpvalue += parser.getText();
		}
	}

	/**
	 * Writes the word just read as a span carrying its id and annotations,
	 * preceded by a space unless the word or the previous one forbids it.
	 */
	private void writeWord() {
		int l = lastword.length();
		String endOfLastWord = l > 0 ? lastword.substring(l - 1) : "";

		boolean glued = NoSpaceBefore.contains(wordvalue) ||
				NoSpaceAfter.contains(lastword) ||
				wordvalue.startsWith("-") ||
				NoSpaceAfter.contains(endOfLastWord);
		if (!glued) {
			pagedWriter.writeCharacters(" ");
		}

		pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
		pagedWriter.writeCharacters(wordvalue);
		pagedWriter.writeEndElement() // span
		lastword = wordvalue;
	}

	/**
	 * Writes the header of a {@code textunit}: its id as a title when present,
	 * then a two-column table listing every attribute name and value.
	 */
	private void writeTextUnitHeader() {
		String id = parser.getAttributeValue(null, "id");
		if (id != null) {
			pagedWriter.writeStartElement("h3")
			pagedWriter.writeCharacters(id)
			pagedWriter.writeEndElement() // h3
		}

		pagedWriter.writeStartElement("table")
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			pagedWriter.writeStartElement("tr")
			pagedWriter.writeStartElement("td")
			pagedWriter.writeCharacters(parser.getAttributeLocalName(i))
			pagedWriter.writeEndElement() // td
			pagedWriter.writeStartElement("td")
			pagedWriter.writeCharacters(parser.getAttributeValue(i))
			pagedWriter.writeEndElement() // td
			pagedWriter.writeEndElement() // tr
		}
		pagedWriter.writeEndElement() // table
		pagedWriter.writeEmptyElement("br")
	}

	/** Returns the {@code rend} attribute of the current element, or "normal" when it is absent. */
	private String rendOrNormal() {
		String rend = parser.getAttributeValue(null, "rend");
		return rend == null ? "normal" : rend;
	}
}