Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xml / pager.groovy @ 479

History | View | Annotate | Download (10.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-05-02 11:55:17 +0200 (mar. 02 mai 2017) $
25
// $LastChangedRevision: 3436 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xml;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.*
37
import javax.xml.stream.*;
38
import java.net.URL;
39
import org.xml.sax.InputSource;
40
import org.xml.sax.helpers.DefaultHandler;
41

    
42
/**
 * Builds a simple paginated HTML edition from an XML-TEI file.
 *
 * The input file is read with a StAX parser and written as a sequence of
 * HTML pages named {@code <txtname>_<n>.html} in the output directory. A new
 * page is started when the pagination element is met, or when the number of
 * words counted on the current page reaches {@code wordmax}.
 *
 * The whole conversion is run by the constructor.
 *
 * @author mdecorde
 */
class pager {

    /** Word forms that are written without a leading space. */
    List<String> NoSpaceBefore;

    /** Word forms (or final characters) after which no space is written. */
    List<String> NoSpaceAfter;

    /** Number of words counted since the current page was opened. */
    int wordcount = 0;

    /** Number of pages created so far. */
    int pagecount = 0;

    /** Word count at which a new page is started. */
    int wordmax = 0;

    /** Base name used for the page title and the {@code <basename>.css} stylesheet link. */
    String basename = "";

    /** Prefix of the generated page file names. */
    String txtname = "";

    /** Directory receiving the generated page files. */
    File outdir;

    /** Value of the {@code id} attribute of the current {@code w} element. */
    String wordid;

    /** True until the first word of the current page has been met. */
    boolean firstWord = true;

    /** Text accumulated from the current {@code form} element. */
    String wordvalue = "";

    /** Annotations accumulated from the {@code ana} elements of the current word. */
    String interpvalue = "";

    /** Form of the previously written word. */
    String lastword = " ";

    /** The wordtype (not referenced elsewhere in this class). */
    String wordtype;

    /** True while the parser is inside a {@code form} element. */
    boolean flagform = false;

    /** True while the parser is inside an {@code ana} element. */
    boolean flaginterp = false;

    /** URL of the input file. */
    private def url;

    /** Input stream opened on {@link #url}. */
    private def inputData;

    /** StAX factory used to create the parser. */
    private def factory;

    /** StAX parser reading the input file. */
    private XMLStreamReader parser;

    /** The writer (not referenced elsewhere in this class). */
    OutputStreamWriter writer;

    /** Writer of the page currently being built; null before the first page. */
    StaxStackWriter pagedWriter = null;

    /** The input file. */
    File infile;

    /** The outfile (never assigned by this class). */
    File outfile;

    /** Page files, in creation order. */
    ArrayList<File> pages = new ArrayList<File>();

    /** For each closed or started page, the id of its first word ("w_0" when a page has no word). */
    ArrayList<String> idxstart = new ArrayList<String>();

    /** Local name of the element that forces a page break. */
    String paginationElement;

    /** True while the parser is inside a {@code note} element. */
    private boolean flagNote = false;

    /** Text accumulated from the current {@code note} element. */
    private String noteContent = "";

    /**
     * Instantiates a new pager and immediately builds the pages.
     *
     * The input stream and the StAX parser are released before the
     * constructor returns, whether the conversion succeeded or not.
     *
     * @param infile the XML-TEI file to read
     * @param outdir the directory receiving the HTML pages
     * @param txtname the prefix of the page file names
     * @param NoSpaceBefore the word forms written without a leading space
     * @param NoSpaceAfter the word forms after which no space is written
     * @param max the word count at which a new page is started
     * @param basename the name used for the page title and stylesheet link
     * @param paginationElement the local name of the element forcing a page break
     */
    pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
    List<String> NoSpaceAfter, int max, String basename, String paginationElement) {
        this.paginationElement = paginationElement;
        this.basename = basename;
        this.txtname = txtname;
        this.outdir = outdir;
        this.wordmax = max;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;

        inputData = url.openStream();
        try {
            // NOTE(review): the factory is used with its default settings; if
            // input files can come from untrusted sources, consider disabling
            // DTD / external entity support.
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);

            process();
        } finally {
            // XMLStreamReader.close() does not close the underlying stream,
            // so both must be released explicitly.
            if (parser != null) {
                try {
                    parser.close();
                } catch (XMLStreamException ignored) {
                    // nothing useful can be done if releasing the parser fails
                }
            }
            try {
                inputData.close();
            } catch (IOException ignored) {
                // read-only stream: a failure on close loses no data
            }
        }
    }

    /**
     * Closes the current page, if any.
     *
     * When no word was met on the page, an empty {@code w_0} anchor is
     * written and registered so that every page has a start index.
     *
     * @return a copy of the tags that were open when the page was closed,
     *         or an empty list when no page was open
     */
    private def closeMultiWriter() {
        if (pagedWriter == null) {
            return [];
        }

        // copy the stack before writeEndElements() is called on the writer
        def tags = pagedWriter.getTagStack().clone();

        if (firstWord) { // there was no words found
            pagedWriter.writeCharacters("");
            this.idxstart.add("w_0")
            pagedWriter.write("<span id=\"w_0\"/>");
        }
        pagedWriter.writeEndElements();
        pagedWriter.close();
        return tags;
    }

    /**
     * Closes the current page and opens the next one.
     *
     * The elements that were open inside {@code body} on the previous page
     * are re-opened on the new page. Errors are reported on standard output.
     *
     * @return true, if successful
     */
    private boolean createNextOutput() {
        wordcount = 0;
        try {
            def tags = closeMultiWriter();

            // drop the leading tags up to and including "body": they are
            // written again below as part of the page skeleton
            while (!tags.isEmpty()) {
                String tag = tags[0];
                tags.remove(0);
                if ("body" == tag) {
                    break;
                }
            }

            File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html")
            pages.add(pageFile);
            firstWord = true; // waiting for next word

            pagedWriter = new StaxStackWriter(pageFile, "UTF-8");

            pagedWriter.writeStartDocument("UTF-8", "1.0")
            pagedWriter.writeStartElement("html");
            // metadata and stylesheet links belong inside <head>
            pagedWriter.writeStartElement("head");
            pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"txm.css"]);
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"${basename}.css"]);
            pagedWriter.writeStartElement("title")
            pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
            pagedWriter.writeEndElement(); // </title>
            pagedWriter.writeEndElement() // </head>
            pagedWriter.writeStartElement("body") //<body>

            pagedWriter.writeStartElements(tags);
            return true;
        } catch (Exception e) {
            System.out.println("Error: "+e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Creates the output; delegates to {@link #createNextOutput()}, which
     * already reports and absorbs any exception.
     *
     * @return true, if successful
     */
    private boolean createOutput() {
        return createNextOutput();
    }

    /**
     * Gets the page files.
     *
     * @return the page files, in creation order
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the idx.
     *
     * @return the registered page start ids ("w_0" for a page without word)
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Reads the whole input document and writes the HTML pages.
     *
     * If parsing fails, the page being written is closed (without being
     * completed) before the exception is propagated.
     */
    void process() {
        flagNote = false;
        noteContent = "";

        createNextOutput();

        boolean completed = false;
        try {
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        processStartElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        processEndElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        processCharacters(parser.getText());
                        break;
                }
            }
            completed = true;
        } finally {
            if (!completed && pagedWriter != null) {
                // release the page file without hiding the original failure
                try {
                    pagedWriter.close();
                } catch (Exception ignored) {
                    // the parsing error being propagated is the relevant one
                }
            }
        }
        closeMultiWriter();
    }

    /** Handles a start tag: page breaks, HTML structure and word bookkeeping. */
    private void processStartElement(String localname) {
        if (localname == paginationElement) {
            createNextOutput();
            pagedWriter.write("\n");
            String pageNumber = parser.getAttributeValue(null, "n");
            if (pageNumber != null) {
                pagedWriter.writeElement("p", ["style":"color:red", "align":"center"], "- "+pageNumber+" -")
            }
        }

        switch (localname) {
            case "text":
                pagedWriter.write("\n");
                String textId = parser.getAttributeValue(null, "id");
                if (textId != null) {
                    pagedWriter.writeElement("h3", textId)
                }
                // one table row per attribute of the text element
                pagedWriter.writeStartElement("table");
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                    pagedWriter.writeStartElement("tr");
                    pagedWriter.writeElement("td", parser.getAttributeLocalName(i));
                    pagedWriter.writeElement("td", parser.getAttributeValue(i));
                    pagedWriter.writeEndElement();
                }
                pagedWriter.writeEndElement() // table
                pagedWriter.writeEmptyElement("br")
                pagedWriter.writeCharacters("");
                break;
            case "note":
                flagNote = true;
                noteContent = ""
                break;
            case "a":
                pagedWriter.writeStartElement("a")
                pagedWriter.writeAttribute("href", parser.getAttributeValue(null, "href"));
                String target = parser.getAttributeValue(null, "target");
                if (target == null) {
                    target = "_blank";
                }
                pagedWriter.writeAttribute("target", target);
                break;
            case "head":
                pagedWriter.write("\n");
                pagedWriter.writeStartElement("h2")
                break;
            case "graphic":
                pagedWriter.write("\n");
                String imageUrl = parser.getAttributeValue(null, "url")
                if (imageUrl != null) {
                    pagedWriter.writeStartElement("div")
                    pagedWriter.writeEmptyElement("img", ["src":imageUrl])
                    pagedWriter.writeEndElement() // div
                }
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.write("\n");
                String rend = parser.getAttributeValue(null, "rend")
                if (rend == null) {
                    rend = "normal"
                }
                pagedWriter.writeStartElement("p", ["class":rend])
                break;
            case "lb":
            case "br":
                pagedWriter.writeEmptyElement("br")
                break;
            case "w":
                wordid = parser.getAttributeValue(null, "id");

                wordcount++;
                if (wordcount >= wordmax) {
                    createNextOutput();
                }

                if (firstWord) {
                    // remember the id of the first word of the page
                    firstWord = false;
                    this.idxstart.add(wordid);
                }
                break;
            case "ana":
                flaginterp = true;
                // the first character of the type value is dropped; a missing
                // or empty type yields an empty annotation name instead of an
                // exception
                String anaType = parser.getAttributeValue(null, "type");
                interpvalue += " "+(anaType ? anaType.substring(1) : "")+":"
                break;
            case "form":
                wordvalue = "";
                interpvalue = "";
                flagform = true;
                break;
        }
    }

    /** Handles an end tag: closes HTML elements and writes completed notes and words. */
    private void processEndElement(String localname) {
        switch (localname) {
            case "head":
                pagedWriter.writeEndElement() // </h2>
                break;
            case "note":
                flagNote = false;
                if (noteContent.length() > 0) {
                    // the note text is shown as the tooltip of a "[*]" marker
                    pagedWriter.writeStartElement("span", ["style":"color:red;", "title":noteContent]);
                    pagedWriter.writeCharacters("[*]");
                    pagedWriter.writeEndElement() // </span>
                }
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.writeEndElement() // </p>
                break;
            case "a":
                pagedWriter.writeEndElement() // </a>
                break;
            case "form":
                flagform = false
                break;
            case "ana":
                flaginterp = false
                break;
            case "w":
                int lastLength = lastword.length();
                String endOfLastWord = "";
                if (lastLength > 0) {
                    endOfLastWord = lastword.substring(lastLength - 1);
                }

                boolean glued = NoSpaceBefore.contains(wordvalue) ||
                        NoSpaceAfter.contains(lastword) ||
                        wordvalue.startsWith("-") ||
                        NoSpaceAfter.contains(endOfLastWord);
                if (!glued) {
                    pagedWriter.writeCharacters(" ");
                }

                pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                pagedWriter.writeCharacters(wordvalue);
                pagedWriter.writeEndElement();
                lastword = wordvalue;
                break;
        }
    }

    /** Routes non-empty character data to the word form, the annotation or the note being read. */
    private void processCharacters(String text) {
        if (text.length() == 0) {
            return;
        }

        if (flagform) {
            wordvalue += text;
            if (flagNote) {
                noteContent += text.replace("\n", " ");
            }
        } else if (flaginterp) {
            interpvalue += text;
        } else if (flagNote) {
            noteContent += text.replace("\n", " ");
        }
    }
}