// Source: root / tmp / org.txm.core / src / groovy / org / txm / importer / xml / pager.groovy @ 187
//
// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
// $LastChangedRevision: 3400 $
// $LastChangedBy: mdecorde $
//
package org.txm.importer.xml;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.*
37
import javax.xml.stream.*;
38
import java.net.URL;
39
import org.xml.sax.InputSource;
40
import org.xml.sax.helpers.DefaultHandler;
41

    
42
/**
 * Builds a simple paginated HTML edition from an XML-TEI file.
 *
 * The source document is read with a StAX parser and written as a sequence of
 * HTML pages named {@code <txtname>_<n>.html} in the output directory. A new
 * page is started each time the pagination element is met and each time the
 * number of words counted on the current page reaches the word limit.
 *
 * The whole conversion runs from the constructor; afterwards the produced
 * files and the id of the first word of each page are available through
 * {@link #getPageFiles()} and {@link #getIdx()}.
 *
 * @author mdecorde
 */
class pager {

    /** Word forms that are written without a space in front of them. */
    List<String> NoSpaceBefore;

    /** Word forms (or final characters of a word) after which no space is written. */
    List<String> NoSpaceAfter;

    /** Number of words counted since the current page was opened. */
    int wordcount = 0;

    /** Number of pages created so far; also the number of the current page. */
    int pagecount = 0;

    /** Word count at which a new page is started. */
    int wordmax = 0;

    /** Base name: used in the page title and as the name of the linked CSS file. */
    String basename = "";

    /** Text name: prefix of every page file name. */
    String txtname = "";

    /** Directory in which the page files are created. */
    File outdir;

    /** Value of the "id" attribute of the word being read. */
    String wordid;

    /** True while no word has been met yet on the current page. */
    boolean firstWord = true;

    /** Text accumulated from the "form" element of the word being read. */
    String wordvalue = "";

    /** Annotations accumulated from the "ana" elements of the word being read. */
    String interpvalue = "";

    /** Form of the previously written word, used to decide on spacing. */
    String lastword = " ";

    /** The wordtype (not used by this class). */
    String wordtype;

    /** True while the parser is inside a "form" element. */
    boolean flagform = false;

    /** True while the parser is inside an "ana" element. */
    boolean flaginterp = false;

    /** URL of the input file. */
    private def url;

    /** Stream opened on the input file. */
    private def inputData;

    /** StAX factory used to create the parser. */
    private def factory;

    /** StAX parser reading the input file. */
    private XMLStreamReader parser;

    /** The writer (not used by this class). */
    OutputStreamWriter writer;

    /** Writer of the page currently being produced, null before the first page. */
    StaxStackWriter pagedWriter = null;

    /** The input XML file. */
    File infile;

    /** The outfile (never assigned by this class). */
    File outfile;

    /** The page files, in creation order. */
    ArrayList<File> pages = new ArrayList<File>();

    /** For each page, the id of its first word ("w_0" when the page has no word). */
    ArrayList<String> idxstart = new ArrayList<String>();

    /** Local name of the element that forces a page break. */
    String paginationElement;

    /** True while the parser is inside a "note" element. */
    private boolean flagNote = false;

    /** Text accumulated from the "note" element being read. */
    private String noteContent = "";

    /**
     * Instantiates a new pager and immediately builds the pages.
     *
     * @param infile the XML file to read
     * @param outdir the directory receiving the HTML pages
     * @param txtname the prefix of the page file names
     * @param NoSpaceBefore the word forms written without a leading space
     * @param NoSpaceAfter the word forms (or final characters) not followed by a space
     * @param max the word count at which a new page is started
     * @param basename the name used in page titles and for the linked CSS file
     * @param paginationElement the local name of the element forcing a page break
     */
    pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
            List<String> NoSpaceAfter, int max, String basename, String paginationElement) {
        this.paginationElement = paginationElement;
        this.basename = basename;
        this.txtname = txtname;
        this.outdir = outdir;
        this.wordmax = max;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;

        inputData = url.openStream();
        try {
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (Exception e) {
            // the stream is already open: do not leak it when the parser cannot be created
            closeInput();
            throw e;
        }

        process();
    }

    /**
     * Closes the XML parser and the input stream, reporting (but not
     * propagating) any failure. Closing a StAX reader does not close its
     * underlying stream, hence both are closed here.
     */
    private void closeInput() {
        try {
            if (parser != null) {
                parser.close();
            }
        } catch (Exception e) {
            System.out.println("Could not close the XML parser of " + infile + ": " + e);
        }
        try {
            if (inputData != null) {
                inputData.close();
            }
        } catch (Exception e) {
            System.out.println("Could not close the input stream of " + infile + ": " + e);
        }
    }

    /**
     * Finishes the current page, if any: closes every element still open and
     * closes the page writer. A page on which no word was met receives an
     * empty "w_0" anchor, and "w_0" is recorded as its first word id.
     *
     * @return a copy of the tags that were open when the page was closed, or
     *         an empty list when there was no current page
     */
    private def closeMultiWriter() {
        if (pagedWriter == null) {
            return [];
        }

        def openTags = pagedWriter.getTagStack().clone();
        if (firstWord) { // there was no word on this page
            pagedWriter.writeCharacters("");
            this.idxstart.add("w_0");
            pagedWriter.write("<span id=\"w_0\"/>");
        }
        pagedWriter.writeEndElements();
        pagedWriter.close();
        return openTags;
    }

    /**
     * Closes the current page and opens the next one. The elements that were
     * open below "body" on the closed page are re-opened on the new page.
     *
     * @return true, if successful
     */
    private boolean createNextOutput() {
        wordcount = 0;
        try {
            def tags = closeMultiWriter();
            // drop the page skeleton (everything up to and including "body"):
            // it is written again below; only the content tags are re-opened
            while (!tags.isEmpty()) {
                String tag = tags.remove(0);
                if ("body" == tag) {
                    break;
                }
            }

            File pageFile = new File(outdir, txtname + "_" + (++pagecount) + ".html");
            pages.add(pageFile);
            firstWord = true; // waiting for next word

            pagedWriter = new StaxStackWriter(pageFile, "UTF-8");

            pagedWriter.writeStartDocument("UTF-8", "1.0");
            pagedWriter.writeStartElement("html");
            pagedWriter.writeStartElement("head");
            // metadata elements belong inside <head>
            pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"txm.css"]);
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"${basename}.css"]);
            pagedWriter.writeStartElement("title");
            pagedWriter.writeCharacters(basename.toUpperCase() + " Edition - Page " + pagecount);
            pagedWriter.writeEndElement(); // </title>
            pagedWriter.writeEndElement(); // </head>
            pagedWriter.writeStartElement("body"); // <body>

            pagedWriter.writeStartElements(tags);
            return true;
        } catch (Exception e) {
            // e.toString() keeps the exception class even when it has no message
            System.out.println("Error while creating page " + pagecount + " of " + txtname + ": " + e);
            return false;
        }
    }

    /**
     * Creates the output. Same as {@link #createNextOutput()}, which already
     * reports and absorbs its own failures.
     *
     * @return true, if successful
     */
    private boolean createOutput() {
        return createNextOutput();
    }

    /**
     * Gets the page files.
     *
     * @return the page files, in creation order
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the idx.
     *
     * @return for each page, the id of its first word
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Reads the whole input document and writes the pages. The parser and the
     * input stream are closed when the method returns, whether it succeeds or
     * fails.
     */
    void process() {
        flagNote = false;
        noteContent = "";
        try {
            createNextOutput();
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        processStartElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        processEndElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        processCharacters(parser.getText());
                        break;
                }
            }
            closeMultiWriter();
        } finally {
            closeInput();
        }
    }

    /**
     * Handles the start tag the parser is positioned on. Elements that are
     * not listed produce no output.
     *
     * @param localname the local name of the element
     */
    private void processStartElement(String localname) {
        switch (localname) {
            case "text":
                // title (when the text has an id) followed by a table of the text attributes
                pagedWriter.write("\n");
                String textId = parser.getAttributeValue(null, "id");
                if (textId != null) {
                    pagedWriter.writeElement("h3", textId);
                }
                pagedWriter.writeStartElement("table");
                for (int i = 0; i < parser.getAttributeCount(); i++) {
                    pagedWriter.writeStartElement("tr");
                    pagedWriter.writeElement("td", parser.getAttributeLocalName(i));
                    pagedWriter.writeElement("td", parser.getAttributeValue(i).toString());
                    pagedWriter.writeEndElement(); // </tr>
                }
                pagedWriter.writeEndElement(); // </table>
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeCharacters("");
                break;
            case "note":
                flagNote = true;
                noteContent = "";
                break;
            case "a":
                pagedWriter.writeStartElement("a");
                pagedWriter.writeAttribute("href", parser.getAttributeValue(null, "href"));
                String target = parser.getAttributeValue(null, "target");
                if (target == null) {
                    target = "_blank";
                }
                pagedWriter.writeAttribute("target", target);
                break;
            case "head":
                pagedWriter.write("\n");
                pagedWriter.writeStartElement("h2");
                break;
            case "graphic":
                pagedWriter.write("\n");
                String imageUrl = parser.getAttributeValue(null, "url");
                if (imageUrl != null) {
                    pagedWriter.writeStartElement("div");
                    pagedWriter.writeEmptyElement("img", ["src":imageUrl]);
                    pagedWriter.writeEndElement(); // </div>
                }
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.write("\n");
                String rend = parser.getAttributeValue(null, "rend");
                if (rend == null) {
                    rend = "normal";
                }
                pagedWriter.writeStartElement("p", ["class":rend]);
                break;
            case paginationElement:
                // forced page break; createNextOutput() also resets the word count
                createNextOutput();
                pagedWriter.write("\n");
                String pageNumber = parser.getAttributeValue(null, "n");
                if (pageNumber != null) {
                    pagedWriter.writeElement("p", ["style":"color:red", "align":"center"], "- " + pageNumber + " -");
                }
                break;
            case "lb":
            case "br":
                pagedWriter.writeEmptyElement("br");
                break;
            case "w":
                wordid = parser.getAttributeValue(null, "id");
                wordcount++;
                if (wordcount >= wordmax) {
                    createNextOutput();
                }
                // The word is written when its end tag is met, i.e. on the page
                // opened just above when the limit was reached. Its id is
                // therefore recorded only after the possible page break, so
                // that it is indexed as the first word of the page holding it.
                if (firstWord) {
                    firstWord = false;
                    this.idxstart.add(wordid);
                }
                break;
            case "ana":
                flaginterp = true;
                // the first character of the type is dropped; a missing or
                // empty type gives an empty label instead of an exception
                String type = parser.getAttributeValue(null, "type");
                String label = (type != null && type.length() > 0) ? type.substring(1) : "";
                interpvalue += " " + label + ":";
                break;
            case "form":
                wordvalue = "";
                interpvalue = "";
                flagform = true;
                break;
        }
    }

    /**
     * Handles the end tag the parser is positioned on. The end of a "w"
     * element is where the word is actually written to the page.
     *
     * @param localname the local name of the element
     */
    private void processEndElement(String localname) {
        switch (localname) {
            case "head":
                pagedWriter.writeEndElement(); // </h2>
                break;
            case "note":
                flagNote = false;
                if (noteContent.length() > 0) {
                    // the note text is shown as the tooltip of a "[*]" marker
                    pagedWriter.writeStartElement("span", ["style":"color:red;", "title":noteContent]);
                    pagedWriter.writeCharacters("[*]");
                    pagedWriter.writeEndElement(); // </span>
                }
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.writeEndElement(); // </p>
                break;
            case "a":
                pagedWriter.writeEndElement(); // </a>
                break;
            case "form":
                flagform = false;
                break;
            case "ana":
                flaginterp = false;
                break;
            case "w":
                int l = lastword.length();
                String endOfLastWord = (l > 0) ? lastword.substring(l - 1) : "";

                boolean noSpace = NoSpaceBefore.contains(wordvalue) ||
                        NoSpaceAfter.contains(lastword) ||
                        wordvalue.startsWith("-") ||
                        NoSpaceAfter.contains(endOfLastWord);
                if (!noSpace) {
                    pagedWriter.writeCharacters(" ");
                }

                pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                pagedWriter.writeCharacters(wordvalue);
                pagedWriter.writeEndElement(); // </span>
                lastword = wordvalue;
                break;
        }
    }

    /**
     * Dispatches a run of character data to the word form, the word
     * annotation or the note being read.
     *
     * @param text the character data
     */
    private void processCharacters(String text) {
        if (text.length() == 0) {
            return;
        }
        if (flagform) {
            wordvalue += text;
            if (flagNote) { // a word inside a note also belongs to the note text
                noteContent += text.replace("\n", " ");
            }
        } else if (flaginterp) {
            interpvalue += text;
        } else if (flagNote) {
            noteContent += text.replace("\n", " ");
        }
    }
}