Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xml / pager.groovy @ 625

History | View | Annotate | Download (11.8 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2017-05-02 11:55:17 +0200 (mar. 02 mai 2017) $
// $LastChangedRevision: 3436 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.importer.xml;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.*
37
import javax.xml.stream.*;
38
import java.net.URL;
39
import org.xml.sax.InputSource;
40
import org.xml.sax.helpers.DefaultHandler;
41
import org.txm.utils.io.FileCopy
42

    
43
/**
 * Builds a simple paginated HTML edition from an XML-TEI file.
 *
 * The constructor does all the work: it streams the input file with StAX and
 * writes one HTML file per page into the output directory. A new page is
 * started each time the pagination element is met, and each time the number
 * of words counted on the current page reaches the configured maximum.
 *
 * @author mdecorde
 */
class pager {

    /** Word forms before which no space is written. */
    List<String> NoSpaceBefore;

    /** Word forms, or last characters of a word form, after which no space is written. */
    List<String> NoSpaceAfter;

    /** Number of {@code <w>} elements counted since the last call to createNextOutput(). */
    int wordcount = 0;

    /** Number of page files created so far; also used to number the page files. */
    int pagecount = 0;

    /** A new page is started when {@link #wordcount} reaches this value. */
    int wordmax = 0;

    /** Base name used for the page title, the per-corpus CSS link and the HTML directory lookup. */
    String basename = "";

    /** Prefix of the generated page file names. */
    String txtname = "";

    /** Directory receiving the generated page files. */
    File outdir;

    /** Value of the "id" attribute of the last {@code <w>} element opened. */
    String wordid;

    /** True until the first word of the current page has been seen. */
    boolean firstWord = true;

    /** Text accumulated inside the current {@code <form>} element. */
    String wordvalue = "";

    /** Text accumulated from the {@code <ana>} elements of the current word; written as the span title. */
    String interpvalue = "";

    /** Form of the previously written word, used to decide whether a space is needed. */
    String lastword = " ";

    /** Not used by this class. */
    String wordtype;

    /** True while the parser is inside a {@code <form>} element. */
    boolean flagform = false;

    /** True while the parser is inside an {@code <ana>} element. */
    boolean flaginterp = false;

    /** URL of the input file. */
    private def url;

    /** Stream opened on {@link #url}; closed by the constructor once parsing is over. */
    private def inputData;

    /** StAX input factory used to create {@link #parser}. */
    private def factory;

    /** StAX reader over the input file. */
    private XMLStreamReader parser;

    /** Not used by this class. */
    OutputStreamWriter writer;

    /** Writer of the page currently being produced; null before the first page is opened. */
    StaxStackWriter pagedWriter = null;

    /** The XML-TEI input file. */
    File infile;

    /** Never assigned by this class (createNextOutput() uses a local variable of the same name). */
    File outfile;

    /** The page files, in creation order. */
    ArrayList<File> pages = new ArrayList<File>();

    /** For each page, the id of its first word, or "w_0" when the page contains no word. */
    ArrayList<String> idxstart = new ArrayList<String>();

    /** Local name of the XML element that forces a page break. */
    String paginationElement;

    /** True while the parser is inside a {@code <note>} element. */
    private boolean flagNote = false;

    /** Text accumulated inside the current {@code <note>} element. */
    private String noteContent = "";

    /**
     * Instantiates a new pager and immediately builds the pages.
     *
     * The input stream and the StAX reader are released before the constructor
     * returns, whether or not the processing succeeded.
     *
     * @param infile the XML-TEI file to paginate
     * @param outdir the directory receiving the HTML pages
     * @param txtname the prefix of the page file names
     * @param NoSpaceBefore the forms before which no space is written
     * @param NoSpaceAfter the forms (or final characters) after which no space is written
     * @param max the number of words per page
     * @param basename the base name used for titles and CSS lookup
     * @param paginationElement the local name of the element forcing a page break
     */
    pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
            List<String> NoSpaceAfter, int max, String basename, String paginationElement) {
        this.paginationElement = paginationElement;
        this.basename = basename;
        this.txtname = txtname;
        this.outdir = outdir;
        this.wordmax = max;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;

        inputData = url.openStream();
        try {
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
            process();
        } finally {
            // XMLStreamReader.close() does not close the underlying stream,
            // so both have to be released explicitly.
            closeInput();
        }

        copyDefaultCss();
    }

    /** Closes the StAX reader and the input stream, reporting but never propagating failures. */
    private void closeInput() {
        try {
            if (parser != null) parser.close();
        } catch (Exception e) {
            println "Failed to close XML parser: "+e.getLocalizedMessage();
        }
        try {
            if (inputData != null) inputData.close();
        } catch (Exception e) {
            println "Failed to close input stream: "+e.getLocalizedMessage();
        }
    }

    /**
     * Copies the default txm.css file next to the edition pages (best effort).
     *
     * The copy only happens when the input file lives in a
     * {@code <home>/corpora/<bin>/txm/<dir>/} layout and both
     * {@code <bin>/HTML/<BASENAME>/default} and {@code <home>/css/txm.css} exist.
     */
    private void copyDefaultCss() {
        try {
            File txmDir = infile.getParentFile()?.getParentFile();
            if (txmDir == null || !txmDir.exists() || !txmDir.getName().equals("txm")) return;

            File binDir = txmDir.getParentFile();
            File corporaDir = binDir?.getParentFile();
            if (corporaDir == null || !binDir.exists() || !corporaDir.getName().equals("corpora")) return;

            File txmhomeDir = corporaDir.getParentFile();
            if (txmhomeDir == null) return;

            File htmlDir = new File(binDir, "HTML/"+basename.toUpperCase()+"/default");
            File txmcss = new File(txmhomeDir, "css/txm.css");
            if (htmlDir.exists() && txmcss.exists()) {
                FileCopy.copy(txmcss, new File(htmlDir, "txm.css"));
            }
        } catch (Exception e) {
            println "Failed to copy TXM default CSS: "+e.getLocalizedMessage();
        }
    }

    /**
     * Finishes and closes the current page, if any.
     *
     * @return a copy of the stack of tags that were open when the page was
     *         closed, or an empty list when no page was open
     */
    private def closeMultiWriter() {
        if (pagedWriter == null) {
            return [];
        }

        def tags = pagedWriter.getTagStack().clone();

        if (firstWord) { // no word was found on this page: register a placeholder
            // NOTE(review): the empty writeCharacters presumably flushes the pending
            // start tag before the raw write - confirm against StaxStackWriter.
            pagedWriter.writeCharacters("");
            this.idxstart.add("w_0");
            pagedWriter.write("<span id=\"w_0\"/>");
        }
        pagedWriter.writeEndElements();
        pagedWriter.close();
        return tags;
    }

    /**
     * Closes the current page and opens the next one.
     *
     * The tags that were open inside the page container of the previous page
     * are re-opened in the new page.
     *
     * @return true, if successful
     */
    private boolean createNextOutput() {
        wordcount = 0;
        try {
            def tags = closeMultiWriter();

            // Drop every tag up to and including the first "div": these belong to
            // the page skeleton, which is written again below.
            while (tags.size() > 0) {
                String tag = tags.remove(0);
                if ("div" == tag) {
                    break;
                }
            }

            File outfile = new File(outdir, txtname+"_"+(++pagecount)+".html");
            pages.add(outfile);
            firstWord = true; // waiting for next word

            pagedWriter = new StaxStackWriter(outfile, "UTF-8");

            pagedWriter.writeStartDocument("UTF-8", "1.0");
            pagedWriter.writeStartElement("html");
            pagedWriter.writeStartElement("head"); // <head>: meta and link elements belong here
            pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"txm.css"]);
            pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"${basename}.css"]);
            pagedWriter.writeStartElement("title");
            pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount);
            pagedWriter.writeEndElement(); // </title>
            pagedWriter.writeEndElement(); // </head>
            pagedWriter.writeStartElement("body"); // <body>
            pagedWriter.writeStartElement("div"); // <div class="txmeditionpage">
            pagedWriter.writeAttribute("class", "txmeditionpage");

            pagedWriter.writeStartElements(tags);
            return true;
        } catch (Exception e) {
            System.out.println("Error: "+e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Creates the output by delegating to createNextOutput().
     *
     * @return true, if successful
     */
    private boolean createOutput() {
        try {
            return createNextOutput();
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Gets the page files.
     *
     * @return the page files, in creation order
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the idx.
     *
     * @return for each page, the id of its first word ("w_0" when the page has no word)
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Reads the whole input document and writes the pages.
     */
    void process() {
        flagNote = false;
        noteContent = "";

        createNextOutput();
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            switch (event) {
                case XMLStreamConstants.START_ELEMENT:
                    processStartElement(parser.getLocalName());
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    processEndElement(parser.getLocalName());
                    break;
                case XMLStreamConstants.CHARACTERS:
                    processCharacters(parser.getText());
                    break;
            }
        }
        closeMultiWriter();
    }

    /** Handles a START_ELEMENT event: page breaks first, then the HTML rendering of the element. */
    private void processStartElement(String localname) {
        if (localname == paginationElement) {
            createNextOutput();
            pagedWriter.write("\n");
            String pageNumber = parser.getAttributeValue(null, "n");
            if (pageNumber != null) {
                pagedWriter.writeElement("p", ["style":"color:red", "align":"center"], "- "+pageNumber+" -");
            }
        }

        switch (localname) {
            case "text":
                // title followed by a two-column table listing every attribute of <text>
                pagedWriter.write("\n");
                if (parser.getAttributeValue(null, "id") != null) {
                    pagedWriter.writeElement("h3", parser.getAttributeValue(null, "id"));
                }
                pagedWriter.writeStartElement("table");
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                    pagedWriter.writeStartElement("tr");
                    pagedWriter.writeElement("td", parser.getAttributeLocalName(i));
                    pagedWriter.writeElement("td", parser.getAttributeValue(i).toString());
                    pagedWriter.writeEndElement(); // </tr>
                }
                pagedWriter.writeEndElement(); // </table>
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeCharacters("");
                break;
            case "note":
                flagNote = true;
                noteContent = "";
                break;
            case "a":
                pagedWriter.writeStartElement("a");
                pagedWriter.writeAttribute("href", parser.getAttributeValue(null, "href"));
                String target = parser.getAttributeValue(null, "target");
                if (target == null) target = "_blank";
                pagedWriter.writeAttribute("target", target);
                break;
            case "head":
                pagedWriter.write("\n");
                pagedWriter.writeStartElement("h2");
                break;
            case "graphic":
                pagedWriter.write("\n");
                String imageUrl = parser.getAttributeValue(null, "url");
                if (imageUrl != null) {
                    pagedWriter.writeStartElement("div");
                    pagedWriter.writeEmptyElement("img", ["src":imageUrl]);
                    pagedWriter.writeEndElement(); // </div>
                }
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.write("\n");
                String rend = parser.getAttributeValue(null, "rend");
                if (rend == null) rend = "normal";
                pagedWriter.writeStartElement("p", ["class":rend]);
                break;
            case "lb":
            case "br":
                pagedWriter.writeEmptyElement("br");
                break;
            case "w":
                wordid = parser.getAttributeValue(null, "id");

                wordcount++;
                if (wordcount >= wordmax) {
                    createNextOutput();
                }

                if (firstWord) {
                    firstWord = false;
                    this.idxstart.add(wordid);
                }
                break;
            case "ana":
                flaginterp = true;
                // The first character of @type is dropped (presumably a leading '#'
                // - confirm against the import format). A missing or empty @type
                // yields an empty property name instead of an exception.
                String type = parser.getAttributeValue(null, "type");
                String property = (type != null && type.length() > 0) ? type.substring(1) : "";
                interpvalue += " "+property+":";
                break;
            case "form":
                wordvalue = "";
                interpvalue = "";
                flagform = true;
                break;
        }
    }

    /** Handles an END_ELEMENT event: closes the matching HTML element or writes the finished word/note. */
    private void processEndElement(String localname) {
        switch (localname) {
            case "head":
                pagedWriter.writeEndElement(); // </h2>
                break;
            case "note":
                flagNote = false;
                if (noteContent.length() > 0) {
                    // the note text is shown as the tooltip of a red "[*]" marker
                    pagedWriter.writeStartElement("span", ["style":"color:red;", "title":noteContent]);
                    pagedWriter.writeCharacters("[*]");
                    pagedWriter.writeEndElement(); // </span>
                }
                break;
            case "lg":
            case "p":
            case "q":
                pagedWriter.writeEndElement(); // </p>
                break;
            case "a":
                pagedWriter.writeEndElement(); // </a>
                break;
            case "form":
                flagform = false;
                break;
            case "ana":
                flaginterp = false;
                break;
            case "w":
                int l = lastword.length();
                String endOfLastWord = (l > 0) ? lastword.substring(l - 1) : "";

                boolean noSpace = NoSpaceBefore.contains(wordvalue) ||
                        NoSpaceAfter.contains(lastword) ||
                        wordvalue.startsWith("-") ||
                        NoSpaceAfter.contains(endOfLastWord);
                if (!noSpace) {
                    pagedWriter.writeCharacters(" ");
                }

                pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                pagedWriter.writeCharacters(wordvalue);
                pagedWriter.writeEndElement(); // </span>
                lastword = wordvalue;
                break;
        }
    }

    /** Handles a CHARACTERS event: routes the text to the word form, the interpretation or the note. */
    private void processCharacters(String text) {
        if (text.length() == 0) {
            return;
        }

        if (flagform) {
            wordvalue += text;
            if (flagNote) {
                noteContent += text.replace("\n", " ");
            }
        } else if (flaginterp) {
            interpvalue += text;
        } else if (flagNote) {
            noteContent += text.replace("\n", " ");
        }
    }
}