Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xml / pager.groovy @ 1688

History | View | Annotate | Download (12.5 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2017-05-02 11:55:17 +0200 (mar. 02 mai 2017) $
// $LastChangedRevision: 3436 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.importer.xml;
29

    
30
import org.txm.scripts.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.scripts.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.*
37
import javax.xml.stream.*;
38
import java.net.URL;
39
import org.xml.sax.InputSource;
40
import org.xml.sax.helpers.DefaultHandler;
41
import org.txm.utils.io.FileCopy
42

    
43
/** Build a simple edition from a xml-tei. 
44
 * @author mdecorde */
45
class pager {
46

    
47
        /** Word forms that are written without a preceding space. */
        List<String> NoSpaceBefore;

        /** Word forms (or final characters of a word) after which no space is written. */
        List<String> NoSpaceAfter;

        /** Number of words counted since the current page was started. */
        int wordcount = 0;

        /** Number of pages created so far; also used to number the page files. */
        int pagecount = 0;

        /** Word count at which a new page is started. */
        int wordmax = 0;

        /** The corpus base name, used for the page title and stylesheet names. */
        String basename = "";

        /** Prefix of the generated page file names. */
        String txtname = "";

        /** Directory that receives the generated page files. */
        File outdir;

        /** The "id" attribute of the word element currently being read. */
        String wordid;

        /** True until the first word of the current page has been seen. */
        boolean firstWord = true;

        /** Text accumulated from the "form" element of the current word. */
        String wordvalue = "";

        /** Text accumulated from the "ana" elements of the current word; written as the word tooltip. */
        String interpvalue = "";

        /** Form of the previously written word. */
        String lastword = " ";

        /** The wordtype. Not referenced in the visible code. */
        String wordtype;

        /** True while the parser is inside a "form" element. */
        boolean flagform = false;

        /** True while the parser is inside an "ana" element. */
        boolean flaginterp = false;

        /** URL of the input file. */
        private def url;

        /** Stream opened on the input file. */
        private def inputData;

        /** StAX factory used to create the parser. */
        private def factory;

        /** StAX reader over the input stream. */
        private XMLStreamReader parser;

        /** The writer. Not referenced in the visible code. */
        OutputStreamWriter writer;

        /** Writer of the page currently being produced; null before the first page. */
        StaxStackWriter pagedWriter = null;

        /** The XML file being paginated. */
        File infile;

        /** The outfile. Never assigned in the visible code. */
        File outfile;

        /** The page files created so far, in creation order. */
        ArrayList<File> pages = new ArrayList<File>();

        /** For each page, the id of its first word ("w_0" when the page has no word). */
        ArrayList<String> idxstart = new ArrayList<String>();

        /** Local name of the XML element that forces a page break. */
        String paginationElement;
120

    
121
        /**
         * Instantiates a new pager and immediately paginates {@code infile}.
         *
         * The constructor opens the input file, runs {@link #process()} and then
         * tries, on a best-effort basis, to copy the default TXM stylesheet next
         * to the generated edition.
         *
         * @param infile the XML file to paginate
         * @param outdir the directory that receives the generated HTML pages
         * @param txtname the prefix of the generated page file names
         * @param NoSpaceBefore the word forms written without a preceding space
         * @param NoSpaceAfter the word forms (or final characters) not followed by a space
         * @param max the word count at which a new page is started
         * @param basename the corpus name, used for the page title and stylesheet names
         * @param paginationElement the local name of the element that forces a new page
         */
        pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
        List<String> NoSpaceAfter, int max, String basename, String paginationElement) {
                this.paginationElement = paginationElement;
                this.basename = basename;
                this.txtname = txtname;
                this.outdir = outdir;
                this.wordmax = max;
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;
                this.url = infile.toURI().toURL();
                this.infile = infile;

                inputData = url.openStream();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                } catch (Exception e) {
                        // do not leak the stream when the parser cannot be created
                        inputData.close();
                        throw e;
                }

                process();

                copyDefaultCss();
        }

        /**
         * Copies "css/txm.css" from the TXM home directory next to the HTML edition.
         *
         * Only done when the input file sits in the expected
         * {@code <txmhome>/corpora/<corpus>/txm/<dir>/<file>} layout and both the
         * edition directory and the stylesheet exist. Any failure is reported on
         * standard output and otherwise ignored.
         */
        private void copyDefaultCss() {
                try {
                        File txmDir = infile.getParentFile()?.getParentFile();
                        if (txmDir == null || !txmDir.exists() || !"txm".equals(txmDir.getName())) {
                                return;
                        }

                        File binDir = txmDir.getParentFile();
                        File corporaDir = binDir?.getParentFile();
                        if (binDir == null || corporaDir == null || !binDir.exists()
                        || !"corpora".equals(corporaDir.getName())) {
                                return;
                        }

                        File htmlDir = new File(binDir, "HTML/"+basename.toUpperCase()+"/default")
                        File txmhomeDir = corporaDir.getParentFile()
                        File txmcss = new File(txmhomeDir, "css/txm.css")
                        if (htmlDir.exists() && txmcss.exists()) {
                                FileCopy.copy(txmcss, new File(htmlDir, "txm.css"))
                        }
                } catch(Exception e) {
                        println "Failed to copy TXM default CSS: "+e.getLocalizedMessage();
                }
        }
168

    
169
        /**
         * Finishes the page currently being written, if any.
         *
         * When the page received no word, a placeholder "w_0" anchor is written
         * and recorded as the first word id of the page.
         *
         * @return a copy of the tags that were open when the page was closed,
         * or an empty list when no page was being written
         */
        private def closeMultiWriter()
        {
                if (pagedWriter == null) {
                        return [];
                }

                // snapshot taken before the end tags are written
                def openTags = pagedWriter.getTagStack().clone();

                if (firstWord) { // no word was found on this page
                        // presumably forces the pending start tag to be completed
                        // before the raw write below; confirm in StaxStackWriter
                        pagedWriter.writeCharacters("");
                        this.idxstart.add("w_0")
                        pagedWriter.write("<span id=\"w_0\"/>");
                }

                pagedWriter.writeEndElements();
                pagedWriter.close();
                return openTags;
        }
186

    
187
        /**
         * Closes the current page (if any) and starts the next one.
         *
         * The new page file is named {@code <txtname>_<pagecount>.html} in
         * {@code outdir}. The HTML prologue is written, then the content tags
         * that were still open on the previous page are re-opened so that the
         * content can continue across the page break.
         *
         * @return true, if successful; false when an exception was raised (its
         * message is printed on standard output)
         */
        private boolean createNextOutput() {
                wordcount = 0;
                try {
                        def tags = closeMultiWriter();

                        // Drop every tag up to and including the first "div";
                        // the list ends up empty when it holds no "div".
                        // Presumably these are the page wrapper tags that are
                        // written again below; confirm in StaxStackWriter.
                        while (tags.size() > 0) {
                                String dropped = tags[0]
                                tags.remove(0)
                                if ("div" == dropped) {
                                        break;
                                }
                        }

                        File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html")
                        pages.add(pageFile);
                        firstWord = true; // waiting for next word

                        pagedWriter = new StaxStackWriter(pageFile, "UTF-8");

                        //pagedWriter.writeStartDocument()
                        pagedWriter.writeDTD("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">")
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeStartElement("html");
                        pagedWriter.writeCharacters("\n")

                        // <meta> and <link> belong inside <head>
                        pagedWriter.writeStartElement("head");
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"txm.css"]);
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"${basename}.css"]);
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
                        pagedWriter.writeEndElement(); // </title>
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeEndElement() // </head>
                        pagedWriter.writeCharacters("\n")

                        pagedWriter.writeStartElement("body") //<body>
                        pagedWriter.writeCharacters("\n")
                        pagedWriter.writeStartElement("div") //<div class="txmeditionpage">
                        pagedWriter.writeAttribute("class", "txmeditionpage")

                        // re-open the content tags inherited from the previous page
                        pagedWriter.writeStartElements(tags);
                        return true;
                } catch (Exception e) {
                        System.out.println("Error: "+e.getLocalizedMessage());
                        return false;
                }
        }
242

    
243
        /**
         * Creates the output by starting the next page.
         *
         * Plain alias of {@link #createNextOutput()}, which already catches its
         * own exceptions and turns them into a {@code false} result.
         *
         * @return true, if successful
         */
        private boolean createOutput() {
                return createNextOutput();
        }
257

    
258
        /**
         * Returns the page files created so far, in creation order.
         *
         * @return the internal list of page files (not a copy)
         */
        public ArrayList<File> getPageFiles() {
                return this.pages;
        }
266

    
267
        /**
         * Returns, for each page, the id of its first word ("w_0" for a page
         * that contains no word).
         *
         * @return the internal list of first word ids (not a copy)
         */
        public ArrayList<String> getIdx() {
                return this.idxstart;
        }
275

    
276
        /**
         * Reads the input XML stream and writes the paginated HTML edition.
         *
         * A new page is started before the first event, on each element named
         * {@code paginationElement} and when the word count of the current page
         * reaches {@code wordmax}. Handled elements: "text" (attribute table),
         * "note" (tooltip marker), "a", "head" (h2), "graphic" (img),
         * "lg"/"p"/"q" (paragraph), "lb"/"br" (line break) and "w" with its
         * "form" and "ana" children (one span per word). Other elements are
         * ignored. The parser and the input stream are closed in all cases.
         */
        void process() {
                boolean flagNote = false;
                String noteContent = "";
                String localname = "";

                try {
                        createNextOutput();
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName();

                                                if (localname == paginationElement) {
                                                        createNextOutput();
                                                        pagedWriter.write("\n");
                                                        String pageNumber = parser.getAttributeValue(null,"n");
                                                        if (pageNumber != null) {
                                                                pagedWriter.writeElement("p", ["style":"color:red", "align":"center"], "- "+pageNumber+" -")
                                                        }
                                                }

                                                switch (localname) {
                                                        case "text":
                                                                pagedWriter.write("\n");
                                                                String textId = parser.getAttributeValue(null,"id");
                                                                if (textId != null) {
                                                                        pagedWriter.writeElement("h3", textId)
                                                                }
                                                                // one table row per attribute of the text element
                                                                pagedWriter.writeStartElement("table");
                                                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                                        pagedWriter.writeStartElement("tr");
                                                                        pagedWriter.writeElement("td", parser.getAttributeLocalName(i));
                                                                        pagedWriter.writeElement("td", parser.getAttributeValue(i));
                                                                        pagedWriter.writeEndElement();
                                                                }
                                                                pagedWriter.writeEndElement() // table
                                                                pagedWriter.writeEmptyElement("br")
                                                                pagedWriter.writeCharacters("");
                                                                break;
                                                        case "note":
                                                                flagNote = true;
                                                                noteContent = ""
                                                                break;
                                                        case "a":
                                                                pagedWriter.writeStartElement("a")
                                                                String href = parser.getAttributeValue(null,"href");
                                                                if (href != null) { // do not hand a null value to the writer
                                                                        pagedWriter.writeAttribute("href", href);
                                                                }
                                                                String target = parser.getAttributeValue(null,"target");
                                                                if (target == null) {
                                                                        target = "_blank";
                                                                }
                                                                pagedWriter.writeAttribute("target", target);
                                                                break;
                                                        case "head":
                                                                pagedWriter.write("\n");
                                                                pagedWriter.writeStartElement("h2")
                                                                break;
                                                        case "graphic":
                                                                pagedWriter.write("\n");
                                                                String imgUrl = parser.getAttributeValue(null, "url")
                                                                if (imgUrl != null) {
                                                                        pagedWriter.writeStartElement("div")
                                                                        pagedWriter.writeEmptyElement("img", ["src":imgUrl])
                                                                        pagedWriter.writeEndElement() // div
                                                                }
                                                                break;
                                                        case "lg":
                                                        case "p":
                                                        case "q":
                                                                pagedWriter.write("\n");
                                                                String rend = parser.getAttributeValue(null, "rend")
                                                                if (rend == null) {
                                                                        rend = "normal"
                                                                }
                                                                pagedWriter.writeStartElement("p",["class":rend])
                                                                break;
                                                        case "lb":
                                                        case "br":
                                                                pagedWriter.writeEmptyElement("br")
                                                                break;
                                                        case "w":
                                                                wordid = parser.getAttributeValue(null,"id");

                                                                // NOTE(review): createNextOutput() resets wordcount, so the word
                                                                // that triggers the break is not counted on the new page; confirm
                                                                // this is intended.
                                                                wordcount++;
                                                                if (wordcount >= wordmax) {
                                                                        createNextOutput();
                                                                }

                                                                if (firstWord) {
                                                                        firstWord = false;
                                                                        this.idxstart.add(wordid);
                                                                }
                                                                break;
                                                        case "ana":
                                                                flaginterp = true;
                                                                // the first character of @type is dropped (presumably a leading "#");
                                                                // a missing or empty @type yields an empty property name
                                                                String anaType = parser.getAttributeValue(null,"type");
                                                                String anaName = (anaType != null && anaType.length() > 0) ? anaType.substring(1) : "";
                                                                interpvalue += " "+anaName+":"
                                                                break;
                                                        case "form":
                                                                wordvalue = "";
                                                                interpvalue = "";
                                                                flagform = true;
                                                                break;
                                                }
                                                break;
                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName();
                                                switch (localname) {
                                                        case "head":
                                                                pagedWriter.writeEndElement() // </h2>
                                                                pagedWriter.writeCharacters("\n")
                                                                break;
                                                        case "note":
                                                                flagNote = false;
                                                                if (noteContent.length() > 0) {
                                                                        // the note text becomes the tooltip of a "[*]" marker
                                                                        pagedWriter.writeStartElement("span", ["style":"color:red;", "title":noteContent]);
                                                                        pagedWriter.writeCharacters("[*]");
                                                                        pagedWriter.writeEndElement() // </span>
                                                                }
                                                                break;
                                                        case "lg":
                                                        case "p":
                                                        case "q":
                                                                pagedWriter.writeEndElement() // </p>
                                                                pagedWriter.writeCharacters("\n")
                                                                break;
                                                        case "a":
                                                                pagedWriter.writeEndElement() // </a>
                                                                break;
                                                        case "form":
                                                                flagform = false
                                                                break;
                                                        case "ana":
                                                                flaginterp = false
                                                                break;
                                                        case "w":
                                                                int l = lastword.length();
                                                                String endOfLastWord = (l > 0) ? lastword.substring(l-1) : "";

                                                                boolean glued = NoSpaceBefore.contains(wordvalue) ||
                                                                                NoSpaceAfter.contains(lastword) ||
                                                                                wordvalue.startsWith("-") ||
                                                                                NoSpaceAfter.contains(endOfLastWord);
                                                                if (!glued) {
                                                                        pagedWriter.writeCharacters(" ");
                                                                }

                                                                pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid]);
                                                                pagedWriter.writeCharacters(wordvalue);
                                                                pagedWriter.writeEndElement();
                                                                // line break emitted inside a comment
                                                                pagedWriter.writeComment("\n")
                                                                lastword = wordvalue;
                                                                break;
                                                }
                                                break;
                                        case XMLStreamConstants.CHARACTERS:
                                                String text = parser.getText();
                                                if (text.length() > 0) {
                                                        if (flagform) {
                                                                wordvalue += text;
                                                                if (flagNote) {
                                                                        noteContent += text.replace("\n", " ");
                                                                }
                                                        } else if (flaginterp) {
                                                                interpvalue += text;
                                                        } else if (flagNote) {
                                                                noteContent += text.replace("\n", " ");
                                                        }
                                                }
                                                break;
                                }
                        }
                        closeMultiWriter();
                } finally {
                        // release the input even when the pagination failed
                        try {
                                if (parser != null) parser.close();
                        } finally {
                                if (inputData != null) inputData.close();
                        }
                }
        }
455
}