Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / bfm / pager.groovy @ 187

History | View | Annotate | Download (15.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
41
// $LastChangedRevision: 3400 $
42
// $LastChangedBy: mdecorde $
43
//
44
package org.txm.importer.bfm;
45

    
46
import org.txm.importer.*;
47
import org.xml.sax.Attributes;
48
import org.txm.importer.filters.*;
49
import java.io.File;
50
import java.io.IOException;
51
import java.util.ArrayList;
52
import javax.xml.parsers.SAXParserFactory;
53
import javax.xml.parsers.ParserConfigurationException;
54
import javax.xml.parsers.SAXParser;
55
import javax.xml.stream.*;
56
import java.net.URL;
57
import org.xml.sax.InputSource;
58
import org.xml.sax.helpers.DefaultHandler;
59

    
60
// TODO: Auto-generated Javadoc
61
/**
62
 * Build BFM texts HTML edition from the TEI-TXM files. <br/>
63
 * The result is similar to the one of the XSLT script made by Alexis Lavrentiev
64
 * (BFM & ICAR3). <br/>
65
 * 
66
 * @author mdecorde
67
 */
68
class pager {
69
        private List<String> NoSpaceBefore;
70

    
71
        /** The No space after. */
72
        private List<String> NoSpaceAfter;
73

    
74
        /** The wordcount. */
75
        private int wordcount = 0;
76

    
77
        /** The pagecount. */
78
        private int pagecount = 0;
79

    
80
        /** The wordmax. */
81
        private int wordmax = 0;
82

    
83
        /** The wordid. */
84
        private String wordid;
85

    
86
        /** The first word. */
87
        private boolean firstWord = true;
88

    
89
        /** The wordvalue. */
90
        private String wordvalue;
91

    
92
        /** The interpvalue. */
93
        private String interpvalue;
94

    
95
        /** The lastword. */
96
        private String lastword = " ";
97

    
98
        /** The wordtype. */
99
        private String wordtype;
100

    
101
        /** The flagform. */
102
        private boolean flagform = false;
103

    
104
        /** The flaginterp. */
105
        private boolean flaginterp = false;
106

    
107
        /** The url. */
108
        private URL url;
109

    
110
        /** The input data. */
111
        private def inputData;
112

    
113
        /** The factory. */
114
        private def factory;
115

    
116
        /** The parser. */
117
        private XMLStreamReader parser;
118

    
119
        /** The writer. */
120
        private OutputStreamWriter writer;
121

    
122
        /** The multiwriter. */
123
        private OutputStreamWriter multiwriter = null;
124
        StaxStackWriter pagedWriter = null;
125

    
126
        /** The infile. */
127
        private File infile;
128

    
129
        /** The outfile. */
130
        private File outfile;
131
        private File outDir;
132

    
133
        /** The pages. */
134
        ArrayList<File> pages = new ArrayList<File>();
135

    
136
        /** The idxstart. */
137
        ArrayList<String> idxstart = new ArrayList<String>();
138

    
139
        /** The titre id. */
140
        private String titreId;
141

    
142
        private String basename;
143
        private String textname;
144

    
145
        private def xpathProperties;
146

    
147
        /**
         * Instantiates a new pager and immediately runs the conversion.
         *
         * The constructor opens {@code infile}, creates the StAX reader on it and
         * calls {@link #process()}. The reader and the input stream are released
         * before the constructor returns, whether processing succeeded or threw.
         *
         * @param infile the file to convert
         * @param outDir the directory in which the HTML page files are created
         * @param textname the text name, used as prefix of the page file names and in the page titles
         * @param NoSpaceBefore the punctuation marks that don't have a space before
         * @param NoSpaceAfter the punctuation marks that don't have a space after
         * @param max the max number of word per page
         * @param basename the base name, written upper-cased in the page titles
         * @param xpathprops metadata name to XPath query, each query being evaluated against infile
         */
        pager(File infile, File outDir, String textname, List<String> NoSpaceBefore,
        List<String> NoSpaceAfter, int max, String basename, Properties xpathprops) {
                this.wordmax = max;
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;
                this.basename = basename;
                this.textname = textname;
                this.outDir = outDir;
                this.xpathProperties = xpathprops;
                this.url = infile.toURI().toURL();
                this.infile = infile;

                inputData = url.openStream();
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        process();
                } finally {
                        // XMLStreamReader.close() does not close the underlying input
                        // source, so the reader and the stream are both released here.
                        try {
                                if (parser != null) {
                                        parser.close();
                                }
                        } finally {
                                inputData.close();
                        }
                }
        }
173

    
174
        /**
         * Finishes the page currently being written, if there is one.
         *
         * When no word was registered on the page (firstWord is still true), the
         * placeholder id "w_0" is added to idxstart and an empty span carrying
         * that id is written before the page is closed.
         *
         * @return a copy of the writer's tag stack taken before the elements were
         *         closed, or an empty list when no page writer exists yet
         */
        private def closeMultiWriter()
        {
                if (pagedWriter == null) {
                        return [];
                }

                // snapshot first: writeEndElements() below unwinds the writer's stack
                def openTags = pagedWriter.getTagStack().clone();

                if (firstWord) { // there was no words
                        idxstart.add("w_0")
                        pagedWriter.write("<span id=\"w_0\"/>");
                }

                pagedWriter.writeEndElements();
                pagedWriter.close();
                return openTags;
        }
190

    
191
        /**
         * Closes the current page (if any) and opens the next one.
         *
         * The new page file is named textname + "_" + page number + ".html" in
         * outDir and is appended to the page list. The word counter is reset and
         * the pager waits again for the first word of the page. The elements that
         * were open below "body" on the previous page are re-opened on the new one.
         *
         * @return true, if successful; false when an exception was raised (its
         *         message is printed on standard output)
         */
        private boolean createNextOutput()
        {
                wordcount = 0;
                try {
                        def carriedTags = closeMultiWriter();

                        // Drop the leading entries up to and including "body": the page
                        // skeleton written below re-creates them itself.
                        while (!carriedTags.isEmpty()) {
                                String tag = carriedTags.remove(0)
                                if ("body" == tag) {
                                        break;
                                }
                        }

                        pagecount++;
                        File pageFile = new File(outDir, textname+"_"+pagecount+".html")
                        pages.add(pageFile);
                        firstWord = true; // waiting for next word

                        pagedWriter = new StaxStackWriter(pageFile, "UTF-8");

                        pagedWriter.writeStartDocument("UTF-8", "1.0")
                        pagedWriter.writeStartElement("html")
                        pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
                        pagedWriter.writeStartElement("head")
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(basename.toUpperCase()+" - "+textname+" - Edition - Page "+pagecount)
                        pagedWriter.writeEndElement() // </title>
                        pagedWriter.writeEndElement() // </head>
                        pagedWriter.writeStartElement("body") //<body>
                        pagedWriter.writeStartElements(carriedTags);
                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
232

    
233
        /**
         * Creates the output by delegating to {@link #createNextOutput()}.
         *
         * @param outfile not used by this method; the page file is chosen by createNextOutput()
         * @return true, if successful
         */
        private boolean createOutput(File outfile) {
                boolean created;
                try {
                        created = createNextOutput();
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        created = false;
                }
                return created;
        }
247

    
248
        /**
         * Gives access to the list of page files.
         *
         * @return the page files created during the processing, in creation order
         *         (the internal list itself, not a copy)
         */
        public ArrayList<File> getPageFiles() {
                return this.pages;
        }
256

    
257
        /**
         * Gives access to the word ids registered as page starts.
         *
         * @return the ids collected during the processing: the id of the first
         *         word registered on each page, or "w_0" for a page closed without
         *         any word (the internal list itself, not a copy)
         */
        public ArrayList<String> getIdx() {
                return this.idxstart;
        }
265

    
266
        /**
         * Processes the infile: reads it with the StAX parser and writes the
         * paginated HTML edition through pagedWriter.
         *
         * A new page is started at every "pb" element and each time the word
         * counter reaches wordmax. Each "w" element is written, when its end tag
         * is met, as a "span" carrying the word id, a title built from the "ana"
         * values and a colour depending on the enclosing corr/reg/foreign/supplied
         * elements. Inside a "choice" element only the words located in "corr" or
         * "reg" are written.
         */
        void process() {
                String localname = "";

                // set when a <div type="titre"> / <div type="amen"> opened an <h3>
                boolean closeTitre = false;
                boolean closeAmen = false;

                boolean flagchoice = false;
                boolean flagcorr = false;
                boolean flagsic = false;
                boolean flagreg = false;
                boolean flagorig = false;
                boolean flagforeign = false;
                int levelSupplied = 0;
                boolean flagSurplus = false;
                boolean flagDel = false;

                // option for metadata from BFM TEI header
                def mValues = loadHeaderMetadata();

                createNextOutput();
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                {
                        switch (event) {
                                case XMLStreamConstants.START_ELEMENT:
                                        localname = parser.getLocalName();
                                        switch (localname) {
                                                case "text":
                                                        writeTextHeader(mValues);
                                                        break;
                                                case "head":
                                                        pagedWriter.writeStartElement("h2")
                                                        break;
                                                case "lg":
                                                case "p":
                                                        pagedWriter.writeStartElement("p")
                                                        break;
                                                case "pb":
                                                        createNextOutput();
                                                        if (parser.getAttributeValue(null,"n") != null) {
                                                                pagedWriter.writeStartElement("p", ["style":"color:red", "align":"center"]);
                                                                pagedWriter.writeCharacters("- "+parser.getAttributeValue(null,"n")+" -")
                                                                pagedWriter.writeEndElement() // p
                                                        }
                                                        break;

                                                case "ab":
                                                        if (parser.getAttributeValue(null,"n") != null) {
                                                                pagedWriter.writeStartElement("p", ["align":"center"]);
                                                                pagedWriter.writeCharacters(parser.getAttributeValue(null,"n"))
                                                                pagedWriter.writeEndElement() // p
                                                        }
                                                        break;
                                                case "l":
                                                        pagedWriter.writeEmptyElement("br");
                                                        break;
                                                case "div":
                                                        String divType = parser.getAttributeValue(null,"type");
                                                        if (divType == "titre") {
                                                                pagedWriter.writeStartElement("h3")
                                                                closeTitre = true;
                                                        } else if (divType == "amen") {
                                                                pagedWriter.writeStartElement("h3")
                                                                closeAmen = true;
                                                        }
                                                        break;
                                                case "w":
                                                        interpvalue = "";
                                                        for (int i = 0 ; i < parser.getAttributeCount(); i++) {
                                                                if (parser.getAttributeLocalName(i).equals("type")) {
                                                                        interpvalue = " pos: "+parser.getAttributeValue(i);
                                                                }
                                                        }
                                                        wordid = parser.getAttributeValue(null, "id");

                                                        if (firstWord) {
                                                                firstWord = false;
                                                                this.idxstart.add(wordid);
                                                        }
                                                        wordcount++;
                                                        // NOTE(review): when the limit is reached the page is switched
                                                        // here, but the span of this word is only written at its end
                                                        // tag, i.e. in the new page, while the id registered for that
                                                        // page will be the one of the following word - confirm this is
                                                        // the intended page-start index.
                                                        if (wordcount >= wordmax) {
                                                                createNextOutput();
                                                        }
                                                        break;

                                                case "choice":
                                                        flagchoice = true;
                                                        break;
                                                case "corr":
                                                        flagcorr = true;
                                                        break;
                                                case "sic":
                                                        flagsic = true;
                                                        break;
                                                case "reg":
                                                        flagreg = true;
                                                        break;
                                                case "orig":
                                                        flagorig = true;
                                                        break;
                                                case "foreign":
                                                        flagforeign = true;
                                                        break;
                                                case "supplied":
                                                        levelSupplied = levelSupplied +1;
                                                        if (flagform) {
                                                                wordvalue = wordvalue+"[";
                                                        }
                                                        break;

                                                case "surplus":
                                                        flagSurplus = true;
                                                        pagedWriter.writeStartElement("span", ["class": "surplus", "style":"color:red;"])
                                                        break;

                                                case "del":
                                                        flagDel = true;
                                                        pagedWriter.writeStartElement("span", ["class": "del", "style":"color:red;text-decoration:line-through;"])
                                                        break;

                                                case "ana":
                                                        flaginterp=true;
                                                        // the first character of @type is dropped to get the label;
                                                        // a missing or empty @type yields an empty label instead of
                                                        // aborting the whole conversion
                                                        String anaType = parser.getAttributeValue(null,"type");
                                                        String anaLabel = "";
                                                        if (anaType != null && anaType.length() > 0) {
                                                                anaLabel = anaType.substring(1);
                                                        }
                                                        interpvalue+=" "+anaLabel+":"
                                                        break;

                                                case "form":
                                                        wordvalue="";
                                                        flagform=true;
                                        }
                                        break;
                                case XMLStreamConstants.END_ELEMENT:
                                        localname = parser.getLocalName();
                                        switch (localname) {
                                                case "head":
                                                        pagedWriter.writeEndElement() // h2
                                                        break;

                                                case "div":
                                                        if (closeTitre) {
                                                                pagedWriter.writeEndElement() // h3
                                                                closeTitre = false;
                                                        } else if (closeAmen) {
                                                                pagedWriter.writeEndElement() // h3
                                                                // reset the flag of THIS branch, otherwise every later
                                                                // </div> would close one more element
                                                                closeAmen = false;
                                                        }
                                                        break;

                                                case "lb":
                                                        pagedWriter.writeEmptyElement("br")
                                                        break;
                                                case "lg":
                                                case "p":
                                                        pagedWriter.writeEndElement() // p
                                                        break;

                                                case "choice":
                                                        flagchoice = false;
                                                        break;
                                                case "corr":
                                                        flagcorr = false;
                                                        break;
                                                case "sic":
                                                        flagsic = false;
                                                        break;
                                                case "reg":
                                                        flagreg = false;
                                                        break;
                                                case "orig":
                                                        flagorig = false;
                                                        break;
                                                case "foreign":
                                                        flagforeign = false;
                                                        break;
                                                case "supplied":
                                                        levelSupplied = levelSupplied -1;
                                                        if (flagform) {
                                                                wordvalue = wordvalue+"]";
                                                        }
                                                        break;
                                                case "surplus":
                                                        flagSurplus = false;
                                                        // NOTE(review): a ")" is written here although no "(" is
                                                        // written when the element starts - confirm which is intended.
                                                        pagedWriter.writeCharacters(")")
                                                        pagedWriter.writeEndElement() // span
                                                        break;

                                                case "del":
                                                        flagDel = false;
                                                        pagedWriter.writeEndElement() // span
                                                        break;

                                                case "form":
                                                        flagform = false
                                                        break;

                                                case "ana":
                                                        flaginterp = false
                                                        break;

                                                case "w":
                                                        int l = lastword.length();
                                                        String color = "";
                                                        if (flagcorr) {
                                                                color = "color: green;";
                                                        } else if (flagreg) {
                                                                color = "color: darkgreen;";
                                                        } else if (flagforeign) {
                                                                color = "color: darkred;";
                                                        } else if (levelSupplied == 1) {
                                                                color = "color: blue;";
                                                        } else if (levelSupplied == 2) {
                                                                color = "color: darkblue;";
                                                        }
                                                        // inside <choice> only the <corr>/<reg> alternative is written
                                                        if (!flagchoice || flagcorr || flagreg) {
                                                                String endOfLastWord = "";
                                                                if (l > 0) {
                                                                        endOfLastWord = lastword.substring(l-1);
                                                                }

                                                                boolean glued = NoSpaceBefore.contains(wordvalue) ||
                                                                                NoSpaceAfter.contains(lastword) ||
                                                                                wordvalue.startsWith("-") ||
                                                                                NoSpaceAfter.contains(endOfLastWord);
                                                                if (!glued) {
                                                                        pagedWriter.writeCharacters(" ")
                                                                }

                                                                pagedWriter.writeStartElement("span", ["title":interpvalue, "id":wordid, "style":color]) // span
                                                                pagedWriter.writeCharacters(wordvalue)
                                                                pagedWriter.writeEndElement() // span
                                                                lastword=wordvalue;
                                                        }
                                                        break;
                                        }
                                        break;

                                case XMLStreamConstants.CHARACTERS:
                                        String text = parser.getText();
                                        if (flagform && text.length() > 0) {
                                                wordvalue += text;
                                        }
                                        if (flaginterp && text.length() > 0) {
                                                interpvalue += text;
                                        }
                                        break;
                        }
                }
                closeMultiWriter();
        }

        /**
         * Evaluates every query of xpathProperties against infile.
         *
         * Line breaks in the values are replaced by spaces, "N/A" is used when a
         * query gives no result, and the "forme" value is stripped of its leading
         * "#forme_" (or, failing that, of its leading "#").
         *
         * @return the metadata values indexed by property name
         */
        private def loadHeaderMetadata() {
                def mValues = [:];
                def xpathprocessor = new XPathResult(infile)
                for (String name : xpathProperties.keySet()) {
                        String value = xpathprocessor.getXpathResponse(xpathProperties.get(name), "N/A");
                        mValues.put(name, value.replace("\n", " "));
                }

                String forme = mValues.get("forme");
                if (forme != null && !forme.equals("N/A")) {
                        if (forme.startsWith("#forme_")) {
                                mValues.put("forme", forme.substring(7));
                        } else if (forme.startsWith("#")) {
                                mValues.put("forme", forme.substring(1));
                        }
                }
                return mValues;
        }

        /**
         * Writes the title block of the edition on the current page: the title
         * as a centred "h1" followed by a paragraph listing the metadata.
         *
         * @param mValues the metadata values, as built by loadHeaderMetadata()
         */
        private void writeTextHeader(def mValues) {
                // "font-size:200%": the former "font-size=200%" is not a valid CSS declaration
                pagedWriter.writeStartElement("h1",
                ["align":"center", "style":"color:darkviolet;font-family:Times;font-weight:bold;font-style:italic;font-size:200%", "class":"text"])
                pagedWriter.writeCharacters(mValues.get("titre"))
                pagedWriter.writeEndElement(); // h1
                pagedWriter.writeStartElement("p")
                pagedWriter.writeEmptyElement("hr")
                pagedWriter.writeCharacters("auteur : "+mValues.get("auteur"))
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeCharacters("date de composition : "+mValues.get("datecompolibre"));
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeCharacters("domaine : "+mValues.get("domaine"))
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeCharacters("genre : "+mValues.get("genre"))
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeCharacters("forme : "+mValues.get("forme"))
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeCharacters("dialecte : "+mValues.get("dialecte"))
                pagedWriter.writeEmptyElement("br");
                pagedWriter.writeEmptyElement("hr")
                pagedWriter.writeEndElement(); // p
        }
545
}