Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / bvh / pager.groovy @ 187

History | View | Annotate | Download (9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.bvh;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds a simple HTML edition of a BVH text from its TEI-TXM file. <br/>
 * The result is similar to the edition of the BVH site. <br/>
 * Two outputs are produced: one HTML file holding the whole text, and one
 * HTML file per page in a "multi" directory next to it. A new page is started
 * at each &lt;pb&gt; tag. <br/>
 * All the work is done by the constructor; the results are then available
 * through {@link #getPageFiles()} and {@link #getIdx()}.
 *
 * @author mdecorde
 */
class pager {
	/** Word forms that are written without a space before them. */
	private List<String> NoSpaceBefore;

	/** Word forms (or last characters of a form) that are written without a space after them. */
	private List<String> NoSpaceAfter;

	/** Number of words read since the counter was last reset. */
	private int wordcount = 0;

	/** Number of page files created so far. */
	private int pagecount = 0;

	/**
	 * Word threshold received from the constructor. It only resets
	 * {@link #wordcount}; pages are split at &lt;pb&gt; tags, not on this value.
	 */
	private int wordmax = 0;

	/** Value of the "id" attribute of the current (or last read) word. */
	private String wordid;

	/** True while the current page has not received its first word yet. */
	private boolean firstWord = true;

	/** Graphical form of the current word. */
	private String wordvalue = "";

	/** Annotations of the current word, shown as the tooltip of the word. */
	private String interpvalue = "";

	/** Form of the previously written word. */
	private String lastword = " ";

	/** The wordtype (not used by this class). */
	private String wordtype;

	/** True while the parser is inside a "form" element. */
	private boolean flagform = false;

	/** True while the parser is inside an "ana" element. */
	private boolean flaginterp = false;

	/** URL of the input file. */
	private def url;

	/** Stream opened on the input file. */
	private def inputData;

	/** StAX factory used to create the parser. */
	private def factory;

	/** StAX parser reading the input file. */
	private XMLStreamReader parser;

	/** Writer of the single-file edition. */
	private OutputStreamWriter writer;

	/** Writer of the current page file. */
	private OutputStreamWriter multiwriter = null;

	/** The input file. */
	private File infile;

	/** The single-file edition. */
	private File outfile;

	/** The page files, in creation order. */
	private ArrayList<File> pages = new ArrayList<File>();

	/** For each page file, the id of its first word (same order as {@link #pages}). */
	private ArrayList<String> idxstart = new ArrayList<String>();

	/**
	 * Instantiates a new pager and immediately builds the edition.
	 *
	 * @param infile the file to convert
	 * @param outfile the result file
	 * @param NoSpaceBefore the punctuation marks that don't have a space before
	 * @param NoSpaceAfter the punctuation marks that don't have a space after
	 * @param max word counter threshold (kept for compatibility; pages are split at &lt;pb&gt; tags)
	 * @throws IOException if the input cannot be read or an output file cannot be created
	 */
	pager(File infile, File outfile, List<String> NoSpaceBefore,
			List<String> NoSpaceAfter, int max) {
		this.wordmax = max;
		// a missing list simply means "no special spacing rule"
		this.NoSpaceBefore = (NoSpaceBefore != null) ? NoSpaceBefore : new ArrayList<String>();
		this.NoSpaceAfter = (NoSpaceAfter != null) ? NoSpaceAfter : new ArrayList<String>();
		this.url = infile.toURI().toURL();
		this.infile = infile;

		inputData = url.openStream();
		try {
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
			// fail here with a clear message instead of a NullPointerException later in process()
			if (!createOutput(outfile)) {
				throw new IOException("Cannot create the edition files for " + outfile);
			}
			process();
		} finally {
			// process() closes everything on success; this releases the
			// resources when something went wrong on the way
			closeQuietly(writer);
			closeQuietly(multiwriter);
			closeQuietly(parser);
			closeQuietly(inputData);
		}
	}

	/**
	 * Closes the current page file (if any) and opens the next one in the
	 * "multi" directory located next to the single-file edition.
	 *
	 * @return true, if successful
	 */
	private boolean createNextOutput()
	{
		try {
			if (multiwriter != null) {
				multiwriter.write("</body>");
				multiwriter.write("</html>");
				multiwriter.close();
				// never keep a reference to a closed writer
				multiwriter = null;
			}
			pagecount++;

			// getAbsoluteFile() gives a parent even when outfile is a bare file name
			File multiDir = new File(outfile.getAbsoluteFile().getParentFile(), "multi");
			if (!multiDir.mkdirs() && !multiDir.isDirectory()) {
				throw new IOException("cannot create directory " + multiDir);
			}

			// "text.html" -> "text_1.html", "text_2.html", ...
			String name = outfile.getName();
			int dot = name.lastIndexOf(".");
			String basename = (dot >= 0) ? name.substring(0, dot) : name;
			File f = new File(multiDir, basename + "_" + pagecount + ".html");

			// open the file first so that a failure leaves the result lists untouched
			multiwriter = new OutputStreamWriter(new FileOutputStream(f), "UTF-8");
			pages.add(f);
			// placeholder, replaced by the id of the first word written in this page
			idxstart.add(wordid);
			firstWord = true;

			multiwriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			multiwriter.write("<html>");
			multiwriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
			multiwriter.write("<head>");
			multiwriter.write("<title>BVH Edition - Page "+pagecount+"</title>");
			multiwriter.write("</head>");
			multiwriter.write("<body>");

			return true;
		} catch (Exception e) {
			System.out.println("Failed to create edition page " + pagecount + ": " + e);
			return false;
		}
	}

	/**
	 * Opens the single-file edition and the first page file.
	 *
	 * @param outfile the outfile
	 * @return true, if both files could be created
	 */
	private boolean createOutput(File outfile) {
		try {
			this.outfile = outfile;
			writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
			return createNextOutput();
		} catch (Exception e) {
			System.out.println("Failed to create edition file " + outfile + ": " + e);
			return false;
		}
	}

	/**
	 * Gets the page files.
	 *
	 * @return the generated edition page files
	 */
	public ArrayList<File> getPageFiles() {
		return pages;
	}

	/**
	 * Gets the idx.
	 *
	 * @return for each page file, the id of its first word
	 */
	public ArrayList<String> getIdx() {
		return idxstart;
	}

	/**
	 * Reads the whole input and writes the edition, then closes the outputs
	 * and the input.
	 */
	void process()
	{
		writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
		writer.write("<html>");
		writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
		writer.write("<head>");
		writer.write("<title>BVH edition</title>");
		writer.write("</head>");
		writer.write("<body>");

		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					startElement(parser.getLocalName());
					break;

				case XMLStreamConstants.END_ELEMENT:
					endElement(parser.getLocalName());
					break;

				case XMLStreamConstants.CHARACTERS:
					String text = parser.getText();
					if (flagform)
						wordvalue += text;
					if (flaginterp)
						interpvalue += text;
					break;
			}
		}

		writer.write("</body>");
		writer.write("</html>");
		writer.close();
		multiwriter.write("</body>");
		multiwriter.write("</html>");
		multiwriter.close();
		parser.close();
		inputData.close();
	}

	/** Handles the opening tag of the element named localname. */
	private void startElement(String localname)
	{
		switch (localname) {
			case "head":
				writeBoth("<h2>\n");
				break;

			case "lg":
				writeBoth("<p>\n");
				break;

			case "pb":
				if (!createNextOutput()) {
					throw new IOException("Cannot create edition page " + pagecount + " of " + outfile);
				}
				break;

			case "l":
				writeBoth("<br/>\n");
				break;

			case "p":
			case "q":
				writeBoth("<p>\n");
				break;

			case "w":
				wordid = parser.getAttributeValue(null, "id");
				// a word without form or annotation must not inherit those of the previous word
				wordvalue = "";
				interpvalue = "";
				if (firstWord) {
					firstWord = false;
					// this word is the first one of the current (= last created) page
					if (!idxstart.isEmpty())
						idxstart.set(idxstart.size() - 1, wordid);
				}
				wordcount++;
				if (wordcount >= wordmax) {
					// only the counter is reset here: pages are split at <pb> tags
					wordcount = 0;
				}
				break;

			case "ana":
				flaginterp = true;
				String type = parser.getAttributeValue(null, "type");
				// drop the first character of the type (the leading '#' of "#pos")
				String property = (type != null && type.length() > 0) ? type.substring(1) : "";
				interpvalue += " " + property + ":";
				break;

			case "form":
				wordvalue = "";
				interpvalue = "";
				flagform = true;
				break;
		}
	}

	/** Handles the closing tag of the element named localname. */
	private void endElement(String localname)
	{
		switch (localname) {
			case "head":
				writeBoth("</h2>\n");
				break;

			case "lg":
				writeBoth("</p>\n");
				break;

			case "lb":
				writeBoth("<br/>\n");
				break;

			case "p":
			case "q":
				writeBoth("</p>\n");
				break;

			case "form":
				flagform = false;
				break;

			case "ana":
				flaginterp = false;
				break;

			case "w":
				writeWord();
				break;
		}
	}

	/**
	 * Writes the current word as a &lt;span&gt; carrying its id and its
	 * annotations, preceded by a space unless a spacing rule forbids it.
	 */
	private void writeWord()
	{
		String endOfLastWord = lastword.isEmpty() ? "" : lastword.substring(lastword.length() - 1);

		boolean glued = NoSpaceBefore.contains(wordvalue) ||
				NoSpaceAfter.contains(lastword) ||
				wordvalue.startsWith("-") ||
				NoSpaceAfter.contains(endOfLastWord);

		String span = (glued ? "" : " ") +
				"<span title=\"" + escapeAttribute(interpvalue) + "\" id=\"" + escapeAttribute(wordid) + "\">" +
				escapeText(wordvalue) + "</span>";
		writeBoth(span);
		lastword = wordvalue;
	}

	/** Writes content to the single-file edition and to the current page file. */
	private void writeBoth(String content)
	{
		writer.write(content);
		multiwriter.write(content);
	}

	/** Escapes the characters that cannot appear as such in element content. */
	private static String escapeText(String value)
	{
		if (value == null)
			return "";
		// '&' must be replaced first, otherwise the entities written below would be escaped again
		return value.replace("&", "&amp;").replace("<", "&lt;");
	}

	/** Escapes the characters that cannot appear as such in a double-quoted attribute value. */
	private static String escapeAttribute(String value)
	{
		return escapeText(value).replace("\"", "&quot;");
	}

	/** Closes resource if it is not null, ignoring any failure: used for cleanup only. */
	private static void closeQuietly(def resource)
	{
		try {
			if (resource != null)
				resource.close();
		} catch (Exception ignored) {
			// nothing more can be done for a resource that fails to close during cleanup
		}
	}
}