Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / lasla / pager.groovy @ 187

History | View | Annotate | Download (9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (Tue, 24 Jan 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.lasla
29

    
30
;
31

    
32
import org.txm.importer.*;
33
import org.xml.sax.Attributes;
34
import org.txm.importer.filters.*;
35
import java.io.File;
36
import java.io.IOException;
37
import java.util.ArrayList;
38
import javax.xml.parsers.SAXParserFactory;
39
import javax.xml.parsers.ParserConfigurationException;
40
import javax.xml.parsers.SAXParser;
41
import javax.xml.stream.*;
42
import java.net.URL;
43
import org.xml.sax.InputSource;
44
import org.xml.sax.helpers.DefaultHandler;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
 * Builds a simple HTML edition from an XML-TEI file.
 *
 * The input is read with a StAX parser and written twice: once as a single
 * HTML file (the "outfile") and once as a series of page files stored in a
 * "multi" directory next to the outfile. A new page is started whenever the
 * word counter reaches the configured maximum. The whole conversion runs
 * from the constructor; the results are then available through
 * {@link #getPageFiles()} and {@link #getIdx()}.
 *
 * @author mdecorde
 */
class pager {
    /** Word forms that are written without a space before them. */
    List<String> NoSpaceBefore;

    /** Word forms (or final characters) after which the next word is written without a space. */
    List<String> NoSpaceAfter;

    /** Number of "w" elements seen since the counter was last reset at a page break. */
    int wordcount = 0;

    /** Number of page files created so far. */
    int pagecount = 0;

    /** Word counter value at which a new page file is started. */
    int wordmax = 0;

    /** Name written, upper-cased, in the title of every generated HTML file. */
    String basename = "";

    /** Value of the "id" attribute of the current "w" element. */
    String wordid;

    /** True until the first "w" element has been seen. */
    boolean firstWord = true;

    /** Text accumulated inside the current "form" element ("&" and "<" already escaped). */
    String wordvalue;

    /** Raw "type:" labels and text accumulated from the "ana" elements of the current word. */
    String interpvalue;

    /** Value of the previously written word; used to decide whether a space is needed. */
    String lastword = " ";

    /** Not used by the code of this class. */
    String wordtype;

    /** True while the parser is inside a "form" element. */
    boolean flagform = false;

    /** True while the parser is inside an "ana" element. */
    boolean flaginterp = false;

    /** URL of the input file. */
    private def url;

    /** Stream opened on the input file. */
    private def inputData;

    /** StAX factory used to create the parser. */
    private def factory;

    /** StAX parser reading the input file. */
    private XMLStreamReader parser;

    /** Writer of the single-file edition. */
    OutputStreamWriter writer;

    /** Writer of the page currently being produced, or null when no page is open. */
    OutputStreamWriter multiwriter = null;

    /** The input file. */
    File infile;

    /** The single-file edition; also gives the directory and name prefix of the page files. */
    File outfile;

    /** The page files, in creation order. */
    ArrayList<File> pages = new ArrayList<File>();

    /** For each page of {@link #pages}, the id of the first word it contains. */
    ArrayList<String> idxstart = new ArrayList<String>();

    /**
     * Instantiates a new pager and immediately converts the input file.
     *
     * @param infile the XML file to read
     * @param outfile the single-file HTML edition to write
     * @param NoSpaceBefore the word forms written without a preceding space
     * @param NoSpaceAfter the word forms after which no space is written
     * @param max the word counter value that triggers a new page
     * @param basename the name used in the HTML titles
     * @throws IOException if the output files cannot be created or written
     */
    pager(File infile, File outfile, List<String> NoSpaceBefore,
            List<String> NoSpaceAfter, int max, String basename) {
        this.basename = basename;
        this.wordmax = max;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;

        inputData = url.openStream();
        boolean ready = false;
        try {
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
            // Fail with an explicit message instead of a later NullPointerException
            // on a writer that could not be opened.
            if (!createOutput(outfile)) {
                throw new IOException("Could not create the edition output files for " + outfile);
            }
            ready = true;
        } finally {
            if (!ready) {
                releaseResources();
            }
        }
        process();
    }

    /**
     * Closes the current page, if any, and opens the next one.
     *
     * The page file is created in the "multi" directory next to the outfile
     * (the directory is created when missing) and is registered in
     * {@link #pages}, together with the current word id in {@link #idxstart}.
     *
     * @return true, if successful
     */
    private boolean createNextOutput() {
        try {
            if (multiwriter != null) {
                multiwriter.write("</body>");
                multiwriter.write("</html>");
                multiwriter.close();
                // Never keep a reference to a closed writer if the next page cannot be opened.
                multiwriter = null;
            }
            pagecount++;

            String pagesDir = outfile.getParent() + "/multi/";
            new File(pagesDir).mkdirs(); // no-op when the directory already exists

            // The last 5 characters of the name are dropped: the outfile is
            // expected to carry a ".html" extension.
            String name = outfile.getName();
            File f = new File(pagesDir, name.substring(0, name.length() - 5) + "_" + pagecount + ".html");

            multiwriter = new OutputStreamWriter(new FileOutputStream(f), "UTF-8");
            pages.add(f);
            idxstart.add(wordid);

            multiwriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
            multiwriter.write("<html>");
            multiwriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
            multiwriter.write("<head>");
            multiwriter.write("<title>" + basename.toUpperCase() + " Edition - Page " + pagecount + "</title>");
            multiwriter.write("</head>");
            multiwriter.write("<body>");

            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Opens the single-file edition and the first page.
     *
     * @param outfile the outfile
     * @return true, if both the outfile and the first page could be opened
     */
    private boolean createOutput(File outfile) {
        try {
            this.outfile = outfile;
            writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
            return createNextOutput();
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Gets the page files.
     *
     * @return the page files, in creation order
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the id of the first word of each page.
     *
     * @return the word ids, in the same order as the page files
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Reads the whole input and writes the single-file edition and the pages.
     *
     * All streams are closed when this method returns, whether it succeeds or not.
     *
     * @throws IOException if an output file cannot be created or written
     */
    void process() {
        try {
            writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
            writer.write("<html>");
            writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
            writer.write("<head>");
            writer.write("<title>" + basename.toUpperCase() + " edition</title>");
            writer.write("</head>");
            writer.write("<body>");

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        startElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        endElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        characters(parser.getText());
                        break;
                }
            }

            writer.write("</body>");
            writer.write("</html>");
            writer.close();
            multiwriter.write("</body>");
            multiwriter.write("</html>");
            multiwriter.close();
            parser.close();
            inputData.close();
        } finally {
            // On the normal path everything is already closed and this is a no-op;
            // on failure it prevents leaking the file handles.
            releaseResources();
        }
    }

    /** Handles the opening tag of the element named localname. */
    private void startElement(String localname) {
        switch (localname) {
            case "head":
                writeBoth("<h2>\n");
                break;

            case "lg":
            case "p":
            case "q":
                writeBoth("<p>\n");
                break;

            case "lb":
            case "br":
                // NOTE(review): "lb" also writes a break on its end tag, so each
                // "lb" yields two <br/>; kept as is, confirm that it is intended.
                writeBoth("<br/>\n");
                break;

            case "w":
                wordid = parser.getAttributeValue(null, "id");
                if (firstWord) {
                    // The first page was opened before any word was read.
                    firstWord = false;
                    idxstart.set(0, wordid);
                }
                wordcount++;
                if (wordcount >= wordmax) {
                    if (!createNextOutput()) {
                        throw new IOException("Could not create edition page " + pagecount + " for " + outfile);
                    }
                    wordcount = 0;
                }
                break;

            case "ana":
                flaginterp = true;
                // The first character of the type is dropped (presumably a marker such as "#").
                interpvalue += " " + parser.getAttributeValue(null, "type").substring(1) + ":";
                break;

            case "form":
                wordvalue = "";
                interpvalue = "";
                flagform = true;
                break;
        }
    }

    /** Handles the closing tag of the element named localname. */
    private void endElement(String localname) {
        switch (localname) {
            case "head":
                writeBoth("</h2>\n");
                break;

            case "lg":
            case "p":
            case "q":
                writeBoth("</p>\n");
                break;

            case "lb":
                writeBoth("<br/>\n");
                break;

            case "pb":
                writeBoth("<br/><br/>\n");
                break;

            case "form":
                flagform = false;
                break;

            case "ana":
                flaginterp = false;
                break;

            case "w":
                writeWord();
                break;
        }
    }

    /** Appends text to the word form and/or the interpretation being read. */
    private void characters(String text) {
        if (text.length() == 0) {
            return;
        }
        if (flagform) {
            // NOTE(review): the form is stored escaped, so the NoSpaceBefore /
            // NoSpaceAfter lookups see "&amp;" rather than "&"; kept as is.
            wordvalue += text.replace("&", "&amp;").replace("<", "&lt;");
        }
        if (flaginterp) {
            // Kept raw here: it is escaped exactly once, when the word is written.
            interpvalue += text;
        }
    }

    /** Writes the current word as a span carrying its interpretation and id. */
    private void writeWord() {
        int l = lastword.length();
        String endOfLastWord = (l > 0) ? lastword.substring(l - 1) : "";

        String title = interpvalue;
        if (title != null) {
            title = title.replace("&", "&amp;").replace("\"", "&quot;").replace("<", "&lt;");
        }

        boolean glued = NoSpaceBefore.contains(wordvalue) ||
                NoSpaceAfter.contains(lastword) ||
                wordvalue.startsWith("-") ||
                NoSpaceAfter.contains(endOfLastWord);

        String separator = glued ? "" : " ";
        writeBoth(separator + "<span title=\"" + title + "\" id=\"" + wordid + "\">");
        writeBoth(wordvalue + "</span>");
        lastword = wordvalue;
    }

    /** Writes the same markup to the single-file edition and to the current page. */
    private void writeBoth(String markup) {
        writer.write(markup);
        multiwriter.write(markup);
    }

    /** Closes every stream held by this pager, ignoring errors. */
    private void releaseResources() {
        closeQuietly(writer);
        closeQuietly(multiwriter);
        closeQuietly(parser);
        closeQuietly(inputData);
    }

    /** Closes resource if it is not null; a failure here must not mask the original error. */
    private static void closeQuietly(def resource) {
        try {
            if (resource != null) {
                resource.close();
            }
        } catch (Exception ignored) {
            // Best-effort cleanup only.
        }
    }
}