Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / perrault / pager.groovy @ 479

History | View | Annotate | Download (8.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.perrault;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds the HTML edition of a Perrault XML-TEI file.
 *
 * The input is streamed with StAX and rendered twice: once into a single
 * output file, and once into a series of page files written to the
 * "multi" sub-directory next to the output file. A new page file is
 * started every time the word counter reaches {@code wordmax}.
 *
 * All the work is done by the constructor; afterwards the page files and
 * the id of the first word of each page are available through
 * {@link #getPageFiles()} and {@link #getIdx()}.
 *
 * @author mdecorde
 */
class pager {
    /** Word forms that are written without a leading space. */
    List<String> NoSpaceBefore;

    /**
     * Words, or last characters of words, after which the next word is
     * written without a leading space.
     */
    List<String> NoSpaceAfter;

    /** Number of words seen since the current page file was opened. */
    int wordcount = 0;

    /** Number of page files opened so far. */
    int pagecount = 0;

    /** Word count at which a new page file is started. */
    int wordmax = 0;

    /** Value of the "id" attribute of the word element being processed. */
    String wordid;

    /** True until the first word element has been read. */
    boolean firstWord = true;

    /**
     * Text accumulated for the current word form. Starts empty (not null) so
     * that a word element without a form element cannot cause a
     * NullPointerException when the word is written.
     */
    String wordvalue = "";

    /**
     * Annotation text accumulated for the current word. Starts empty (not
     * null) so that an annotation met before any form does not produce the
     * literal text "null".
     */
    String interpvalue = "";

    /** Form of the previously written word. */
    String lastword = " ";

    /** The wordtype (neither read nor written by this class). */
    String wordtype;

    /** True while the parser is inside a form element. */
    boolean flagform = false;

    /** True while the parser is inside an ana element. */
    boolean flaginterp = false;

    /** URL of the input file. */
    private def url;

    /** Stream opened on the input file. */
    private def inputData;

    /** StAX factory used to create the parser. */
    private def factory;

    /** StAX parser reading the input file. */
    private XMLStreamReader parser;

    /** Writer of the single-file edition. */
    def writer;

    /** Writer of the page file currently being filled. */
    def multiwriter = null;

    /** The input file. */
    File infile;

    /** The single-file edition output file. */
    File outfile;

    /** The page files, in creation order. */
    ArrayList<File> pages = new ArrayList<File>();

    /** For each page file, the id of its first word. */
    ArrayList<String> idxstart = new ArrayList<String>();

    /**
     * Instantiates a new pager and immediately builds the edition.
     *
     * @param infile the XML-TEI input file
     * @param outfile the single-file edition to write
     * @param NoSpaceBefore word forms written without a leading space
     * @param NoSpaceAfter words or final characters after which no space is written
     * @param max the word count at which a new page file is started
     * @throws IOException if the output files cannot be created
     */
    pager(File infile, File outfile, List<String> NoSpaceBefore,
            List<String> NoSpaceAfter, int max) {
        this.wordmax = max;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;

        try {
            inputData = url.openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
            // Fail here with a clear message instead of letting process()
            // hit a null or closed writer later on.
            if (!createOutput(outfile)) {
                throw new IOException("Could not create the edition output files for " + outfile);
            }
            process();
        } finally {
            // Idempotent: process() already releases everything on its own
            // path, this covers failures that happen before it is reached.
            releaseResources();
        }
    }

    /**
     * Closes the current page file, if any, and opens the next one.
     *
     * The new file is registered in {@code pages} and the current word id
     * in {@code idxstart} before the file is opened.
     *
     * @return true, if successful; false if any exception occurred (its
     *         message is printed to standard output)
     */
    private boolean createNextOutput() {
        try {
            if (multiwriter != null) {
                writePageEnd(multiwriter);
                multiwriter.close();
            }
            pagecount++;
            String name = outfile.getName();
            // NOTE(review): drops the last 6 characters of the file name,
            // presumably a ".xhtml" extension - confirm against the caller.
            File f = new File(outfile.getParent() + "/multi/",
                    name.substring(0, name.length() - 6) + "_" + pagecount + ".html");
            pages.add(f);
            idxstart.add(wordid);
            multiwriter = new OutputStreamWriter(new FileOutputStream(f), "UTF-8");

            writePageStart(multiwriter, "Perrault Edition - Page " + pagecount);
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Opens the single-file edition and the first page file.
     *
     * @param outfile the outfile
     * @return true, if both files could be opened
     */
    private boolean createOutput(File outfile) {
        try {
            this.outfile = outfile;
            writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8");
            // Propagate a failure of the first page instead of discarding it.
            return createNextOutput();
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Gets the page files.
     *
     * @return the page files
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the id of the first word of each page.
     *
     * @return the idx
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Reads the whole input and writes the edition to both outputs.
     *
     * The writers, the parser and the input stream are closed when this
     * method returns, whether it succeeds or throws.
     */
    void process() {
        try {
            writePageStart(writer, "Perrault Edition");

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        handleStartElement(parser.getLocalName());
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        handleEndElement(parser.getLocalName());
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        if (flagform || flaginterp) {
                            String text = parser.getText();
                            if (text.length() > 0) {
                                if (flagform)
                                    wordvalue += text;
                                if (flaginterp)
                                    interpvalue += text;
                            }
                        }
                        break;
                }
            }

            // Explicit closes on the success path so that a failing final
            // flush is reported rather than swallowed by the cleanup below.
            writePageEnd(writer);
            writer.close();
            writePageEnd(multiwriter);
            multiwriter.close();
        } finally {
            releaseResources();
        }
    }

    /** Reacts to the opening tag of the element named {@code localname}. */
    private void handleStartElement(String localname) {
        switch (localname) {
            case "text":
                XPathResult testG = new XPathResult(infile);
                // NOTE(review): the predicate [type='sub'] tests a child
                // element named "type"; an attribute test would be
                // [@type='sub'] - confirm which one is intended.
                String title = testG.getXpathResponse("//tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[type='sub']", "");
                emit("<h1>" + escapeXml(title) + "</h1>\n");
                break;

            case "head":
                emit("<h2>\n");
                break;

            case "lg":
            case "p":
            case "q":
                emit("<p>\n");
                break;

            case "w":
                wordid = parser.getAttributeValue(null, "id");
                if (firstWord) {
                    firstWord = false;
                    // The first page was opened before any word was read,
                    // so its start id is filled in now.
                    idxstart.set(0, wordid);
                }
                wordcount++;
                if (wordcount >= wordmax) {
                    if (!createNextOutput()) {
                        throw new IOException("Could not create page file " + pagecount + " for " + outfile);
                    }
                    wordcount = 0;
                }
                break;

            case "ana":
                flaginterp = true;
                interpvalue += " " + parser.getAttributeValue(null, "type") + ":";
                break;

            case "form":
                wordvalue = "";
                interpvalue = "";
                flagform = true;
                break;
        }
    }

    /** Reacts to the closing tag of the element named {@code localname}. */
    private void handleEndElement(String localname) {
        switch (localname) {
            case "head":
                emit("</h2>\n");
                break;

            case "lg":
            case "p":
            case "q":
                emit("</p>\n");
                break;

            case "l":
                emit("<br/>\n");
                break;

            case "form":
                flagform = false;
                break;

            case "ana":
                flaginterp = false;
                break;

            case "w":
                writeWord();
                break;
        }
    }

    /**
     * Writes the current word as a span carrying its annotations as title
     * and its id, preceded by a space unless the spacing lists forbid it.
     */
    private void writeWord() {
        int l = lastword.length();
        String endOfLastWord = (l > 0) ? lastword.substring(l - 1) : "";

        boolean glued = NoSpaceBefore.contains(wordvalue) ||
                NoSpaceAfter.contains(lastword) ||
                wordvalue.startsWith("-") ||
                NoSpaceAfter.contains(endOfLastWord);

        // Parser text is already decoded, so every value must be escaped
        // again before being written into markup.
        emit((glued ? "" : " ") +
                "<span title=\"" + escapeXml(interpvalue) + "\" id=\"" + escapeXml(wordid) + "\">" +
                escapeXml(wordvalue) + "</span>");
        lastword = wordvalue;
    }

    /** Writes the same markup to the single-file edition and to the current page. */
    private void emit(String markup) {
        writer.write(markup);
        multiwriter.write(markup);
    }

    /** Writes the XML declaration and the HTML head, and opens the body. */
    private static void writePageStart(def out, String title) {
        out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.write("<html>");
        out.write("<head>");
        out.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
        out.write("<title>" + title + "</title>");
        out.write("</head>");
        out.write("<body>");
    }

    /** Closes the body and html elements. */
    private static void writePageEnd(def out) {
        out.write("</body>");
        out.write("</html>");
    }

    /**
     * Escapes the characters that are special in XML text and attribute
     * values. A null value gives an empty string.
     */
    private static String escapeXml(String value) {
        if (value == null)
            return "";
        // "&" must be replaced first, otherwise the entities produced by
        // the following replacements would be escaped a second time.
        return value.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;");
    }

    /** Closes the writers, the parser and the input stream; safe to call more than once. */
    private void releaseResources() {
        closeQuietly(writer);
        closeQuietly(multiwriter);
        // Closing the stream reader does not close the stream it reads
        // from, so both are closed.
        closeQuietly(parser);
        closeQuietly(inputData);
    }

    /** Closes {@code resource} if it is not null, ignoring any failure. */
    private static void closeQuietly(def resource) {
        if (resource == null)
            return;
        try {
            resource.close();
        } catch (Exception ignored) {
            // Best-effort cleanup: the success path closes the writers
            // explicitly, so anything failing here is secondary to an
            // error that is already propagating.
        }
    }
}