Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xml / pager_old.groovy @ 479

History | View | Annotate | Download (9.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.xml;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds a simple paginated HTML edition from an XML-TEI file.
 *
 * The input is read with a StAX parser. Each {@code <w>} element is rendered
 * as a {@code <span>} carrying the word id and its annotations. A new page
 * file is started whenever the element named by {@code editionPage} is met
 * or when the per-page word counter reaches {@code wordmax}.
 *
 * All the work is done by the constructor: once it returns, the page files
 * are available through {@link #getPageFiles()} and the id of the first word
 * of each page through {@link #getIdx()}.
 *
 * @author mdecorde
 */
class pager_old {
	/** Word forms that are written without a leading space. */
	List<String> NoSpaceBefore;

	/** Word forms (or final characters) after which the next word is written without a leading space. */
	List<String> NoSpaceAfter;

	/** Number of words counted since the current page was opened. */
	int wordcount = 0;

	/** Number of pages created so far; also the number of the current page. */
	int pagecount = 0;

	/** A page break is triggered when {@code wordcount} reaches this value. */
	int wordmax = 0;

	/** Name written (upper-cased) in the title of every page. */
	String basename = "";

	/** Prefix of the page file names: {@code txtname_N.html}. */
	String txtname = "";

	/** Directory in which the page files are created. */
	File outdir;

	/** Value of the {@code id} attribute of the current {@code <w>} element. */
	String wordid;

	/** True while no word has been met yet on the current page. */
	boolean firstWord = true;

	/** Text accumulated from the {@code <form>} element of the current word. */
	String wordvalue;

	/** Annotations accumulated from the {@code <ana>} elements of the current word. */
	String interpvalue;

	/** Form of the previously written word, used to decide about spacing. */
	String lastword = " ";

	/** Not used by this class. */
	String wordtype;

	/** True while the parser is inside a {@code <form>} element. */
	boolean flagform = false;

	/** True while the parser is inside an {@code <ana>} element. */
	boolean flaginterp = false;

	/** URL of the input file. */
	private def url;

	/** Stream opened on the input file. */
	private def inputData;

	/** StAX factory used to create the parser. */
	private def factory;

	/** StAX parser reading the input file. */
	private XMLStreamReader parser;

	/** Not used by this class. */
	OutputStreamWriter writer;

	/** Writer of the page currently being built; null when no page is open. */
	OutputStreamWriter multiwriter = null;

	/** The input file. */
	File infile;

	/** Not set by this class. */
	File outfile;

	/** Page files, in creation order. */
	ArrayList<File> pages = new ArrayList<File>();

	/** For each page, the id of its first word ("w_0" when the page holds no word). */
	ArrayList<String> idxstart = new ArrayList<String>();

	/** Local name of the element that forces a page break. */
	String editionPage;

	/**
	 * Instantiates a new pager and immediately builds all the pages.
	 *
	 * @param infile the XML-TEI file to read
	 * @param outdir the directory receiving the page files
	 * @param txtname the prefix of the page file names
	 * @param NoSpaceBefore the word forms written without a leading space
	 * @param NoSpaceAfter the word forms (or final characters) not followed by a space
	 * @param max the word count that triggers a page break
	 * @param basename the name written in the page titles
	 * @param editionPage the local name of the element forcing a page break
	 */
	pager_old(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
	List<String> NoSpaceAfter, int max, String basename, String editionPage) {
		this.editionPage = editionPage;
		this.basename = basename;
		this.txtname = txtname;
		this.outdir = outdir;
		this.wordmax = max;
		this.NoSpaceBefore = NoSpaceBefore;
		this.NoSpaceAfter = NoSpaceAfter;
		this.url = infile.toURI().toURL();
		this.infile = infile;

		inputData = url.openStream();
		try {
			factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);
			process();
		} finally {
			// the input is fully consumed (or unusable) at this point: release it
			closeInput();
		}
	}

	/**
	 * Closes the parser and the input stream, reporting but not propagating
	 * failures. The stream is closed explicitly because closing the parser
	 * does not close the underlying input source.
	 */
	private void closeInput() {
		try {
			if (parser != null) parser.close();
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
		}
		try {
			if (inputData != null) inputData.close();
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
		}
	}

	/**
	 * Escapes the characters that would break the generated markup.
	 *
	 * @param value the raw text, may be null
	 * @return the text with {@code &}, {@code "} and {@code <} escaped; "" for null
	 */
	private static String escapeXml(String value) {
		if (value == null) return "";
		// '&' must be replaced first, otherwise the other entities would be re-escaped
		return value.replace("&", "&amp;").replace("\"", "&quot;").replace("<", "&lt;");
	}

	/**
	 * Terminates and closes the current page, if any. A page on which no
	 * word was met receives an empty "w_0" anchor, which is also registered
	 * as the page's first word id.
	 */
	private void closeMultiWriter() {
		if (multiwriter == null) return;

		if (firstWord) { // there was no words
			this.idxstart.add("w_0");
			multiwriter.write("<span id=\"w_0\"/>");
		}
		multiwriter.write("</body>");
		multiwriter.write("</html>");
		multiwriter.close();
		multiwriter = null; // makes a second call a no-op
	}

	/**
	 * Closes the current page and opens the next one: resets the word
	 * counter, registers the new page file and writes the HTML preamble.
	 *
	 * @return true, if successful; false if the page could not be created
	 *         (the cause is printed on the standard output)
	 */
	private boolean createNextOutput() {
		wordcount = 0;
		try {
			closeMultiWriter();
			File pageFile = new File(outdir, txtname+"_"+(++pagecount)+".html");
			pages.add(pageFile);
			firstWord = true; // waiting for next word

			multiwriter = new OutputStreamWriter(new FileOutputStream(pageFile), "UTF-8");

			multiwriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			multiwriter.write("<html>");
			multiwriter.write("<head>");
			multiwriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
			multiwriter.write("<title>"+escapeXml(basename.toUpperCase())+" Edition - Page "+pagecount+"</title>");
			multiwriter.write("</head>");
			multiwriter.write("<body>");
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Opens the next page and fails loudly when it cannot be created, so
	 * that later writes do not hit a missing or closed writer.
	 */
	private void openNextPage() {
		if (!createNextOutput()) {
			throw new IOException("Could not create page "+pagecount+" of "+txtname+" in "+outdir);
		}
	}

	/**
	 * Creates the output. Same as {@link #createNextOutput()}.
	 *
	 * @return true, if successful
	 */
	private boolean createOutput() {
		return createNextOutput();
	}

	/**
	 * Gets the page files.
	 *
	 * @return the page files, in creation order
	 */
	public ArrayList<File> getPageFiles() {
		return pages;
	}

	/**
	 * Gets the idx.
	 *
	 * @return for each page, the id of its first word
	 */
	public ArrayList<String> getIdx() {
		return idxstart;
	}

	/**
	 * Reads the whole input and writes the pages. If anything fails, the
	 * page being written is closed before the failure is propagated.
	 */
	void process() {
		try {
			openNextPage();
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						onStartElement(parser.getLocalName());
						break;
					case XMLStreamConstants.END_ELEMENT:
						onEndElement(parser.getLocalName());
						break;
					case XMLStreamConstants.CHARACTERS:
						onCharacters();
						break;
				}
			}
			closeMultiWriter();
		} catch (Exception e) {
			// do not leak the handle of the page being written
			if (multiwriter != null) {
				try {
					multiwriter.close();
				} catch (IOException ignored) {
					// the original failure is the one worth reporting
				}
				multiwriter = null;
			}
			throw e;
		}
	}

	/**
	 * Handles the opening tag of an element. The cases are tested in order,
	 * so a fixed element name takes precedence over {@code editionPage}.
	 *
	 * @param localname the local name of the element
	 */
	private void onStartElement(String localname) {
		switch (localname) {
			case "text":
				writeTextHeader();
				break;
			case "head":
				multiwriter.write("<h2>\n");
				break;
			case "graphic":
				writeGraphic();
				break;
			case "lg":
			case "p":
			case "q":
				writeParagraphStart();
				break;
			case editionPage:
				openNextPage();
				String pageNumber = parser.getAttributeValue(null, "n");
				if (pageNumber != null) {
					multiwriter.write("<p style=\"color:red\" align=\"center\">- "+escapeXml(pageNumber)+" -</p>\n");
				}
				break;
			case "lb":
			case "br":
				multiwriter.write("<br/>\n");
				break;
			case "w":
				startWord();
				break;
			case "ana":
				startAnnotation();
				break;
			case "form":
				wordvalue = "";
				interpvalue = "";
				flagform = true;
				break;
		}
	}

	/** Writes the id of the {@code <text>} element as a heading and all its attributes as a table. */
	private void writeTextHeader() {
		String textId = parser.getAttributeValue(null, "id");
		if (textId != null) {
			multiwriter.write("<h3>"+escapeXml(textId)+"</h3>\n");
		}
		multiwriter.write("<table>\n");
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			multiwriter.write("<tr>");
			multiwriter.write("<td>"+escapeXml(parser.getAttributeLocalName(i))+": </td>");
			multiwriter.write("<td>"+escapeXml(parser.getAttributeValue(i))+"</td>");
			multiwriter.write("</tr>\n");
		}
		multiwriter.write("</table><br/>\n");
	}

	/** Writes an image for a {@code <graphic>} element that has a {@code url} attribute. */
	private void writeGraphic() {
		String imageUrl = parser.getAttributeValue(null, "url");
		if (imageUrl != null) {
			multiwriter.write("<div><img src=\""+escapeXml(imageUrl)+"\"/></div>");
		}
	}

	/** Opens a paragraph whose class is the {@code rend} attribute, "normal" when absent. */
	private void writeParagraphStart() {
		String rend = parser.getAttributeValue(null, "rend");
		if (rend == null) rend = "normal";
		multiwriter.write("<p class=\""+escapeXml(rend)+"\">\n");
	}

	/**
	 * Registers the opening of a {@code <w>} element. The page break is
	 * decided before the first-word bookkeeping: the word is written when
	 * its closing tag is met, that is on the new page, so it is the new
	 * page that must be indexed with its id.
	 */
	private void startWord() {
		wordid = parser.getAttributeValue(null, "id");
		wordcount++;
		if (wordcount >= wordmax) {
			openNextPage();
		}
		if (firstWord) {
			firstWord = false;
			this.idxstart.add(wordid);
		}
	}

	/**
	 * Registers the opening of an {@code <ana>} element: appends the
	 * annotation type, without its first character, to the annotations of
	 * the current word. A missing or empty type yields an empty label.
	 */
	private void startAnnotation() {
		flaginterp = true;
		String type = parser.getAttributeValue(null, "type");
		String label = (type != null && type.length() > 0) ? type.substring(1) : "";
		if (interpvalue == null) interpvalue = ""; // <ana> met before any <form>
		interpvalue += " "+label+":";
	}

	/**
	 * Handles the closing tag of an element.
	 *
	 * @param localname the local name of the element
	 */
	private void onEndElement(String localname) {
		switch (localname) {
			case "head":
				multiwriter.write("</h2>\n");
				break;
			case "lg":
			case "p":
			case "q":
				multiwriter.write("</p>\n");
				break;
			case "form":
				flagform = false;
				break;
			case "ana":
				flaginterp = false;
				break;
			case "w":
				writeWord();
				break;
		}
	}

	/**
	 * Writes the current word as a {@code <span>} whose title holds the
	 * annotations. The span is preceded by a space unless the word is listed
	 * in {@code NoSpaceBefore}, starts with "-", or the previous word (or its
	 * last character) is listed in {@code NoSpaceAfter}.
	 */
	private void writeWord() {
		// a <w> without <form>/<ana> leaves these fields null
		String form = (wordvalue != null) ? wordvalue : "";
		String interp = (interpvalue != null) ? interpvalue : "";

		int l = lastword.length();
		String endOfLastWord = (l > 0) ? lastword.substring(l - 1) : "";

		boolean glued = NoSpaceBefore.contains(form) ||
				NoSpaceAfter.contains(lastword) ||
				form.startsWith("-") ||
				NoSpaceAfter.contains(endOfLastWord);
		if (!glued) {
			multiwriter.write(" ");
		}
		multiwriter.write("<span title=\""+escapeXml(interp)+"\" id=\""+escapeXml(wordid)+"\">");
		multiwriter.write(escapeXml(form)+"</span>");
		lastword = form;
	}

	/** Appends the current text to the word form and/or annotations being read. */
	private void onCharacters() {
		if (!flagform && !flaginterp) return;

		String text = parser.getText();
		if (text.length() == 0) return;

		if (flagform) wordvalue += text;
		if (flaginterp) interpvalue += text;
	}
}