Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / tmx / pager.groovy @ 479

History | View | Annotate | Download (12.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2012-06-01 17:47:31 +0200 (ven., 01 juin 2012) $
25
// $LastChangedRevision: 2185 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.tmx;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/**
 * Builds the paginated HTML edition of one text.
 *
 * The input XML file is read with a StAX {@link XMLStreamReader}; the elements
 * "text", "head", "graphic", "lg", "p", "q", "lb", "br", "tu", "note", "seg",
 * "w", "form", "ana" and the configurable page-break element are translated to
 * XHTML and written to files named <code>&lt;txtname&gt;_&lt;n&gt;.html</code>
 * in the output directory. The whole conversion runs from the constructor.
 *
 * @author mdecorde
 */
class pager {
    /** Word forms that are written without a space in front of them. */
    List<String> NoSpaceBefore;

    /** Word forms (or final characters of a form) after which no space is written. */
    List<String> NoSpaceAfter;

    /** Number of "w" elements read since the current page was opened. */
    int wordcount = 0;

    /** Number of pages created so far; also the numeric suffix of the current page file. */
    int pagecount = 0;

    /** Once a page holds at least this many words, a split is requested at the next "tu". */
    int wordmax = 0;

    /** Base name, written upper-cased in the title of every page. */
    String basename = "";

    /** Text name: prefix of the page file names and default title of the header. */
    String txtname = "";

    /** Directory in which the page files are created. */
    File outdir;

    /** Value of the "id" attribute of the word being read. */
    String wordid;

    /** True until the first word of the current page has been read. */
    boolean firstWord = true;

    /** Text of the "form" element of the word being read. */
    String wordvalue = "";

    /** Annotations of the word being read; written as the tooltip of its span. */
    String interpvalue = "";

    /** Form of the previously written word (used to decide on spacing). */
    String lastword = " ";

    /** Not read or written by this class. */
    String wordtype;

    /** True while the parser is inside a "form" element. */
    boolean flagform = false;

    /** True while the parser is inside an "ana" element. */
    boolean flaginterp = false;

    /** URL of the input file. */
    private def url;

    /** Stream opened on {@link #url}. */
    private def inputData;

    /** StAX factory used to create {@link #parser}. */
    private def factory;

    /** StAX reader over the input file. */
    private XMLStreamReader parser;

    /** Not read or written by this class. */
    OutputStreamWriter writer;

    /** Writer of the page currently being produced, or null before the first page. */
    OutputStreamWriter multiwriter = null;

    /** The XML file to convert. */
    File infile;

    /** Never assigned by this class. */
    File outfile;

    /** The page files, in creation order. */
    ArrayList<File> pages = new ArrayList<File>();

    /** For each page, the id of its first word ("w_0" when the page has no word). */
    ArrayList<String> idxstart = new ArrayList<String>();

    /** Local name of the element that forces a new page. */
    String editionPage;

    /** Ranks (1-based) of the "tu" elements at which a new page starts. */
    ArrayList<Integer> splitTUs; // contains the tu ids used to split pages

    /** True when the current page is full and must be split at the next "tu". */
    boolean shouldSplit = false;

    /** True when the split points are imposed by {@link #splitTUs} instead of being computed. */
    boolean useSplitTUs = false;

    // State of the current parse, shared by the element handlers.
    private String noteText = "";
    private String noteCss = "note";
    private boolean inNote = false;
    private String tuCss = "tu";
    private int tuCounter = 0;

    /**
     * Instantiates a new pager and immediately converts the input file.
     *
     * @param infile the XML file to read
     * @param outdir the directory receiving the HTML pages
     * @param txtname the text name, used as page file prefix and default title
     * @param NoSpaceBefore the forms written without a preceding space
     * @param NoSpaceAfter the forms (or final characters) written without a following space
     * @param max the number of words after which a page split is requested
     * @param basename the base name shown in the page titles
     * @param editionPage the local name of the element that starts a new page
     * @param splitTUs the "tu" ranks at which pages are split; when empty (or null)
     *        the ranks are computed from <code>max</code> and appended to the list
     */
    pager(File infile, File outdir, String txtname, List<String> NoSpaceBefore,
            List<String> NoSpaceAfter, int max, String basename, String editionPage, ArrayList<Integer> splitTUs) {
        this.editionPage = editionPage;
        this.basename = basename;
        this.txtname = txtname;
        this.outdir = outdir;
        this.wordmax = max;
        this.NoSpaceBefore = NoSpaceBefore;
        this.NoSpaceAfter = NoSpaceAfter;
        this.url = infile.toURI().toURL();
        this.infile = infile;
        // A null list used to fail on size(); treat it as "no imposed split point".
        // A non-null list is kept as is because computed split points are appended to it.
        this.splitTUs = (splitTUs != null) ? splitTUs : new ArrayList<Integer>();
        //println "spliting pages with : "+splitTUs
        useSplitTUs = this.splitTUs.size() > 0;
        inputData = url.openStream();
        try {
            // NOTE(review): the factory is used with its default settings (DTD and
            // external entities are not disabled) - confirm the inputs are trusted.
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
            process();
        } finally {
            closeInput();
        }
    }

    /** Releases the StAX reader and the underlying input stream. */
    private void closeInput() {
        try {
            if (parser != null) parser.close();
        } catch (XMLStreamException ignored) {
            // nothing useful can be done; the stream below is still closed
        }
        try {
            if (inputData != null) inputData.close();
        } catch (IOException ignored) {
            // the input was only read, a failure to close loses no data
        }
    }

    /**
     * Escapes a value for use in XHTML text or in a double-quoted attribute.
     *
     * @param value the raw value; null is rendered as "null", like string concatenation does
     * @return the value with &amp;, &lt;, &gt; and &quot; replaced by entities
     */
    private static String escapeXml(Object value) {
        String raw = (value == null) ? "null" : value.toString();
        return raw.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;");
    }

    /**
     * Terminates and closes the current page, if any. A page without any word
     * gets an empty "w_0" anchor so that every page has an entry in the index.
     */
    private void closeMultiWriter() {
        if (multiwriter != null) {
            if (firstWord) { // there was no words
                this.idxstart.add("w_0");
                multiwriter.write("<span id=\"w_0\"/>");
            }
            multiwriter.write("</div>\n");
            multiwriter.write("</body>");
            multiwriter.write("</html>");
            multiwriter.close();
        }
    }

    /**
     * Closes the current page and opens the next one, writing its XHTML preamble.
     * Failures are reported on standard output and signalled by the return value.
     *
     * @return true, if successful
     */
    private boolean createNextOutput() {
        wordcount = 0;
        shouldSplit = false;
        try {
            closeMultiWriter();
            File pageFile = new File(outdir, txtname + "_" + (++pagecount) + ".html");
            pages.add(pageFile);
            firstWord = true; // waiting for next word

            multiwriter = new OutputStreamWriter(new FileOutputStream(pageFile), "UTF-8");

            multiwriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
            multiwriter.write("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"   \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
            multiwriter.write("<html>");
            multiwriter.write("<head>");
            multiwriter.write("<title>" + escapeXml(basename.toUpperCase()) + " " + escapeXml(txtname) + " Edition - Page " + pagecount + "</title>");
            multiwriter.write("<link rel=\"stylesheet\" type=\"text/css\" href=\"tmx.css\"/>");
            multiwriter.write("<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\"/>");
            multiwriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
            multiwriter.write("</head>");
            multiwriter.write("<body>");
            multiwriter.write("<div class=\"tmx\">\n");

            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Creates the output; same as {@link #createNextOutput()}.
     *
     * @return true, if successful
     */
    private boolean createOutput() {
        return createNextOutput();
    }

    /**
     * Gets the page files.
     *
     * @return the page files, in creation order
     */
    public ArrayList<File> getPageFiles() {
        return pages;
    }

    /**
     * Gets the idx.
     *
     * @return for each page, the id of its first word
     */
    public ArrayList<String> getIdx() {
        return idxstart;
    }

    /**
     * Reads the whole input and writes the pages. The last page is terminated
     * when the end of the document is reached; if reading fails, the page being
     * written is closed as is and the failure is propagated.
     */
    void process() {
        noteText = "";
        noteCss = "note";
        inNote = false;
        tuCss = "tu";
        tuCounter = 0;

        boolean completed = false;
        try {
            createNextOutput();
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        startElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        endElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        characters();
                        break;
                }
            }
            closeMultiWriter();
            completed = true;
        } finally {
            if (!completed && multiwriter != null) {
                try {
                    multiwriter.close();
                } catch (IOException ignored) {
                    // the failure that interrupted the parse is the one to report
                }
            }
        }
    }

    /** Translates the start tag the parser is positioned on. */
    private void startElement(String localname) {
        // The order of the cases matters: editionPage only applies when its value
        // is not one of the names tested before it.
        switch (localname) {
            case "text":
                writeTextHeader();
                break;
            case "head":
                multiwriter.write("<h2>\n");
                break;
            case "graphic":
                String imageUrl = parser.getAttributeValue(null, "url");
                //println "URL: "+url
                if (imageUrl != null) multiwriter.write("<div><img src=\"" + escapeXml(imageUrl) + "\"/></div>");
                break;
            case "lg":
            case "p":
            case "q":
                String rend = parser.getAttributeValue(null, "rend");
                if (rend == null) rend = "normal";
                multiwriter.write("<p class=\"" + escapeXml(rend) + "\">\n");
                break;
            //case "pb":
            case editionPage:
                createNextOutput();
                String pageNumber = parser.getAttributeValue(null, "n");
                if (pageNumber != null) {
                    multiwriter.write("<p style=\"color:red\" align=\"center\">- " + escapeXml(pageNumber) + " -</p>\n");
                }
                break;
            case "lb":
            case "br":
                multiwriter.write("<br/>\n");
                break;
            case "tu":
                startTranslationUnit();
                break;
            case "note":
                String declaredCss = firstAttribute("cssClass");
                noteCss = (declaredCss != null) ? declaredCss : "note";
                noteText = "";
                inNote = true;
                break;
            case "seg":
                // a segment inherits the css class declared on its "tu", if any
                String segCss = (tuCss != "tu") ? tuCss : "seg";
                multiwriter.write("<p class=\"" + escapeXml(segCss) + "\">\n");
                break;
            case "w":
                startWord();
                break;
            case "ana":
                flaginterp = true;
                String type = parser.getAttributeValue(null, "type");
                if (type == null) type = "";
                // the type is a reference such as "#pos": drop the leading '#'
                if (type.startsWith("#")) type = type.substring(1);
                interpvalue += " " + type + ":";
                break;
            case "form":
                wordvalue = "";
                interpvalue = "";
                flagform = true;
                break;
        }
    }

    /**
     * Returns the value of the first attribute of the current element whose
     * local name is the given one, or null when there is none.
     */
    private String firstAttribute(String localName) {
        for (int i = 0; i < parser.getAttributeCount(); i++) {
            if (parser.getAttributeLocalName(i) == localName) {
                return parser.getAttributeValue(i);
            }
        }
        return null;
    }

    /**
     * Writes the header block of a "text" element: author, title, subtitle and,
     * when present, translator, taken from the attributes of the element.
     */
    private void writeTextHeader() {
        String title = txtname;
        String subtitle = "";
        String author = "";
        String translator = "";
        for (int i = 0; i < parser.getAttributeCount(); i++) {
            String value = parser.getAttributeValue(i);
            switch (parser.getAttributeLocalName(i)) {
                case "title":
                    title = value;
                    break;
                case "subtitle":
                    subtitle = value;
                    break;
                case "author":
                    author = value;
                    break;
                case "translator":
                    translator = value;
                    break;
            }
        }
        multiwriter.write("<div class=\"header\">");
        multiwriter.write("<span class=\"authorStyle\"> " + escapeXml(author) + " </span>");
        multiwriter.write("<span class=\"titleStyle\"> " + escapeXml(title) + " </span>");
        multiwriter.write("<span class=\"subtitleStyle\"> " + escapeXml(subtitle) + " </span>");
        if (translator.size() > 0) {
            multiwriter.write("<span class=\"translatorStyle\"> " + escapeXml(translator) + " </span>");
        }
        multiwriter.write("</div>");
    }

    /**
     * Handles the start of a "tu": starts a new page when this unit is a split
     * point, then writes the heading of the unit (its "title" attribute, or the
     * list of its attributes when it has no title).
     */
    private void startTranslationUnit() {
        tuCounter++;
        if (useSplitTUs) {
            if (splitTUs.contains(tuCounter)) {
                createNextOutput();
            }
        } else if (shouldSplit) {
            createNextOutput();
            // remember the computed split point in the list received by the constructor
            this.splitTUs << tuCounter;
        }

        String heading = "";
        String title = null;
        tuCss = "tu";
        for (int i = 0; i < parser.getAttributeCount(); i++) {
            String name = parser.getAttributeLocalName(i);
            String value = parser.getAttributeValue(i);
            heading += " " + name + "=" + value;
            if (name.equals("cssClass")) {
                tuCss = value;
            } else if (name.equals("title")) {
                title = value;
            }
        }
        if (title != null) heading = title;
        multiwriter.write("<h5 class=\"" + escapeXml(tuCss) + "Title\">" + escapeXml(heading) + "</h5>\n");
    }

    /** Handles the start of a "w": counts the word and records the first word of the page. */
    private void startWord() {
        wordid = parser.getAttributeValue(null, "id");
        // start from a clean state so that a word without "form" or "ana"
        // never reuses the values of the previous word
        wordvalue = "";
        interpvalue = "";

        wordcount++;
        if (wordcount >= wordmax) {
            // the page is full: it is split at the next "tu", not in the middle of a unit
            shouldSplit = true;
        }

        if (firstWord) {
            firstWord = false;
            this.idxstart.add(wordid);
        }
    }

    /** Translates the end tag the parser is positioned on. */
    private void endElement(String localname) {
        switch (localname) {
            case "head":
                multiwriter.write("</h2>\n");
                break;
            case "lg":
            case "p":
            case "q":
            case "seg":
                multiwriter.write("</p>\n");
                break;
            case "note":
                inNote = false;
                multiwriter.write("<div class=\"" + escapeXml(noteCss) + "\">" + escapeXml(noteText) + "</div>");
                break;
            case "form":
                flagform = false;
                break;
            case "ana":
                flaginterp = false;
                break;
            case "w":
                writeWord();
                break;
        }
    }

    /**
     * Writes the span of the word that just ended. The span is glued to the
     * previous word when the form is listed in NoSpaceBefore or starts with "-",
     * or when the previous form (or its last character) is listed in NoSpaceAfter;
     * otherwise it is preceded by a space.
     */
    private void writeWord() {
        int previousLength = lastword.length();
        String endOfLastWord = (previousLength > 0) ? lastword.substring(previousLength - 1) : "";

        if (interpvalue != null) {
            interpvalue = escapeXml(interpvalue);
        }

        boolean glued = NoSpaceBefore.contains(wordvalue) ||
                NoSpaceAfter.contains(lastword) ||
                wordvalue.startsWith("-") ||
                NoSpaceAfter.contains(endOfLastWord);
        String separator = glued ? "" : " ";

        multiwriter.write(separator + "<span title=\"" + interpvalue + "\" id=\"" + escapeXml(wordid) + "\">");
        multiwriter.write(escapeXml(wordvalue) + "</span>");
        lastword = wordvalue;
    }

    /** Appends the current text to the form, annotation and/or note being read. */
    private void characters() {
        if (!(flagform || flaginterp || inNote)) {
            return;
        }
        String text = parser.getText();
        if (text.length() == 0) {
            return;
        }
        if (flagform) wordvalue += text;
        if (flaginterp) interpvalue += text;
        if (inNote) noteText += text;
    }
}