Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / corptef / pager.groovy @ 479

History | View | Annotate | Download (15.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.corptef;
29

    
30
import org.txm.importer.*;
31
import org.xml.sax.Attributes;
32
import org.txm.importer.filters.*;
33
import java.io.File;
34
import java.io.IOException;
35
import java.util.ArrayList;
36
import javax.xml.parsers.SAXParserFactory;
37
import javax.xml.parsers.ParserConfigurationException;
38
import javax.xml.parsers.SAXParser;
39
import javax.xml.stream.*;
40
import java.net.URL;
41
import org.xml.sax.InputSource;
42
import org.xml.sax.helpers.DefaultHandler;
43

    
44
// TODO: Auto-generated Javadoc
45
/** Builds the simple HTML edition of a BFM corpus text from an XML-TEI file. @author mdecorde */
46
class pager {
47
        // ---- spacing rules handed in by the caller ----

        /** Forms checked against the current word; on a match no space is written before the word. */
        List<String> NoSpaceBefore;

        /** Forms checked against the previous word and its last character; on a match no space is written before the current word. */
        List<String> NoSpaceAfter;

        // ---- counters ----

        /** Incremented on every w start tag; reset to 0 once it reaches wordmax. */
        int wordcount = 0;

        /** Number of page files opened so far; also used as the page file suffix. */
        int pagecount = 0;

        /** Word threshold received through the constructor's max argument. */
        int wordmax = 0;

        // ---- state of the word currently being read ----

        /** Value of the id attribute of the most recent w start tag. */
        String wordid;

        /** True until the first w start tag has been seen. */
        boolean firstWord = true;

        /** Text accumulated while inside a form element. */
        String wordvalue;

        /** Annotation text accumulated from ana elements; used as the span title. */
        String interpvalue;

        /** Form of the last word written; starts as a single space. */
        String lastword = " ";

        /** Not read or written by the code of this class. */
        String wordtype;

        /** True while the parser is inside a form element. */
        boolean flagform = false;

        /** True while the parser is inside an ana element. */
        boolean flaginterp = false;

        // ---- input side ----

        /** URL built from the input file. */
        private def url;

        /** Stream opened on url. */
        private def inputData;

        /** XMLInputFactory used to create the parser. */
        private def factory;

        /** StAX reader over inputData. */
        private XMLStreamReader parser;

        // ---- output side ----

        /** Writer of the single-file edition (outfile). */
        OutputStreamWriter writer;

        /** Writer of the page file currently open; null before the first page exists. */
        OutputStreamWriter multiwriter = null;

        /** The TEI source file. */
        File infile;

        /** The single-file HTML target. */
        File outfile;

        /** Page files, in creation order. */
        ArrayList<File> pages = new ArrayList<File>();

        /** For each entry of pages, the word id recorded when that page was opened. */
        ArrayList<String> idxstart = new ArrayList<String>();

        /** Title read from the TEI header; used in the HTML titles and heading. */
        String titreId;
117

    
118
        /**
         * Builds a pager and runs the complete conversion right away: the
         * input is opened with a StAX reader, the single-file output is
         * created, and {@link #process()} is called before the constructor
         * returns.
         *
         * @param infile the XML-TEI file to read
         * @param outfile the HTML file of the single-file edition
         * @param NoSpaceBefore forms that must not be preceded by a space
         * @param NoSpaceAfter forms that must not be followed by a space
         * @param max the value stored in wordmax
         */
        pager(File infile, File outfile, List<String> NoSpaceBefore,
                        List<String> NoSpaceAfter, int max) {
                // keep the caller's settings
                this.infile = infile;
                this.url = infile.toURI().toURL();
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;
                this.wordmax = max;

                // open the StAX reader on the input file
                this.inputData = this.url.openStream();
                this.factory = XMLInputFactory.newInstance();
                this.parser = this.factory.createXMLStreamReader(this.inputData);

                // prepare the single-file output, then convert
                createOutput(outfile);
                process();
        }
141

    
142
        /**
         * Finishes the current page file, if there is one, and opens the next.
         *
         * The new file is placed in a "multi" directory next to outfile and
         * is named after outfile with its last five characters replaced by
         * "_" + page number + ".html" (presumably outfile ends in ".html" --
         * TODO confirm against callers). The file and the current wordid are
         * appended to pages and idxstart, and the HTML prologue is written.
         *
         * @return true if the page was opened; false if anything failed (the
         *         exception message is printed on standard output)
         */
        private boolean createNextOutput()
        {
                try {
                        // close the previous page properly
                        if (multiwriter != null) {
                                multiwriter.write("</body>");
                                multiwriter.write("</html>");
                                multiwriter.close();
                        }

                        pagecount++;
                        String outName = outfile.getName();
                        File f = new File(outfile.getParent()+"/multi/", outName.substring(0, outName.length()-5)+"_"+pagecount+".html");

                        // make sure the target directory exists, otherwise the
                        // FileOutputStream below fails on a fresh output directory
                        File pageDir = f.getParentFile();
                        if (pageDir != null) {
                                pageDir.mkdirs();
                        }

                        pages.add(f);
                        idxstart.add(wordid);
                        multiwriter = new OutputStreamWriter(new FileOutputStream(f) , "UTF-8");

                        multiwriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
                        multiwriter.write("<html>");
                        multiwriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
                        multiwriter.write("<head>");
                        multiwriter.write("<title>BFM - "+titreId+" - "+pagecount+"</title>");
                        multiwriter.write("</head>");
                        multiwriter.write("<body>");

                        if (wordid != null) {
                                // anchor carrying the id of the last word read so far
                                String[] splited = wordid.split("_");
                                int no = Integer.parseInt(splited[1]);
                                // NOTE(review): the original wrote (no++), a post-increment whose
                                // result is still `no`; if the next word's number was intended
                                // this should be (no + 1) -- confirm before changing the output.
                                multiwriter.write("<span id=\""+splited[0]+"_"+no+"\"/>");
                        }

                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
182

    
183
        /**
         * Remembers the single-file target and opens a UTF-8 writer on it.
         *
         * @param outfile the HTML file of the single-file edition
         * @return true if the writer was opened; false otherwise (the
         *         exception message is printed on standard output)
         */
        private boolean createOutput(File outfile) {
                this.outfile = outfile;
                try {
                        FileOutputStream rawStream = new FileOutputStream(outfile);
                        this.writer = new OutputStreamWriter(rawStream, "UTF-8");
                } catch (Exception failure) {
                        System.out.println(failure.getLocalizedMessage());
                        return false;
                }
                return true;
        }
202

    
203
        /**
         * Returns the page files created so far, in creation order.
         * The internal list itself is returned, not a copy.
         *
         * @return the list of page files
         */
        public ArrayList<File> getPageFiles() {
                return this.pages;
        }
211

    
212
        /**
         * Returns the word ids recorded when each page was opened, in the
         * same order as the page files. The internal list itself is
         * returned, not a copy.
         *
         * @return the list of page start word ids
         */
        public ArrayList<String> getIdx() {
                return this.idxstart;
        }
220

    
221
        /**
         * Reads the TEI input with the StAX parser and writes the HTML
         * edition twice: once into the single file (writer) and once into
         * the current page file (multiwriter).
         *
         * Eight metadata values are first read from the TEI header with
         * XPath. If any of them is null, a message is printed on standard
         * error for each missing value and nothing is converted.
         *
         * A new page file is started on every pb element met after at least
         * one word has been counted. Words are written at the end of each w
         * element as a span whose title is the accumulated annotation text
         * and whose id is the word id.
         *
         * @throws IOException if a page file cannot be created or a write fails
         */
        void process()
        {
                String localname = "";

                // set when an <h3> was opened for a div and must be closed with it
                boolean closeTitre = false;
                boolean closeAmen = false;

                boolean flagchoice = false;
                boolean flagcorr = false;
                boolean flagsic = false;
                boolean flagreg = false;
                boolean flagorig = false;
                boolean flagforeign = false;

                File xpathfile = infile;
                titreId = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title[@type='normal']/text()");
                String auteur = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author");
                String datecompo = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when");
                String ssiecle = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo_sous_siecle']/@n");
                String domaine = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/tei:domain/@type");
                String genre = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textDesc/@n");
                String forme = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:textClass/tei:catRef/@target[contains(.,'forme')]");
                String dialecte = XPathResult.getXpathResponse(xpathfile,"tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:region[@type='dialecte_auteur']");

                // report every missing value before giving up
                if (titreId == null) {
                        System.err.println("titreId regexp error");
                }
                if (auteur == null) {
                        System.err.println("auteur regexp error");
                }
                if (datecompo == null) {
                        System.err.println("datecompo regexp error");
                }
                if (ssiecle == null) {
                        System.err.println("ssiecle regexp error");
                }
                if (domaine == null) {
                        System.err.println("domaine regexp error");
                }
                if (genre == null) {
                        System.err.println("genre regexp error");
                }
                if (forme == null) {
                        System.err.println("forme regexp error");
                }
                if (dialecte == null) {
                        System.err.println("dialecte regexp error");
                }
                if (titreId == null || auteur == null || datecompo == null || ssiecle == null || domaine == null || genre == null || forme == null || dialecte == null) {
                        // nothing will be written: release what the constructor opened
                        closeQuietly(writer);
                        closeQuietly(inputData);
                        return;
                }

                // drop the first character of the catRef target
                // (presumably a leading '#' -- TODO confirm)
                forme = forme.substring(1);

                if (!createNextOutput()) {
                        closeQuietly(writer);
                        closeQuietly(inputData);
                        throw new IOException("Unable to create page file number "+pagecount+" for "+infile);
                }

                try {
                        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                        writer.write("<html>");
                        writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
                        writer.write("<head>");
                        writer.write("<title>BFM - "+titreId+"</title>");
                        writer.write("</head>");
                        writer.write("<body>");

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                localname = parser.getLocalName();
                                                switch (localname)
                                                {
                                                        case "text":
                                                                // title and metadata banner
                                                                // NOTE(review): "font-size=200%" is not valid CSS (":" expected);
                                                                // left as is so the generated HTML does not change.
                                                                writeBoth("<h1 align=\"center\" style=\"color:darkviolet;font-family:Times;font-weight:bold;font-style:italic;font-size=200%\" class=\"text\">"+titreId+"</h1>");
                                                                writeBoth("<p align=\"center\">");
                                                                writeBoth("_____________________________________________________<br/>\n");
                                                                writeBoth("auteur : "+auteur+"<br/>");
                                                                writeBoth("date de composition : "+datecompo+"<br/>");
                                                                writeBoth("sous-siècle : "+ssiecle+"<br/>");
                                                                writeBoth("domaine : "+domaine+"<br/>");
                                                                writeBoth("genre : "+genre+"<br/>");
                                                                writeBoth("forme : "+forme+"<br/>");
                                                                writeBoth("dialecte : "+dialecte+"<br/>");
                                                                writeBoth("_____________________________________________________<br/>\n");
                                                                writeBoth("</p>");
                                                                break;

                                                        case "head":
                                                                writeBoth("<h2>\n");
                                                                break;

                                                        case "lg":
                                                                writeBoth("<p>\n");
                                                                break;

                                                        case "pb":
                                                                // a page break starts a new page file once words have been counted
                                                                if (wordcount > 0 && !createNextOutput()) {
                                                                        throw new IOException("Unable to create page file number "+pagecount+" for "+infile);
                                                                }
                                                                String pageNumber = parser.getAttributeValue(null,"n");
                                                                if (pageNumber != null) {
                                                                        writeBoth("<p style=\"color:red\" align=\"center\">- "+pageNumber+" -</p>\n");
                                                                }
                                                                break;

                                                        case "ab":
                                                                String abNumber = parser.getAttributeValue(null,"n");
                                                                if (abNumber != null) {
                                                                        writeBoth("<p align=\"center\">"+abNumber+"</p>\n");
                                                                }
                                                                break;

                                                        case "l":
                                                                writeBoth("<br/>\n");
                                                                break;

                                                        case "p":
                                                        case "q":
                                                                writeBoth("<p>\n");
                                                                break;

                                                        case "div":
                                                                // Groovy '==' on strings is value equality and is null-safe
                                                                String divType = parser.getAttributeValue(null,"type");
                                                                if (divType == "titre") {
                                                                        writeBoth("<h3>\n");
                                                                        closeTitre = true;
                                                                } else if (divType == "amen") {
                                                                        writeBoth("<h3>\n");
                                                                        closeAmen = true;
                                                                }
                                                                break;

                                                        case "w":
                                                                wordid = parser.getAttributeValue(null,"id");
                                                                if (firstWord) {
                                                                        // the first page was opened before any word was read:
                                                                        // record its real first word id now
                                                                        firstWord = false;
                                                                        this.idxstart.set(0,wordid);
                                                                }
                                                                wordcount++;
                                                                if (wordcount >= wordmax) {
                                                                        // only the counter is reset here; pages are cut on pb elements
                                                                        wordcount = 0;
                                                                }
                                                                break;

                                                        case "choice":
                                                                flagchoice = true;
                                                                break;
                                                        case "corr":
                                                                flagcorr = true;
                                                                break;
                                                        case "sic":
                                                                flagsic = true;
                                                                break;
                                                        case "reg":
                                                                flagreg = true;
                                                                break;
                                                        case "orig":
                                                                flagorig = true;
                                                                break;
                                                        case "foreign":
                                                                flagforeign = true;
                                                                break;

                                                        case "ana":
                                                                // start a new " type:" entry; the value comes from the text content
                                                                flaginterp = true;
                                                                interpvalue += " "+parser.getAttributeValue(null,"type").substring(1)+":";
                                                                break;

                                                        case "form":
                                                                wordvalue = "";
                                                                interpvalue = "";
                                                                flagform = true;
                                                                break;
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                localname = parser.getLocalName();
                                                switch (localname)
                                                {
                                                        case "head":
                                                                writeBoth("</h2>\n");
                                                                break;

                                                        case "div":
                                                                if (closeTitre) {
                                                                        writeBoth("</h3>\n");
                                                                        closeTitre = false;
                                                                } else if (closeAmen) {
                                                                        writeBoth("</h3>\n");
                                                                        // fixed: the original reset closeTitre here, so closeAmen
                                                                        // stayed true and every later </div> wrote a stray </h3>
                                                                        closeAmen = false;
                                                                }
                                                                break;

                                                        case "lg":
                                                                writeBoth("</p>\n");
                                                                break;

                                                        case "lb":
                                                                writeBoth("<br/>\n");
                                                                break;

                                                        case "p":
                                                        case "q":
                                                                writeBoth("</p>\n");
                                                                break;

                                                        case "choice":
                                                                flagchoice = false;
                                                                break;
                                                        case "corr":
                                                                flagcorr = false;
                                                                break;
                                                        case "sic":
                                                                flagsic = false;
                                                                break;
                                                        case "reg":
                                                                flagreg = false;
                                                                break;
                                                        case "orig":
                                                                flagorig = false;
                                                                break;
                                                        case "foreign":
                                                                flagforeign = false;
                                                                break;

                                                        case "form":
                                                                flagform = false;
                                                                break;

                                                        case "ana":
                                                                flaginterp = false;
                                                                break;

                                                        case "w":
                                                                int l = lastword.length();
                                                                String color = "";
                                                                if (flagcorr) {
                                                                        color = "style=\"color: green;\"";
                                                                } else if (flagreg) {
                                                                        color = "style=\"color: blue;\"";
                                                                } else if (flagforeign) {
                                                                        color = "style=\"color: blue;\"";
                                                                }

                                                                // inside a choice only the corr / reg alternative is written
                                                                if (!flagchoice || flagcorr || flagreg) {
                                                                        String endOfLastWord = "";
                                                                        if (l > 0) {
                                                                                endOfLastWord = lastword.substring(l-1, l);
                                                                        }

                                                                        if (interpvalue != null) {
                                                                                interpvalue = interpvalue.replace("\"","&quot;");
                                                                        }

                                                                        // no separating space when either word asks for it
                                                                        boolean glued = NoSpaceBefore.contains(wordvalue) ||
                                                                                        NoSpaceAfter.contains(lastword) ||
                                                                                        wordvalue.startsWith("-") ||
                                                                                        NoSpaceAfter.contains(endOfLastWord);

                                                                        String openingSpan = "<span title=\""+interpvalue+"\" id=\""+wordid+"\" "+color+">";
                                                                        writeBoth(glued ? openingSpan : " "+openingSpan);
                                                                        writeBoth(wordvalue.replace("<", "&lt;")+"</span>");
                                                                        lastword = wordvalue;
                                                                }
                                                                break;
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                // text goes to the word form and/or the annotation being read
                                                if (flagform) {
                                                        if (parser.getText().length() > 0) {
                                                                wordvalue += (parser.getText());
                                                        }
                                                }
                                                if (flaginterp) {
                                                        if (parser.getText().length() > 0) {
                                                                interpvalue += (parser.getText());
                                                        }
                                                }
                                                break;
                                }
                        }

                        writer.write("</body>");
                        writer.write("</html>");
                        writer.close();
                        multiwriter.write("</body>");
                        multiwriter.write("</html>");
                        multiwriter.close();
                        inputData.close();
                } finally {
                        // no-ops after a normal run; release the streams after a failure
                        closeQuietly(writer);
                        closeQuietly(multiwriter);
                        closeQuietly(inputData);
                }
        }

        /** Writes the same HTML fragment to the single file and to the current page file. */
        private void writeBoth(String html)
        {
                writer.write(html);
                multiwriter.write(html);
        }

        /** Closes a stream or writer if it is not null, ignoring any failure of the close itself. */
        private void closeQuietly(Closeable resource)
        {
                if (resource == null) {
                        return;
                }
                try {
                        resource.close();
                } catch (IOException ignored) {
                        // cleanup only: a failing close must not mask the primary error
                }
        }
542

    
543
        /**
         * Writes the page / word-id correspondences, one line per page:
         * the page file name, a tab, the word id recorded for that page,
         * and a newline, encoded in UTF-8.
         *
         * If the file already exists the lines are appended to it;
         * otherwise it is created.
         *
         * @param file the file to write to
         * @return nothing meaningful (null)
         */
        public writecorrespondancesFile(File file)
        {
                Writer w = new OutputStreamWriter(new FileOutputStream(file,file.exists()) , "UTF-8");
                try {
                        for (int i = 0 ; i < pages.size(); i++) {
                                w.write(pages.get(i).getName()+"\t"+idxstart.get(i)+"\n");
                        }
                } finally {
                        // close even when a write fails so the file handle is not leaked
                        w.close();
                }
        }
558
}