Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / hyperprince / pager.groovy @ 479

History | View | Annotate | Download (11.9 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer.hyperprince;
29

    
30
import org.txm.objects.*;
31
import org.txm.importer.XPathResult;
32
import org.xml.sax.Attributes;
33
import org.txm.importer.filters.*;
34
import java.io.File;
35
import java.io.IOException;
36
import java.util.ArrayList;
37
import javax.xml.parsers.SAXParserFactory;
38
import javax.xml.parsers.ParserConfigurationException;
39
import javax.xml.parsers.SAXParser;
40
import javax.xml.stream.*;
41
import java.net.URL;
42
import org.xml.sax.InputSource;
43
import org.xml.sax.helpers.DefaultHandler;
44
import org.txm.utils.i18n.*;
45

    
46
// TODO: Auto-generated Javadoc
47
/** Builds the paged HTML edition of a Hyperprince corpus text from its XML-TEI source. @author mdecorde */
48
class pager {
49
        /** Word forms that are written without a space in front of them. */
        List<String> NoSpaceBefore;

        /** Word forms (or final characters of a word) after which no space is written. */
        List<String> NoSpaceAfter;

        /** Word counter; only referenced by the word-count pagination that is commented out in process(). */
        int wordcount;

        /** Number of page files created so far; also used to number the page file names. */
        int pagecount;

        /** Maximum number of words per page, as received by the constructor. */
        int wordmax;

        /** Value of the "id" attribute of the most recently opened "w" element. */
        String wordid;

        /** True until the first "w" element has been seen. */
        boolean firstWord = true;

        /** Text collected inside the current "form" element. */
        String wordvalue;

        /** Annotation text collected from the "ana" elements of the current word. */
        String interpvalue;

        /** Form of the previously written word; starts as a single space. */
        String lastword = " ";

        /** Word type; not used by the code of this class. */
        String wordtype;

        /** True while the parser is inside a "form" element. */
        boolean flagform;

        /** True while the parser is inside an "ana" element. */
        boolean flaginterp;

        /** URL of the input file. */
        private def url;

        /** Stream opened on the input URL. */
        private def inputData;

        /** StAX factory used to create the parser. */
        private def factory;

        /** StAX reader positioned on the input stream. */
        private XMLStreamReader parser;

        /** Writer of the single-file edition. */
        def writer;

        /** Writer of the page currently being produced, or null before the first page. */
        def multiwriter;

        /** The XML-TEI source file. */
        File infile;

        /** The single-file HTML result. */
        File outfile;

        /** Page files, in creation order. */
        ArrayList<File> pages = [];

        /** Word id recorded for each page, parallel to pages. */
        ArrayList<String> idxstart = [];

        /** Value of the "id" attribute of the most recently opened "seg" element. */
        String segid;

        /** Title text read from the TEI header. */
        String title;
122

    
123
        /**
         * Builds the HTML edition of one XML-TEI file.
         *
         * Construction does all of the work: the source is opened as a StAX
         * stream, the output files are created and {@link #process()} is run
         * before the constructor returns.
         *
         * @param infile the XML-TEI source file
         * @param outfile the single-file HTML result; page files are written to a
         *        "multi" directory located next to it
         * @param NoSpaceBefore word forms written without a leading space
         * @param NoSpaceAfter word forms (or final characters) after which no space is written
         * @param max the maximum number of words per page (stored in wordmax)
         * @throws IOException if the output files cannot be created
         */
        pager(File infile, File outfile, List<String> NoSpaceBefore,
                        List<String> NoSpaceAfter, int max) {
                this.wordmax = max;
                this.NoSpaceBefore = NoSpaceBefore;
                this.NoSpaceAfter = NoSpaceAfter;
                this.infile = infile;
                this.url = infile.toURI().toURL();

                inputData = url.openStream();
                boolean done = false;
                try {
                        factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        // fail early with a clear message instead of a later NPE in process()
                        if (!createOutput(outfile)) {
                                throw new IOException("Cannot create output file: " + outfile);
                        }
                        process();
                        done = true;
                } finally {
                        if (!done) {
                                // something went wrong: release whatever was opened so far,
                                // without masking the original exception
                                for (def resource : [writer, multiwriter, parser, inputData]) {
                                        try {
                                                resource?.close();
                                        } catch (Exception ignored) {
                                                // best-effort cleanup only
                                        }
                                }
                        }
                }
        }
146

    
147
        /**
         * Closes the page currently being written (if any) and opens the next one.
         *
         * The new file is named from the first two characters of the output file
         * name and the page number, and is created in the "multi" directory next
         * to the output file. The page is only registered in pages/idxstart once
         * its file has been opened, so both lists describe existing files only.
         * After a failure multiwriter is null rather than a closed writer.
         *
         * @return true if the new page was opened and its HTML header written,
         *         false if an error occurred (its message is printed)
         */
        private boolean createNextOutput()
        {
                try {
                        // finish the previous page, if there is one
                        if (multiwriter != null) {
                                multiwriter.write("</body>");
                                multiwriter.write("</html>");
                                multiwriter.close();
                                multiwriter = null;
                        }

                        int nextPage = pagecount + 1;
                        File pageDir = new File(outfile.getParent()+"/multi/");
                        pageDir.mkdirs(); // no-op when the directory already exists

                        // names shorter than two characters are used whole
                        String name = outfile.getName();
                        String prefix = name.substring(0, Math.min(2, name.length()));
                        File f = new File(pageDir, prefix+"_"+"page_"+nextPage+".html");
                        def pageWriter = new OutputStreamWriter(new FileOutputStream(f) , "UTF-8");

                        // the file exists now: commit the new page
                        pagecount = nextPage;
                        pages.add(f);
                        idxstart.add(wordid);
                        multiwriter = pageWriter;

                        multiwriter.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
                        multiwriter.write("<html>");
                        multiwriter.write("<head>");
                        multiwriter.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");

                        // page title: the current segment id when known, the text title otherwise;
                        // escaped so that '&' or '<' cannot break the markup
                        String heading = "" + ((segid != null) ? segid : title);
                        heading = heading.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
                        multiwriter.write("<title>Hyperprince "+heading+"</title>");

                        multiwriter.write("</head>");
                        multiwriter.write("<body>");

                        return true;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
188

    
189
        /**
         * Opens the single-file edition and the first page.
         *
         * @param outfile the single-file HTML result
         * @return true only if both the single-file writer and the first page
         *         were opened; false otherwise (the error message is printed)
         */
        private boolean createOutput(File outfile) {
                try {
                        this.outfile = outfile;

                        writer = new OutputStreamWriter(new FileOutputStream(outfile),
                                        "UTF-8");
                        if (createNextOutput()) {
                                return true;
                        }
                        // the first page could not be created: do not keep the main file open
                        writer.close();
                        writer = null;
                        return false;
                } catch (Exception e) {
                        System.out.println(e.getLocalizedMessage());
                        return false;
                }
        }
208

    
209
        /**
         * Returns the page files created by this pager, in creation order.
         *
         * @return the internal list of page files (not a copy)
         */
        public ArrayList<File> getPageFiles() {
                return this.pages;
        }
217

    
218
        /**
         * Returns the word ids recorded for the pages, one entry per page file
         * and in the same order as {@link #getPageFiles()}.
         *
         * @return the internal list of word ids (not a copy)
         */
        public ArrayList<String> getIdx() {
                return this.idxstart;
        }
226

    
227
        /**
         * Streams the XML-TEI source and writes the HTML edition.
         *
         * Every piece of markup is written twice: to the single-file edition
         * (writer) and to the current page (multiwriter). A new page is started
         * at each "seg" element. Each "w" element becomes a "span" whose id is
         * the word id and whose title is the text collected from its "ana"
         * children. All text taken from the source is XML-escaped before being
         * written. Writers, parser and input stream are closed when the method
         * ends, including when parsing fails.
         */
        void process()
        {
                // note: the text used as title is the titleStmt/author of the TEI header
                String query = "//tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/text()";
                title = XPathResult.getXpathResponse(infile, query);

                boolean completed = false;
                try {
                        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
                        writer.write("<html>");
                        writer.write("<head>");
                        writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>");
                        writer.write("<title>Hyperprince "+escapeXml(title)+"</title>");
                        writer.write("</head>");
                        writer.write("<body>");

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                onStartElement(parser.getLocalName());
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                onEndElement(parser.getLocalName());
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if (flagform)
                                                        wordvalue += parser.getText();
                                                if (flaginterp)
                                                        interpvalue += parser.getText();
                                                break;
                                }
                        }

                        writeBoth("</body>");
                        writeBoth("</html>");

                        // normal termination: close errors are allowed to propagate
                        writer.close();
                        multiwriter.close();
                        parser.close();
                        inputData.close();
                        completed = true;
                } finally {
                        if (!completed) {
                                // failure path: release everything without masking the original exception
                                for (def resource : [writer, multiwriter, parser, inputData]) {
                                        try {
                                                resource?.close();
                                        } catch (Exception ignored) {
                                                // best-effort cleanup only
                                        }
                                }
                        }
                }
        }

        /** Handles the opening tag named localname: headings, page breaks and word state. */
        private void onStartElement(String localname)
        {
                switch (localname) {
                        case "text":
                                writeBoth("<h1>"+escapeXml(title)+"</h1>\n");
                                break;

                        case "front":
                        case "back":
                                writeBoth("<h2>"+escapeXml(parser.getAttributeValue(null, "type"))+"</h2>\n");
                                break;

                        case "div0":
                                writeBoth("<h2>Book</h2>\n");
                                break;

                        case "seg":
                                // each segment starts a new page
                                segid = parser.getAttributeValue(null,"id");
                                // NOTE(review): createNextOutput() records wordid as the index of the new
                                // page, which at this point is the id of the last word read so far, not the
                                // first word of the new page -- confirm this is what the edition expects.
                                createNextOutput();
                                writeBoth("<p>\n");
                                break;

                        case "w":
                                wordid = parser.getAttributeValue(null,"id");
                                if (firstWord) {
                                        firstWord = false;
                                        // the first page is opened before any word is known
                                        if (!idxstart.isEmpty())
                                                this.idxstart.set(0,wordid);
                                }
                                break;

                        case "ana":
                                flaginterp = true;
                                // an "ana" seen before any "form" must not produce a literal "null" prefix
                                interpvalue = (interpvalue ?: "") + " "+parser.getAttributeValue(null,"type")+":";
                                break;

                        case "form":
                                wordvalue = "";
                                interpvalue = "";
                                flagform = true;
                                break;
                }
        }

        /** Handles the closing tag named localname: ends paragraphs and writes completed words. */
        private void onEndElement(String localname)
        {
                switch (localname) {
                        case "seg":
                                // close the paragraph in both outputs (it was opened in both)
                                writeBoth("</p>\n");
                                break;

                        case "form":
                                flagform = false;
                                break;

                        case "ana":
                                flaginterp = false;
                                break;

                        case "w":
                                writeWord();
                                break;
                }
        }

        /** Writes the current word as a span, preceded by a space unless the spacing lists forbid it. */
        private void writeWord()
        {
                String form = (wordvalue != null) ? wordvalue : "";
                int l = lastword.length();
                String endOfLastWord = (l > 0) ? lastword.substring(l - 1) : "";

                boolean glued = NoSpaceBefore.contains(form) ||
                                NoSpaceAfter.contains(lastword) ||
                                form.startsWith("-") ||
                                NoSpaceAfter.contains(endOfLastWord);

                String span = "<span title=\""+escapeXml(interpvalue)+"\" id=\""+escapeXml(wordid)+"\">"+
                                escapeXml(form)+"</span>";
                writeBoth(glued ? span : " "+span);
                lastword = form;
        }

        /** Writes the same markup to the single-file edition and to the current page. */
        private void writeBoth(String markup)
        {
                writer.write(markup);
                multiwriter.write(markup);
        }

        /** Escapes the characters that are significant in XML text and attribute values. */
        private static String escapeXml(Object text)
        {
                // '&' first, so that the entities produced below are not escaped again
                return ("" + text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;");
        }
382

    
383
        /**
         * Development entry point: rebuilds the "hyperprince" base of the default
         * project and generates the HTML edition of each of its texts.
         *
         * The HTML directory under the root directory is deleted and recreated
         * on every run.
         *
         * @param args optional; args[0], when present, is the corpus root
         *        directory (it must contain a "txm" directory). Without it the
         *        historical hard-coded development path is used.
         */
        public static void main(String[] args)
        {
                String userDir = System.getProperty("user.home");
                String lang = "fr";
                // always defined, whether or not the Toolbox is already initialized
                String rootDir = (args != null && args.length > 0) ? args[0] :
                                "D:\\Travail_Sev\\Projets\\Corpus\\TXM\\hyperprince";
                println "DEV MODE";
                if (!org.txm.Toolbox.isInitialized()) {
                        org.txm.Toolbox.workspace = new Workspace(new File(userDir,"TXM/workspaces/default.xml"));
                        org.txm.Toolbox.setParam(org.txm.Toolbox.INSTALL_DIR,new File(userDir,"TXM"));
                }

                Workspace w = org.txm.Toolbox.workspace;
                Project p = w.getProject("default");
                p.removeBase("hyperprince");
                Base b = p.addBase("hyperprince");
                b.addDirectory(new File(rootDir,"txm"));

                println "-- EDITION"
                // start from an empty HTML/multi tree
                File htmlDir = new File(rootDir, "HTML");
                htmlDir.deleteDir();
                new File(htmlDir, "multi").mkdirs();

                for (String textname : b.getTextsID())
                {
                        Text text = b.getText(textname);
                        File srcfile = text.getSource();

                        // result file: source name with its extension replaced by ".html"
                        String srcName = srcfile.getName();
                        int dot = srcName.lastIndexOf(".");
                        String baseName = (dot > 0) ? srcName.substring(0, dot) : srcName;
                        File resultfile = new File(htmlDir, baseName+".html");

                        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
                        println("build hyperprince xml-tei file : "+srcfile+" to : "+resultfile );

                        def ed = new pager(srcfile,resultfile, NoSpaceBefore, NoSpaceAfter,500);

                        // declare every generated page, with its index, in the text's edition
                        Edition editionweb = text.addEdition("default","html",resultfile);
                        def pageFiles = ed.getPageFiles();
                        def pageIdx = ed.getIdx();
                        for (int i = 0 ; i < pageFiles.size() ; i++)
                        {
                                editionweb.addPage(pageFiles.get(i), pageIdx.get(i));
                        }
                }
                w.save();
        }
459
}