Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xtz / XTZFacsPagerStep.groovy @ 187

History | View | Annotate | Download (10.4 kB)

1
package org.txm.importer.xtz
2

    
3
import org.txm.importer.*
4
import org.xml.sax.Attributes
5
import org.txm.importer.filters.*
6
import java.util.ArrayList
7
import javax.xml.parsers.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import org.xml.sax.InputSource
11
import org.xml.sax.helpers.DefaultHandler
12
import java.io.FileFilter
13

    
14
/**
 * Import step that cuts one XML-TXM text into a sequence of HTML "facsimile" pages.
 *
 * The XML file is read with a StAX parser. Each time the page-break element
 * ({@code tag}) is met, the current HTML page is closed and a new one named
 * {@code <txtname>_<n>.html} is opened in {@code editionDir}, together with an
 * {@code <img>} element (or a placeholder paragraph when no image source is known).
 * The image source is either the next file of {@code <imageDirectory>/<txtname>}
 * (when an image directory was given) or the value of the {@code attribute}
 * attribute of the page-break element.
 *
 * An instance is single use: the XML parser is created by the constructor and is
 * closed at the end of the first parse.
 */
class XTZFacsPagerStep extends Step {

	// Not used by this class; kept so that the declared members are unchanged.
	private def url
	/** Stream opened on xmlFile by the constructor; closed when parsing ends. */
	private def inputData
	private def factory
	private XMLStreamReader parser
	// Not used by this class; kept so that the public properties are unchanged.
	OutputStreamWriter writer
	/** Writer of the HTML page currently being built, null before the first page. */
	StaxStackWriter pagedWriter = null

	/** Directory receiving the generated HTML pages. */
	File editionDir
	File imageDirectory // contains all the images sub-directories
	/** Index, in imagesFiles, of the image to use for the next page break. */
	int iImages
	File textImagesDirectory // contains the images for the txtname
	File xmlFile
	/** HTML file of the page currently being built. */
	File htmlFile

	/** Produced pages, one [htmlFile, wordid] pair per page. */
	def pages = []
	/** Attributes of the img element; "src" is overwritten for each image. */
	def imageAttributes = [:];

	def tag, attribute, txtname, corpusname
	/** True until a word element has been met in the current page. */
	boolean firstWord
	/** When true the page is cut before writing the image, otherwise after. */
	boolean cutBefore = true;
	String wtag;
	boolean debug = false;

	/** Number of the page currently being built (first page is 1). */
	int n = 1;
	/** Id of the first word of the current page, null before the first page is created. */
	String wordid = null;
	/** Image files found in textImagesDirectory, sorted by name. */
	def imagesFiles;

	/**
	 * Opens the XML file and prepares the pager.
	 *
	 * @param xmlFile the XML-TXM file to paginate
	 * @param editionDir directory receiving the HTML pages, created if needed
	 * @param imageDirectory parent directory of the per-text image directories, may be null
	 * @param txtname name of the text, used for the page file names and the image sub-directory
	 * @param corpusname name of the corpus, used in the page title, the CSS link and the image paths
	 * @param tag local name of the page-break element
	 * @param attribute name of the page-break attribute holding the image path
	 * @param wtag local name of the word element
	 * @param debug when true, progress messages are printed
	 */
	public XTZFacsPagerStep(File xmlFile, File editionDir, File imageDirectory, String txtname, String corpusname, String tag, String attribute, String wtag, boolean debug) {
		inputData = xmlFile.toURI().toURL().openStream()
		try {
			factory = XMLInputFactory.newInstance()
			parser = factory.createXMLStreamReader(inputData)
		} catch (Exception e) {
			// do not leak the stream when the parser cannot be created
			inputData.close()
			throw e
		}

		this.xmlFile = xmlFile
		this.editionDir = editionDir
		editionDir.mkdirs()
		this.imageDirectory = imageDirectory
		this.tag = tag
		this.attribute = attribute
		this.txtname = txtname
		this.corpusname= corpusname
		this.wtag= wtag;
		this.debug = debug

		imageAttributes = ["src":"", 
			"id":"txm_image", 
			"alt":"Le fac-similé de cette page n'est pas accessible", 
			//"style":"display:block;width:100%;", 
			"onLoad":"viewer.toolbarImages='images/icons';viewer.onload=viewer.toolbar;new viewer({image: this, frame: ['100%','100%']});"
			]

		if (imageDirectory != null)
			textImagesDirectory = new File(imageDirectory, txtname)
		iImages = 0;
	}

	/**
	 * Closes the current page (if any), records it in {@code pages} and opens the next
	 * HTML page with its head and body preamble.
	 *
	 * @return true if the new page was created, false if an error occurred (the error is printed)
	 */
	private boolean createNextOutput() {
		try {
			def tags = closeMultiWriter();
			// Drop the leading entries of the saved tag stack up to and including "body":
			// that part of the page is written again below.
			// NOTE(review): the "div" wrapper written below is not dropped here, so it looks
			// like it is re-opened a second time by writeStartElements - confirm against StaxStackWriter.
			while (!tags.isEmpty()) {
				String removed = tags.remove(0)
				if ("body" == removed) {
					break;
				}
			}

			if (wordid != null) { // null only before the very first page
				pages << [htmlFile, wordid] // store the previous page
				n++
			}

			// Next page
			htmlFile = new File(editionDir, "${txtname}_${n}.html")

			firstWord = true
			pagedWriter = new StaxStackWriter(htmlFile, "UTF-8");
			if (debug) println "Create file $htmlFile"
			pagedWriter.writeStartDocument("UTF-8", "1.0")
			pagedWriter.writeStartElement("html");
			pagedWriter.writeStartElement("head");
			pagedWriter.writeStartElement("title")
			pagedWriter.writeCharacters(corpusname+" Edition - Page "+n)
			pagedWriter.writeEndElement(); // </title>
			pagedWriter.writeStartElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
			pagedWriter.writeEndElement()
			pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"txm.css"]);
			pagedWriter.writeEndElement()
			pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"${corpusname}.css"]);
			pagedWriter.writeEndElement()
			pagedWriter.writeStartElement("script", ["type":"text/javascript", "src":"js/viewer/Simple_Viewer_beta_1.1-min.js"]);
			pagedWriter.writeEndElement()
			pagedWriter.writeStartElement("script", ["type":"text/javascript", "src":"js/viewer/toolbar-ext.js"]);
			pagedWriter.writeEndElement()
			pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"js/viewer/toolbar-ext.css"]);
			pagedWriter.writeEndElement()
			pagedWriter.writeEndElement() // </head>
			pagedWriter.writeStartElement("body") //<body>
			pagedWriter.writeStartElement("div", ["id":"txmeditionpage"]);
			pagedWriter.writeStartElements(tags); // re-open what was still open in the previous page

			wordid = "${wtag}_0"; // default value if no word is found
			return true;
		} catch (Exception e) {
			System.out.println("Error while creating HTML file: "+e);
			return false;
		}
	}

	/**
	 * Closes the current page writer, if any.
	 * When the page contains no word, an empty anchor {@code <span id="<wtag>_0"/>} is written first.
	 *
	 * @return a copy of the tag stack taken before the page was closed, or an empty list when no page is open
	 */
	private def closeMultiWriter()
	{
		if (pagedWriter == null) {
			return [];
		}

		def tags = pagedWriter.getTagStack().clone();

		if (firstWord) { // there was no words
			pagedWriter.writeCharacters("");
			pagedWriter.write("<span id=\"${wtag}_0\"/>");
		}
		pagedWriter.writeEndElements();
		pagedWriter.close();
		return tags;
	}

	/**
	 * Writes the facsimile image of the current page, or a placeholder paragraph
	 * when no image source is available.
	 *
	 * @param src the image path or URL, may be null or empty
	 */
	private writeImg(String src) {
		if (src == null || src.length() == 0) {
			pagedWriter.writeStartElement("p", ["class":"no-img"]);
			pagedWriter.writeCharacters("Cette page n'est pas disponible en mode fac-similé.");
			pagedWriter.writeEndElement(); // </p>
		} else {
			imageAttributes["src"] = src
			pagedWriter.writeEmptyElement("img", imageAttributes);
		}
	}

	/**
	 * Process the XML-TXM file. If imageDirectory is not null, it is used to fetch the images;
	 * if not, the page-break attribute (e.g. pb@facs) contains the image path.
	 *
	 * @return true if more than one page was produced
	 */
	public boolean process() {
		if (textImagesDirectory != null) {
			return processWithImages();
		} else {
			return processWithoutImages();
		}
	}

	/**
	 * Paginates the text using the image files of {@code textImagesDirectory}, taken in
	 * name order: the i-th page break gets the i-th image.
	 *
	 * @return false if the directory is missing or contains no image, or if a page could
	 *         not be created; otherwise true if more than one page was produced
	 */
	public boolean processWithImages() {

		if (!textImagesDirectory.exists()) {
			println ("No images directory found: "+textImagesDirectory)
			return false;
		}

		// Scan textImagesDirectory for images
		imagesFiles = textImagesDirectory.listFiles(new FileFilter() {
					public boolean accept(File f) {
						return !f.isHidden() && !f.isDirectory()
					}
				});
		if (imagesFiles == null || imagesFiles.size() == 0) {
			println ("No facs images found in "+textImagesDirectory)
			return false;
		}
		imagesFiles.sort(); // sort by name
		iImages = 0;

		return paginate(true)
	}

	/**
	 * Paginates the text using the image path found in the {@code attribute} attribute
	 * of each page-break element.
	 *
	 * @return false if a page could not be created; otherwise true if more than one page was produced
	 */
	public boolean processWithoutImages() {
		return paginate(false)
	}

	/**
	 * Reads the XML-TXM file and cuts a page at each page-break element.
	 * The XML parser and its input stream are closed when the method exits.
	 *
	 * @param useImageFiles true to take the images from imagesFiles, false to read them from the page-break attribute
	 * @return false if a page could not be created; otherwise true if more than one page was produced
	 */
	private boolean paginate(boolean useImageFiles) {
		try {
			if (!createNextOutput()) { // empty page
				return false
			}

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event != XMLStreamConstants.START_ELEMENT) {
					continue
				}

				switch (parser.getLocalName()) {
					case "text":
						printStartPage();
						break;

					case wtag:
						if (firstWord) { // remember the first word of the page
							wordid = parser.getAttributeValue(null, "id");
							firstWord = false;
						}
						break;

					case tag:
						boolean cut = useImageFiles ? cutWithNextImageFile() : cutWithAttributeImage()
						if (!cut) {
							return false
						}
						break;
				}
			}

			closeMultiWriter()
			pages << [htmlFile, wordid] // add the last page (no pb encountered)

			return pages.size() > 1
		} finally {
			closeInput()
		}
	}

	/**
	 * Handles a page break using the next file of imagesFiles. When the images are
	 * exhausted an error is printed and no page is cut.
	 *
	 * @return false only if the next page could not be created
	 */
	private boolean cutWithNextImageFile() {
		if (debug) println "** TAG $tag $attribute : "+parser.getAttributeValue(null, attribute);

		boolean done = true
		if (iImages >= imagesFiles.size() ) {
			println " ERROR Cound not find the $iImages th image for file $xmlFile at location "+parser.getLocation().getLineNumber()
		} else {
			String imgPath = "res/images/$corpusname/${editionDir.getName()}/$txtname/"+imagesFiles[iImages].getName()
			done = writePageBreak(imgPath)
		}
		iImages++ // next image file
		return done
	}

	/**
	 * Handles a page break using the image path stored in the {@code attribute} attribute
	 * of the current element (empty when the attribute is absent).
	 *
	 * @return false only if the next page could not be created
	 */
	private boolean cutWithAttributeImage() {
		String facsUrl = "" // get image URL
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			// no early exit: if several attributes share the local name, the last one wins
			if (parser.getAttributeLocalName(i).equals(attribute)) {
				facsUrl = parser.getAttributeValue(i);
			}
		}
		if (debug) println "** TAG $tag $attribute : $facsUrl"

		return writePageBreak(facsUrl)
	}

	/**
	 * Starts a new page and writes the image, in the order selected by {@code cutBefore}.
	 *
	 * @param src the image path or URL of the page
	 * @return false if the next page could not be created
	 */
	private boolean writePageBreak(String src) {
		if (cutBefore) {
			if (debug) println " cut before"
			if (!createNextOutput()) {
				return false
			}
			if (debug) println " write img $src"
			writeImg(src)
			return true
		}

		if (debug) println " write img $src"
		writeImg(src)
		if (debug) println " cut after"
		return createNextOutput()
	}

	/**
	 * Releases the XML parser and the stream it reads. XMLStreamReader.close() does not
	 * close the underlying stream, so both are closed. Errors are printed, not thrown,
	 * so that they do not hide an exception raised while parsing.
	 */
	private void closeInput() {
		try {
			if (parser != null) {
				parser.close()
			}
		} catch (Exception e) {
			System.out.println("Error while closing the XML parser: "+e);
		}
		try {
			if (inputData != null) {
				inputData.close()
			}
		} catch (Exception e) {
			System.out.println("Error while closing the XML input stream: "+e);
		}
	}

	/**
	 * Writes the header of a text from the attributes of the current element:
	 * the "id" attribute as a h3 title, the other attributes as rows of a table.
	 */
	public void printStartPage() {
		LinkedHashMap attributes = new LinkedHashMap();
		for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
			attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
		}
		pagedWriter.write("\n");
		pagedWriter.writeStartElement("p")
		if (attributes.containsKey("id")) {
			pagedWriter.writeElement("h3", attributes["id"])
		}

		pagedWriter.writeStartElement("table");
		for (String k : attributes.keySet()) {
			if (k == "id") continue; // already written as the title
			pagedWriter.writeStartElement("tr");
			pagedWriter.writeElement("td", k);
			pagedWriter.writeElement("td", attributes[k]);
			pagedWriter.writeEndElement(); // tr
		}
		pagedWriter.writeEndElement() // table
		pagedWriter.writeEndElement() // p

		pagedWriter.writeCharacters("");
	}

	/**
	 * @return the produced pages, one [htmlFile, wordid] pair per page
	 */
	public def getPageFiles() {
		return pages
	}

	/**
	 * Developer smoke test using hard-coded local paths.
	 */
	public static void main(String[] args) {
		File txmFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")
		String txtname = "1532_RabelaisPrnstctn"
		String corpusname = "BVH"
		File newEditionDirectory = new File("/tmp/xtzpagertest/")
		newEditionDirectory.deleteDir()
		newEditionDirectory.mkdir()
		File imageDirectory = null;

		def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", "w", true);
		if (!ed.process()) {
			println "Fail to build edition for text: $txmFile"
		}
	}
}