Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZFacsPagerStep.groovy @ 1000

History | View | Annotate | Download (10.4 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import org.txm.scripts.importer.*
4
import org.xml.sax.Attributes
5
import org.txm.importer.scripts.filters.*
6
import java.util.ArrayList
7
import javax.xml.parsers.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import org.xml.sax.InputSource
11
import org.xml.sax.helpers.DefaultHandler
12
import java.io.FileFilter
13
import org.txm.importer.xtz.*
14

    
15
class XTZFacsPagerStep extends Step {
16

    
17
        // Not assigned or read by any code visible in this class
        private def url
        // Byte stream opened on xmlFile by the constructor; source of the StAX parser
        private def inputData
        // StAX input factory used to build the parser
        private def factory
        // Pull parser reading the XML-TXM file
        private XMLStreamReader parser
        // Not assigned or read by any code visible in this class
        OutputStreamWriter writer
        // Writer of the HTML page currently being built; null until the first page is created
        StaxStackWriter pagedWriter = null

        // Directory receiving the generated HTML pages
        File editionDir
        // Contains all the images sub-directories
        File imageDirectory
        // Index, in imagesFiles, of the image to use for the next page break
        int iImages
        // Contains the images of the text named txtname
        File textImagesDirectory
        // XML-TXM file being paginated
        File xmlFile
        // HTML file of the page currently being written
        File htmlFile

        // One [htmlFile, wordid] pair per page
        def pages = []
        // Attributes of the <img> element; "src" is overwritten for each image written
        def imageAttributes = [:]

        // tag: local name of the page-break element; attribute: its attribute giving the image path;
        // txtname and corpusname are used to build file names, titles and image paths
        def tag, attribute, txtname, corpusname
        // True until a word element has been seen in the current page
        boolean firstWord
        // When true the page is cut before the image is written, otherwise after
        boolean cutBefore = true
        // Local name of the word elements
        String wtag
        // When true, progress messages are printed
        boolean debug = false
39

    
40
        /**
         * Opens the XML-TXM file with a StAX parser and stores the pagination settings.
         *
         * @param xmlFile the XML-TXM file to paginate
         * @param editionDir directory receiving the HTML pages; created if it does not exist
         * @param imageDirectory parent directory of the per-text image directories; may be null,
         *        in which case textImagesDirectory stays unset
         * @param txtname text name, used to name the HTML files and to locate the text images
         * @param corpusname corpus name, used in page titles, CSS file name and image paths
         * @param tag local name of the element marking a page break
         * @param attribute attribute of the page-break element giving the image path
         * @param wtag local name of the word elements
         * @param debug if true, progress messages are printed
         */
        public XTZFacsPagerStep(File xmlFile, File editionDir, File imageDirectory, String txtname, String corpusname, String tag, String attribute, String wtag, boolean debug) {
                inputData = xmlFile.toURI().toURL().openStream()
                try {
                        factory = XMLInputFactory.newInstance()
                        parser = factory.createXMLStreamReader(inputData)
                } catch (Exception e) {
                        // Do not leak the stream when the parser cannot be created
                        try {
                                inputData.close()
                        } catch (IOException ignored) {
                                // the original failure is the one worth reporting
                        }
                        throw e
                }

                this.xmlFile = xmlFile
                this.editionDir = editionDir
                editionDir.mkdirs()
                this.imageDirectory = imageDirectory
                this.tag = tag
                this.attribute = attribute
                this.txtname = txtname
                this.corpusname = corpusname
                this.wtag = wtag
                this.debug = debug

                // Attributes of every <img> written; "src" is filled in by writeImg
                imageAttributes = ["src":"",
                        "id":"txm_image",
                        "alt":"Le fac-similé de cette page n'est pas accessible",
                        //"style":"display:block;width:100%;",
                        "onLoad":"viewer.toolbarImages='images/icons';viewer.onload=viewer.toolbar;new viewer({image: this, frame: ['100%','100%']});"
                        ]

                // The images of this text are looked up in a sub-directory named after the text
                if (imageDirectory != null) {
                        textImagesDirectory = new File(imageDirectory, txtname)
                }
                iImages = 0
        }
67

    
68
        int n = 1;
69
        /**
         * Closes the current HTML page, if any, and opens the next one.
         * The previous page is recorded in pages, the HTML header is written, and the
         * elements that were still open in the previous page are re-opened in the new one.
         *
         * @return true if the new page was opened, false if an exception occurred (it is printed)
         */
        private boolean createNextOutput() {
                try {
                        // Elements that were still open when the previous page was closed
                        def openTags = closeMultiWriter()

                        // Drop everything up to and including the first "div":
                        // an html/head/body/div skeleton is written again below
                        while (openTags.size() > 0) {
                                String head = openTags[0]
                                openTags.remove(0)
                                if ("div" == head) {
                                        break
                                }
                        }

                        if (wordid != null) {
                                pages << [htmlFile, wordid] // store the previous page
                                n++
                        }

                        // Next page
                        htmlFile = new File(editionDir, "${txtname}_${n}.html")
                        firstWord = true
                        pagedWriter = new StaxStackWriter(htmlFile, "UTF-8")
                        if (debug) println "Create file $htmlFile"

                        pagedWriter.writeStartDocument("UTF-8", "1.0")
                        pagedWriter.writeStartElement("html")
                        pagedWriter.writeStartElement("head")
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(corpusname+" Edition - Page "+n)
                        pagedWriter.writeEndElement() // </title>

                        // Elements of <head> that have attributes but no content, in output order
                        def headElements = [
                                ["meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]],
                                ["link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"txm.css"]],
                                ["link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"${corpusname}.css"]],
                                ["script", ["type":"text/javascript", "src":"js/viewer/Simple_Viewer_beta_1.1-min.js"]],
                                ["script", ["type":"text/javascript", "src":"js/viewer/toolbar-ext.js"]],
                                ["link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"js/viewer/toolbar-ext.css"]]
                        ]
                        for (def element : headElements) {
                                pagedWriter.writeStartElement(element[0], element[1])
                                pagedWriter.writeEndElement()
                        }
                        pagedWriter.writeEndElement() // </head>

                        pagedWriter.writeStartElement("body") // <body>
                        pagedWriter.writeStartElement("div", ["id":"txmeditionpage"])
                        pagedWriter.writeStartElements(openTags)

                        wordid = "${wtag}_0" // default value if no word is found
                        return true
                } catch (Exception e) {
                        System.out.println("Error while creating HTML file: "+e)
                        return false
                }
        }
125

    
126
        /**
         * Closes the page currently being written.
         *
         * @return a copy of the writer's tag stack taken before closing, or an empty list
         *         when no page has been opened yet
         */
        private def closeMultiWriter() {
                if (pagedWriter == null) {
                        return []
                }

                // Copy taken before writeEndElements() is called
                def openTags = pagedWriter.getTagStack().clone()

                if (firstWord) { // no word was seen in this page: write a placeholder anchor
                        pagedWriter.writeCharacters("")
                        pagedWriter.write("<span id=\"${wtag}_0\"/>")
                }
                pagedWriter.writeEndElements()
                pagedWriter.close()
                return openTags
        }
142

    
143
        /**
         * Writes the facsimile image of the current page, or a notice paragraph
         * when no image source is available.
         *
         * @param src path or URL of the image; null or empty when the page has no image
         */
        private writeImg(String src) {
                if (src) {
                        // The attribute map is shared between calls; only "src" changes
                        imageAttributes.put("src", src)
                        pagedWriter.writeEmptyElement("img", imageAttributes)
                } else {
                        pagedWriter.writeStartElement("p", ["class":"no-img"])
                        pagedWriter.writeCharacters("Cette page n'est pas disponible en mode fac-similé.")
                        pagedWriter.writeEndElement() // </p>
                }
        }
156

    
157
        /**
         * Processes the XML-TXM file. If an image directory was given to the constructor,
         * it is used to fetch the images; otherwise the attribute of the page-break element
         * (e.g. pb@facs) contains the image path.
         *
         * The StAX parser and the input stream opened by the constructor are released
         * when this method returns, whether it succeeds or fails.
         *
         * @return the result of processWithImages() or processWithoutImages()
         */
        public boolean process() {
                try {
                        if (textImagesDirectory != null) {
                                return processWithImages()
                        }
                        return processWithoutImages()
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream, so both are closed
                        try {
                                parser?.close()
                        } catch (Exception ignored) {
                                // nothing useful can be done if closing fails
                        }
                        try {
                                inputData?.close()
                        } catch (Exception ignored) {
                                // nothing useful can be done if closing fails
                        }
                }
        }
167
        
168
        String wordid = null;
169
        def imagesFiles;
170
        /**
         * Paginates the XML-TXM file using the image files found in textImagesDirectory.
         * The images are sorted and assigned, in that order, to the successive page-break elements.
         *
         * @return false if the images directory is missing or contains no file;
         *         otherwise true when more than one page was recorded in pages
         */
        public boolean processWithImages() {

                if (!textImagesDirectory.exists()) {
                        println ("No images directory found: "+textImagesDirectory)
                        return false
                }

                // Scan textImagesDirectory for images: every visible regular file is kept
                imagesFiles = textImagesDirectory.listFiles({ File f -> !f.isHidden() && !f.isDirectory() } as FileFilter)
                if (imagesFiles == null || imagesFiles.size() == 0) {
                        println ("No facs images found in "+textImagesDirectory)
                        return false
                }
                imagesFiles.sort() // sort by name
                iImages = 0

                // parse XML-TXM files
                String localname

                createNextOutput() // empty page

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        if (event != XMLStreamConstants.START_ELEMENT) {
                                continue // only start tags matter for pagination
                        }
                        localname = parser.getLocalName()
                        switch (localname) {
                                case "text":
                                        printStartPage()
                                        break
                                case wtag:
                                        // remember the id of the first word of the current page
                                        if (firstWord) {
                                                wordid = parser.getAttributeValue(null, "id")
                                                firstWord = false
                                        }
                                        break
                                case tag:
                                        if (debug) println "** TAG $tag $attribute : "+parser.getAttributeValue(null, attribute)

                                        if (iImages >= imagesFiles.size()) {
                                                // More page breaks than images: no page is cut for this one
                                                println " ERROR Cound not find the $iImages th image for file $xmlFile at location "+parser.getLocation().getLineNumber()
                                        } else {
                                                String imgPath = "res/images/$corpusname/${editionDir.getName()}/$txtname/"+imagesFiles[iImages].getName()
                                                if (cutBefore) {
                                                        if (debug) println " cut before"
                                                        createNextOutput()
                                                        if (debug) println " write img $imgPath"
                                                        writeImg(imgPath)
                                                } else {
                                                        if (debug) println " write img $imgPath"
                                                        writeImg(imgPath)
                                                        if (debug) println " cut after"
                                                        createNextOutput()
                                                }
                                        }
                                        iImages++ // next image file
                                        break
                        }
                }
                closeMultiWriter()
                pages << [htmlFile, wordid] // add the last page (no page break encountered after it)

                return pages.size() > 1
        }
249

    
250
        /**
         * Paginates the XML-TXM file, reading the image path of each page from the
         * configured attribute of the page-break element.
         *
         * @return true when more than one page was recorded in pages
         */
        public boolean processWithoutImages() {

                // parse XML-TXM files
                String localname

                createNextOutput() // empty page

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        if (event != XMLStreamConstants.START_ELEMENT) {
                                continue // only start tags matter for pagination
                        }
                        localname = parser.getLocalName()
                        switch (localname) {
                                case "text":
                                        printStartPage()
                                        break

                                case wtag:
                                        // remember the id of the first word of the current page
                                        if (firstWord) {
                                                wordid = parser.getAttributeValue(null, "id")
                                                firstWord = false
                                        }
                                        break

                                case tag:
                                        // Image URL: value of the attribute whose local name matches;
                                        // stays empty when the element has no such attribute
                                        String imgUrl = ""
                                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                                if (parser.getAttributeLocalName(i).equals(attribute)) {
                                                        imgUrl = parser.getAttributeValue(i)
                                                }
                                        }
                                        if (debug) println "** TAG $tag $attribute : $imgUrl"

                                        if (cutBefore) {
                                                if (debug) println " cut before"
                                                createNextOutput()
                                                if (debug) println " write img $imgUrl"
                                                writeImg(imgUrl)
                                        } else {
                                                if (debug) println " write img $imgUrl"
                                                writeImg(imgUrl)
                                                if (debug) println " cut after"
                                                createNextOutput()
                                        }
                                        break
                        }
                }
                closeMultiWriter()
                pages << [htmlFile, wordid] // add the last page (no page break encountered after it)

                return pages.size() > 1
        }
308

    
309
        /**
         * Writes the attributes of the element the parser is currently on into the page:
         * the "id" attribute, if present, as an h3 heading, then every other attribute
         * as a two-cell table row (name, value).
         */
        public void printStartPage() {
                // Snapshot of the attributes, in document order
                def metadata = new LinkedHashMap()
                parser.getAttributeCount().times { idx ->
                        metadata.put(parser.getAttributeLocalName(idx), parser.getAttributeValue(idx).toString())
                }

                pagedWriter.write("\n")
                pagedWriter.writeStartElement("p")
                if (metadata.containsKey("id")) {
                        pagedWriter.writeElement("h3", metadata.get("id"))
                }

                pagedWriter.writeStartElement("table")
                metadata.each { name, value ->
                        if (name == "id") {
                                return // already written as the heading
                        }
                        pagedWriter.writeStartElement("tr")
                        pagedWriter.writeElement("td", name)
                        pagedWriter.writeElement("td", value)
                        pagedWriter.writeEndElement() // tr
                }
                pagedWriter.writeEndElement() // table
                pagedWriter.writeEndElement() // p

                pagedWriter.writeCharacters("")
        }
333
        
334
        /**
         * @return the recorded pages, one [htmlFile, wordid] pair per page
         */
        public def getPageFiles() {
                pages
        }
337
        
338
        /**
         * Manual test entry point: paginates a hard-coded XML-TXM file into /tmp/xtzpagertest/,
         * without an image directory. The command-line arguments are not used.
         */
        public static void main(String[] args) {
                def source = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")

                // Start from an empty output directory
                def outputDir = new File("/tmp/xtzpagertest/")
                outputDir.deleteDir()
                outputDir.mkdir()

                // No image directory: image paths come from the pb@facs attribute
                File noImageDirectory = null

                def step = new XTZFacsPagerStep(source, outputDir, noImageDirectory, "1532_RabelaisPrnstctn", "BVH", "pb", "facs", "w", true)
                boolean built = step.process()
                if (!built) {
                        println "Fail to build edition for text: $source"
                }
        }
352
}