Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZFacsPagerStep.groovy @ 2126

History | View | Annotate | Download (10.5 kB)

1 986 mdecorde
package org.txm.scripts.importer.xtz
2 321 mdecorde
3 986 mdecorde
import org.txm.scripts.importer.*
4 321 mdecorde
import org.xml.sax.Attributes
5 1000 mdecorde
import org.txm.importer.scripts.filters.*
6 321 mdecorde
import java.util.ArrayList
7 321 mdecorde
import javax.xml.parsers.*
8 321 mdecorde
import javax.xml.stream.*
9 321 mdecorde
import java.net.URL
10 321 mdecorde
import org.xml.sax.InputSource
11 321 mdecorde
import org.xml.sax.helpers.DefaultHandler
12 321 mdecorde
import java.io.FileFilter
13 1000 mdecorde
import org.txm.importer.xtz.*
14 321 mdecorde
15 321 mdecorde
/**
 * Splits one XML-TXM text into a sequence of HTML pages for a facsimile edition.
 *
 * A new HTML file named "${txtname}_${n}.html" is started in editionDir each time
 * the page-break element (the "tag" constructor argument) is met, and an image
 * (or a "no image" paragraph) is written for that page break.
 *
 * The image path comes either from the files found in imageDirectory/txtname
 * (see processWithImages) or from an attribute of the page-break element
 * (see processWithoutImages).
 *
 * The XML input stream is opened by the constructor and consumed by process();
 * an instance is therefore meant to be processed once.
 */
class XTZFacsPagerStep extends Step {

        private def url
        private def inputData
        private def factory
        private XMLStreamReader parser
        OutputStreamWriter writer
        StaxStackWriter pagedWriter = null

        File editionDir
        File imageDirectory // contains all the images sub-directories
        int iImages // index of the next image to use in imagesFiles
        File textImagesDirectory // contains the images for the txtname
        File xmlFile
        File htmlFile // HTML page currently being written

        def pages = [] // list of [htmlFile, id of the first word of the page]
        def imageAttributes = [:];

        def tag, attribute, txtname, corpusname
        boolean firstWord // true until a word element is met in the current page
        boolean cutBefore = true; // true: start the new page before writing the image
        String wtag;
        boolean debug = false;

        int n = 1; // number of the current page, used in the HTML file name
        String wordid = null; // id of the first word of the current page
        def imagesFiles; // image files of textImagesDirectory, sorted

        /**
         * Opens the XML file and stores the pager parameters.
         *
         * @param xmlFile the XML-TXM file to paginate
         * @param editionDir directory receiving the HTML pages; created if missing
         * @param imageDirectory directory containing one sub-directory of images per text, or null
         *        to read the image path from the page-break element
         * @param txtname text name, used for the HTML file names and the images sub-directory
         * @param corpusname corpus name, used in the page title, CSS name and image paths
         * @param tag local name of the page-break element
         * @param attribute local name of the attribute holding the image path
         * @param wtag local name of the word element
         * @param debug if true, progress messages are printed
         */
        public XTZFacsPagerStep(File xmlFile, File editionDir, File imageDirectory, String txtname, String corpusname, String tag, String attribute, String wtag, boolean debug) {
                inputData = xmlFile.toURI().toURL().openStream()
                factory = XMLInputFactory.newInstance()
                parser = factory.createXMLStreamReader(inputData)

                this.xmlFile = xmlFile
                this.editionDir = editionDir
                editionDir.mkdirs()
                this.imageDirectory = imageDirectory
                this.tag = tag
                this.attribute = attribute
                this.txtname = txtname
                this.corpusname= corpusname
                this.wtag= wtag;
                this.debug = debug

                imageAttributes = ["src":"",
                        "id":"txm_image",
                        "alt":"Le fac-similé de cette page n'est pas accessible",
                        //"style":"display:block;width:100%;",
                        "onLoad":"viewer.toolbarImages='images/icons';viewer.onload=viewer.toolbar;new viewer({image: this, frame: ['100%','100%']});"
                        ]

                if (imageDirectory != null) {
                        textImagesDirectory = new File(imageDirectory, txtname)
                }
                iImages = 0;
        }

        /**
         * Closes the current HTML page (if any), registers it in "pages" and starts the next one.
         *
         * The elements that were open in the previous page are reopened in the new one,
         * except the outermost ones up to and including the first "div", since the new
         * page writes its own html/head/body/div prologue.
         *
         * @return true if the new page was started, false if an exception occurred
         *         (the exception is printed, not rethrown)
         */
        private boolean createNextOutput() {
                try {
                        def tags = closeMultiWriter();

                        // drop every element until the first "div" is dropped too;
                        // if there is no "div", the whole list is emptied
                        while (!tags.isEmpty()) {
                                String removed = tags.remove(0)
                                if ("div" == removed) {
                                        break;
                                }
                        }

                        if (wordid != null) {
                                pages << [htmlFile, wordid] // store the previous page
                                n++
                        }

                        // Next page
                        htmlFile = new File(editionDir, "${txtname}_${n}.html")

                        firstWord = true
                        pagedWriter = new StaxStackWriter(htmlFile, "UTF-8");
                        if (debug) println "Create file $htmlFile"
                        pagedWriter.writeStartDocument("UTF-8", "1.0")
                        pagedWriter.writeStartElement("html");
                        pagedWriter.writeStartElement("head");
                        pagedWriter.writeStartElement("title")
                        pagedWriter.writeCharacters(corpusname+" Edition - Page "+n)
                        pagedWriter.writeEndElement(); // </title>
                        pagedWriter.writeStartElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"]);
                        pagedWriter.writeEndElement()
                        pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"txm.css"]);
                        pagedWriter.writeEndElement()
                        pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"${corpusname}.css"]);
                        pagedWriter.writeEndElement()
                        pagedWriter.writeStartElement("script", ["type":"text/javascript", "src":"js/viewer/Simple_Viewer_beta_1.1-min.js"]);
                        pagedWriter.writeEndElement()
                        pagedWriter.writeStartElement("script", ["type":"text/javascript", "src":"js/viewer/toolbar-ext.js"]);
                        pagedWriter.writeEndElement()
                        pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"js/viewer/toolbar-ext.css"]);
                        pagedWriter.writeEndElement()
                        pagedWriter.writeEndElement() // </head>
                        pagedWriter.writeStartElement("body") //<body>
                        pagedWriter.writeStartElement("div", ["id":"txmeditionpage"]);
                        pagedWriter.writeStartElements(tags);

                        wordid = "${wtag}_0"; // default value if no word is found
                        return true;
                } catch (Exception e) {
                        System.out.println("Error while creating HTML file: "+e);
                        return false;
                }
        }

        /**
         * Ends and closes the current HTML page.
         *
         * If no word was met in the page, an empty span with id "${wtag}_0" is written first,
         * matching the default wordid set by createNextOutput().
         *
         * @return a copy of the writer tag stack taken before the elements are ended,
         *         or an empty list if no page is open
         */
        private def closeMultiWriter()
        {
                if (pagedWriter == null) {
                        return [];
                }

                def tags = pagedWriter.getTagStack().clone();

                if (firstWord) { // there was no words
                        pagedWriter.writeCharacters("");
                        pagedWriter.write("<span id=\"${wtag}_0\"/>");
                }
                pagedWriter.writeEndElements();
                pagedWriter.close();
                return tags;
        }

        /**
         * Writes the facsimile image of the current page, or a "not available" paragraph
         * when no image path is given.
         *
         * @param src the image path or URL; may be null or empty
         */
        private writeImg(String src) {
                if (src == null || src.length() == 0) {
                        pagedWriter.writeStartElement("p", ["class":"no-img"]);
                        pagedWriter.writeCharacters("Cette page n'est pas disponible en mode fac-similé.");
                        pagedWriter.writeEndElement(); // </p>
                } else {
                        imageAttributes["src"] = src
                        pagedWriter.writeEmptyElement("img", imageAttributes);
                }
        }

        /**
         * Starts a new page and writes its image, in the order selected by cutBefore.
         *
         * @param imgPath the image path or URL passed to writeImg; may be null or empty
         */
        private void writePageBreak(String imgPath) {
                if (cutBefore) {
                        if (debug) println " cut before"
                        createNextOutput()
                        if (debug) println " write img $imgPath"
                        writeImg(imgPath)
                } else {
                        if (debug) println " write img $imgPath"
                        writeImg(imgPath)
                        if (debug) println " cut after"
                        createNextOutput()
                }
        }

        /**
         * Stores the id of the current word element if it is the first word of the page.
         */
        private void rememberFirstWord() {
                if (firstWord) {
                        wordid = parser.getAttributeValue(null, "id");
                        firstWord = false;
                }
        }

        /**
         * Reads an attribute of the current element by local name.
         *
         * @param name the attribute local name
         * @return the attribute value, or "" if the element has no such attribute;
         *         if several attributes share the local name, the last one wins
         */
        private String readAttribute(def name) {
                String value = ""
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        if (parser.getAttributeLocalName(i).equals(name)) {
                                value = parser.getAttributeValue(i);
                        }
                }
                return value
        }

        /**
         * Closes the XML parser and the underlying input stream.
         * The stream is closed even if closing the parser fails.
         */
        private void closeInput() {
                try {
                        if (parser != null) parser.close();
                } finally {
                        if (inputData != null) inputData.close();
                }
        }

        /**
         * Processes the XML-TXM file. If an image directory was given to the constructor,
         * the images are taken from its txtname sub-directory; otherwise the image path
         * is read from the page-break element attribute.
         *
         * @return the result of processWithImages() or processWithoutImages()
         */
        public boolean process() {
                if (textImagesDirectory != null) {
                        return processWithImages();
                } else {
                        return processWithoutImages();
                }
        }

        /**
         * Builds the pages using the files of textImagesDirectory, sorted, as page images:
         * the i-th page-break element gets the i-th file. When there are more page breaks
         * than images, an error is printed and no new page is started for that page break.
         *
         * @return false if the images directory does not exist or contains no file;
         *         otherwise true if more than one page was registered
         */
        public boolean processWithImages() {

                if (!textImagesDirectory.exists()) {
                        println ("No images directory found: "+textImagesDirectory)
                        return false;
                }

                // Scan textImagesDirectory for images
                imagesFiles = textImagesDirectory.listFiles(new FileFilter() {
                                        public boolean accept(File f) {
                                                return !f.isHidden() && !f.isDirectory()
                                        }
                                });
                if (imagesFiles == null || imagesFiles.size() == 0) {
                        println ("No facs images found in "+textImagesDirectory)
                        return false;
                }
                imagesFiles.sort(); // sort by name
                iImages = 0;

                createNextOutput(); // empty page

                // parse XML-TXM files
                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) {
                                        continue;
                                }

                                switch (parser.getLocalName()) {
                                        case "text":
                                                printStartPage();
                                                break;
                                        case wtag:
                                                rememberFirstWord();
                                                break;
                                        case tag:
                                                if (debug) println "** TAG $tag $attribute : "+parser.getAttributeValue(null, attribute);

                                                if (iImages >= imagesFiles.size() ) {
                                                        println " ERROR Cound not find the $iImages th image for file $xmlFile at location "+parser.getLocation().getLineNumber()
                                                } else {
                                                        String imgPath = "res/images/$corpusname/${editionDir.getName()}/$txtname/"+imagesFiles[iImages].getName()
                                                        writePageBreak(imgPath)
                                                }
                                                iImages++ // next image file
                                                break;
                                }
                        }
                        closeMultiWriter()
                } finally {
                        // the input is consumed once parsing has started: always release it
                        closeInput()
                }
                pages << [htmlFile, wordid] // add the last page (no pb encountered)

                return pages.size() > 1
        }

        /**
         * Builds the pages using the "attribute" value of each page-break element as image path.
         * A page break without that attribute gets the "not available" paragraph.
         *
         * @return true if more than one page was registered
         */
        public boolean processWithoutImages() {

                createNextOutput(); // empty page

                // parse XML-TXM files
                try {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) {
                                        continue;
                                }

                                switch (parser.getLocalName()) {
                                        case "text":
                                                printStartPage()
                                                break;
                                        case wtag:
                                                rememberFirstWord();
                                                break;
                                        case tag:
                                                String imgUrl = readAttribute(attribute) // get image URL
                                                if (debug) println "** TAG $tag $attribute : $imgUrl"
                                                writePageBreak(imgUrl)
                                                break;
                                }
                        }
                        closeMultiWriter()
                } finally {
                        // the input is consumed once parsing has started: always release it
                        closeInput()
                }
                pages << [htmlFile, wordid] // add the last page (no pb encountered)

                return pages.size() > 1
        }

        /**
         * Writes a summary of the current element attributes in the current page:
         * the "id" attribute, if any, as a h3 title, and the other attributes as
         * rows of a two-column table (name, value).
         */
        public void printStartPage() {
                LinkedHashMap attributes = new LinkedHashMap();
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
                }
                pagedWriter.write("\n");
                pagedWriter.writeStartElement("p")
                if (attributes.containsKey("id")) {
                        pagedWriter.writeElement("h3", attributes["id"])
                }

                pagedWriter.writeStartElement("table");
                for (String k : attributes.keySet()) {
                        if (k == "id") continue; // already written as title
                        pagedWriter.writeStartElement("tr");
                        pagedWriter.writeElement("td", k);
                        pagedWriter.writeElement("td", attributes[k]);
                        pagedWriter.writeEndElement();
                }
                pagedWriter.writeEndElement() // table
                pagedWriter.writeEndElement() // p

                pagedWriter.writeCharacters("");
        }

        /**
         * @return the registered pages, each one being a [htmlFile, first word id] list
         */
        public def getPageFiles() {
                return pages
        }

        /**
         * Manual test entry point using hard-coded developer paths.
         * The output directory /tmp/xtzpagertest/ is deleted and recreated.
         */
        public static void main(String[] args) {
                File txmFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")
                String txtname = "1532_RabelaisPrnstctn"
                String corpusname = "BVH"
                File newEditionDirectory = new File("/tmp/xtzpagertest/")
                newEditionDirectory.deleteDir()
                newEditionDirectory.mkdir()
                File imageDirectory = null;

                def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", "w", true);
                if (!ed.process()) {
                        println "Fail to build edition for text: $txmFile"
                }
        }
}