root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZFacsPagerStep.groovy @ 2126
History | View | Annotate | Download (10.5 kB)
1 | 986 | mdecorde | package org.txm.scripts.importer.xtz
|
---|---|---|---|
2 | 321 | mdecorde | |
3 | 986 | mdecorde | import org.txm.scripts.importer.* |
4 | 321 | mdecorde | import org.xml.sax.Attributes |
5 | 1000 | mdecorde | import org.txm.importer.scripts.filters.* |
6 | 321 | mdecorde | import java.util.ArrayList |
7 | 321 | mdecorde | import javax.xml.parsers.* |
8 | 321 | mdecorde | import javax.xml.stream.* |
9 | 321 | mdecorde | import java.net.URL |
10 | 321 | mdecorde | import org.xml.sax.InputSource |
11 | 321 | mdecorde | import org.xml.sax.helpers.DefaultHandler |
12 | 321 | mdecorde | import java.io.FileFilter |
13 | 1000 | mdecorde | import org.txm.importer.xtz.* |
14 | 321 | mdecorde | |
/**
 * Import step that cuts one XML-TXM file into a sequence of HTML "facsimile"
 * edition pages.
 *
 * The XML file is read with a StAX parser. Each time an element named
 * {@link #tag} is met, a new HTML page is started and an image is written in
 * it. The image is either taken from the text's images directory
 * ({@link #processWithImages()}) or from the {@link #attribute} attribute of
 * the element ({@link #processWithoutImages()}).
 *
 * The generated pages are recorded in {@link #pages} as
 * {@code [htmlFile, wordid]} pairs, where {@code wordid} is the {@code id}
 * attribute of the first {@link #wtag} element met while the page was open,
 * or {@code "${wtag}_0"} when no such element was met.
 */
class XTZFacsPagerStep extends Step {

    private def url
    /** Stream opened on the XML file by the constructor; closed at the end of parsing. */
    private def inputData
    private def factory
    /** StAX reader created over {@link #inputData}. */
    private XMLStreamReader parser
    OutputStreamWriter writer
    /** Writer of the HTML page currently being built (null before the first page). */
    StaxStackWriter pagedWriter = null

    File editionDir
    File imageDirectory // contains all the images sub-directories
    /** Index in {@link #imagesFiles} of the image to use for the next page break. */
    int iImages
    File textImagesDirectory // contains the images for the txtname
    File xmlFile
    /** HTML file of the page currently being built. */
    File htmlFile

    /** List of [htmlFile, wordid] pairs, one per generated page. */
    def pages = []
    /** Attributes of the generated img element; "src" is set by writeImg(). */
    def imageAttributes = [:]

    def tag, attribute, txtname, corpusname
    /** True until the first {@link #wtag} element of the current page has been met. */
    boolean firstWord
    /** If true the page is cut before writing the image, otherwise after. */
    boolean cutBefore = true
    String wtag
    boolean debug = false

    /** Number of the current page; used in the HTML file name and title. */
    int n = 1
    /** Id recorded for the current page (see the class documentation). */
    String wordid = null
    /** Files of {@link #textImagesDirectory}, sorted; set by processWithImages(). */
    def imagesFiles

    /**
     * Opens the XML file and prepares the step.
     *
     * @param xmlFile the XML-TXM file to read
     * @param editionDir directory receiving the HTML pages (created if needed)
     * @param imageDirectory directory containing one images sub-directory per text, or null
     * @param txtname the text name, used for the HTML file names and the images sub-directory
     * @param corpusname the corpus name, used in the page title, CSS name and image paths
     * @param tag local name of the element that triggers a page break
     * @param attribute name of the attribute of {@code tag} read as the image path
     * @param wtag local name of the word element
     * @param debug if true, trace messages are printed
     */
    public XTZFacsPagerStep(File xmlFile, File editionDir, File imageDirectory, String txtname, String corpusname, String tag, String attribute, String wtag, boolean debug) {
        inputData = xmlFile.toURI().toURL().openStream()
        factory = XMLInputFactory.newInstance()
        parser = factory.createXMLStreamReader(inputData)

        this.xmlFile = xmlFile
        this.editionDir = editionDir
        editionDir.mkdirs()
        this.imageDirectory = imageDirectory
        this.tag = tag
        this.attribute = attribute
        this.txtname = txtname
        this.corpusname = corpusname
        this.wtag = wtag
        this.debug = debug

        imageAttributes = ["src":"",
            "id":"txm_image",
            "alt":"Le fac-similé de cette page n'est pas accessible",
            //"style":"display:block;width:100%;",
            "onLoad":"viewer.toolbarImages='images/icons';viewer.onload=viewer.toolbar;new viewer({image: this, frame: ['100%','100%']});"
        ]

        if (imageDirectory != null) {
            textImagesDirectory = new File(imageDirectory, txtname)
        }
        iImages = 0
    }

    /**
     * Closes the current page (if any) and starts the next HTML page.
     *
     * The tags that were open in the previous page are re-opened in the new
     * one, except the leading ones up to and including the first "div": those
     * are written again by this method as part of the page skeleton.
     *
     * @return true if the new page was created, false if an exception occurred
     */
    private boolean createNextOutput() {
        try {
            def tags = closeMultiWriter()

            // drop the head of the stack until the first "div" has been removed
            // (everything is dropped when the stack contains no "div")
            while (tags.size() > 0) {
                String head = tags[0]
                tags.remove(0)
                if ("div" == head) {
                    break
                }
            }

            if (wordid != null) { // a page was open: store it before moving on
                pages << [htmlFile, wordid]
                n++
            }

            // next page
            htmlFile = new File(editionDir, "${txtname}_${n}.html")

            firstWord = true
            pagedWriter = new StaxStackWriter(htmlFile, "UTF-8")
            if (debug) println "Create file $htmlFile"
            pagedWriter.writeStartDocument("UTF-8", "1.0")
            pagedWriter.writeStartElement("html")
            pagedWriter.writeStartElement("head")
            pagedWriter.writeStartElement("title")
            pagedWriter.writeCharacters(corpusname+" Edition - Page "+n)
            pagedWriter.writeEndElement() // </title>
            pagedWriter.writeStartElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"])
            pagedWriter.writeEndElement()
            pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"txm.css"])
            pagedWriter.writeEndElement()
            pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"${corpusname}.css"])
            pagedWriter.writeEndElement()
            pagedWriter.writeStartElement("script", ["type":"text/javascript", "src":"js/viewer/Simple_Viewer_beta_1.1-min.js"])
            pagedWriter.writeEndElement()
            pagedWriter.writeStartElement("script", ["type":"text/javascript", "src":"js/viewer/toolbar-ext.js"])
            pagedWriter.writeEndElement()
            pagedWriter.writeStartElement("link", ["rel":"stylesheet", "media":"all", "type":"text/css","href":"js/viewer/toolbar-ext.css"])
            pagedWriter.writeEndElement()
            pagedWriter.writeEndElement() // </head>
            pagedWriter.writeStartElement("body") // <body>
            pagedWriter.writeStartElement("div", ["id":"txmeditionpage"])
            pagedWriter.writeStartElements(tags)

            wordid = "${wtag}_0" // default value if no word is found
            return true
        } catch (Exception e) {
            System.out.println("Error while creating HTML file: "+e)
            return false
        }
    }

    /**
     * Ends and closes the current page, if there is one.
     *
     * When no word was met in the page, an empty span carrying the default
     * word id is written first.
     *
     * @return a copy of the writer's tag stack taken before the page is ended,
     *         or an empty list if no page was open
     */
    private def closeMultiWriter() {
        if (pagedWriter != null) {
            def tags = pagedWriter.getTagStack().clone()

            if (firstWord) { // there were no words
                pagedWriter.writeCharacters("")
                pagedWriter.write("<span id=\"${wtag}_0\"/>")
            }
            pagedWriter.writeEndElements()
            pagedWriter.close()
            return tags
        } else {
            return []
        }
    }

    /**
     * Writes the facsimile image in the current page, or a "no-img" paragraph
     * when no image source is available.
     *
     * @param src the image path or URL; may be null or empty
     */
    private writeImg(String src) {
        if (src == null || src.length() == 0) {
            pagedWriter.writeStartElement("p", ["class":"no-img"])
            pagedWriter.writeCharacters("Cette page n'est pas disponible en mode fac-similé.")
            pagedWriter.writeEndElement() // </p>
        } else {
            imageAttributes["src"] = src
            pagedWriter.writeEmptyElement("img", imageAttributes)
        }
    }

    /**
     * Starts a new page and writes its image, in the order selected by
     * {@link #cutBefore}.
     *
     * @param imgSrc the image source given to writeImg()
     * @return false if the next page could not be created
     */
    private boolean writePageBreak(String imgSrc) {
        if (cutBefore) {
            if (debug) println " cut before"
            if (!createNextOutput()) return false
            if (debug) println " write img $imgSrc"
            writeImg(imgSrc)
        } else {
            if (debug) println " write img $imgSrc"
            writeImg(imgSrc)
            if (debug) println " cut after"
            if (!createNextOutput()) return false
        }
        return true
    }

    /**
     * Records the id of the element the parser is positioned on as the word id
     * of the current page, if no word has been met yet in this page.
     */
    private void noteFirstWord() {
        if (firstWord) {
            wordid = parser.getAttributeValue(null, "id")
            firstWord = false
        }
    }

    /**
     * Closes the StAX reader and the stream opened by the constructor.
     * Both are closed explicitly, as the original code already did at the end
     * of processWithImages().
     */
    private void closeInput() {
        try {
            if (parser != null) parser.close()
        } finally {
            if (inputData != null) inputData.close()
        }
    }

    /**
     * Process the XML-TXM file. If an image directory was given to the
     * constructor, the images are fetched from it; otherwise the image path is
     * read from the page break element's attribute.
     *
     * @return the result of processWithImages() or processWithoutImages()
     */
    public boolean process() {
        if (textImagesDirectory != null) {
            return processWithImages()
        } else {
            return processWithoutImages()
        }
    }

    /**
     * Builds the pages using the files of {@link #textImagesDirectory}: the
     * i-th page break element met in the XML file gets the i-th file of the
     * sorted directory listing.
     *
     * @return false if the images directory is missing or contains no usable
     *         file, or if a page could not be created; otherwise true when
     *         more than one page has been recorded
     */
    public boolean processWithImages() {

        if (!textImagesDirectory.exists()) {
            println ("No images directory found: "+textImagesDirectory)
            return false
        }

        // scan textImagesDirectory for images: visible regular entries only
        imagesFiles = textImagesDirectory.listFiles(new FileFilter() {
            public boolean accept(File f) {
                return !f.isHidden() && !f.isDirectory()
            }
        })
        if (imagesFiles == null || imagesFiles.size() == 0) {
            println ("No facs images found in "+textImagesDirectory)
            return false
        }
        imagesFiles.sort() // File natural order, i.e. by name within the directory
        iImages = 0

        // parse the XML-TXM file
        try {
            if (!createNextOutput()) return false // first page, empty for now

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                if (event != XMLStreamConstants.START_ELEMENT) continue

                String localname = parser.getLocalName()
                switch (localname) {
                    case "text":
                        printStartPage()
                        break
                    case wtag:
                        noteFirstWord()
                        break
                    case tag:
                        if (debug) println "** TAG $tag $attribute : "+parser.getAttributeValue(null, attribute)

                        if (iImages >= imagesFiles.size()) {
                            // more page breaks than images: no page is cut here
                            println " ERROR Cound not find the $iImages th image for file $xmlFile at location "+parser.getLocation().getLineNumber()
                        } else {
                            String imgPath = "res/images/$corpusname/${editionDir.getName()}/$txtname/"+imagesFiles[iImages].getName()
                            if (!writePageBreak(imgPath)) return false
                        }
                        iImages++ // next image file
                        break
                }
            }
            closeMultiWriter()
        } finally {
            closeInput()
        }
        pages << [htmlFile, wordid] // add the last page (no pb encountered)

        return pages.size() > 1
    }

    /**
     * Builds the pages using the value of the {@link #attribute} attribute of
     * each page break element as the image source.
     *
     * @return false if a page could not be created; otherwise true when more
     *         than one page has been recorded
     */
    public boolean processWithoutImages() {

        // parse the XML-TXM file
        try {
            if (!createNextOutput()) return false // first page, empty for now

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                if (event != XMLStreamConstants.START_ELEMENT) continue

                String localname = parser.getLocalName()
                switch (localname) {
                    case "text":
                        printStartPage()
                        break

                    case wtag:
                        noteFirstWord()
                        break

                    case tag:
                        // get the image URL; stays empty when the attribute is absent
                        // (if several attributes share the local name, the last one wins)
                        String imgUrl = ""
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                            if (parser.getAttributeLocalName(i).equals(attribute)) {
                                imgUrl = parser.getAttributeValue(i)
                            }
                        }
                        if (debug) println "** TAG $tag $attribute : $imgUrl"

                        if (!writePageBreak(imgUrl)) return false
                        break
                }
            }
            closeMultiWriter()
        } finally {
            closeInput()
        }
        pages << [htmlFile, wordid] // add the last page (no pb encountered)

        return pages.size() > 1
    }

    /**
     * Writes, in the current page, a summary of the attributes of the element
     * the parser is positioned on: the "id" attribute (if present) as an h3
     * title, and every other attribute as a name/value table row.
     */
    public void printStartPage() {
        LinkedHashMap attributes = new LinkedHashMap()
        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
            attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i).toString()
        }
        pagedWriter.write("\n")
        pagedWriter.writeStartElement("p")
        if (attributes.containsKey("id")) {
            pagedWriter.writeElement("h3", attributes["id"])
        }

        pagedWriter.writeStartElement("table")
        for (String k : attributes.keySet()) {
            if (k == "id") continue // already written as the title
            pagedWriter.writeStartElement("tr")
            pagedWriter.writeElement("td", k)
            pagedWriter.writeElement("td", attributes[k])
            pagedWriter.writeEndElement()
        }
        pagedWriter.writeEndElement() // table
        pagedWriter.writeEndElement() // p

        pagedWriter.writeCharacters("")
    }

    /**
     * @return the list of [htmlFile, wordid] pairs recorded so far
     */
    public def getPageFiles() {
        return pages
    }

    /**
     * Manual test entry point: builds an edition without images directory from
     * a hard-coded XML-TXM file into /tmp/xtzpagertest/.
     */
    public static void main(String[] args) {
        File txmFile = new File("/home/mdecorde/TXM/corpora/BVHEPISTEMON2016/txm/BVHEPISTEMON2016/1546_RabelaisTL.xml")
        String txtname = "1532_RabelaisPrnstctn"
        String corpusname = "BVH"
        File newEditionDirectory = new File("/tmp/xtzpagertest/")
        newEditionDirectory.deleteDir()
        newEditionDirectory.mkdir()
        File imageDirectory = null

        def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", "w", true)
        if (!ed.process()) {
            println "Fail to build edition for text: $txmFile"
        }
    }
}