Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xtz / XTZPager.groovy @ 966

History | View | Annotate | Download (16.4 kB)

1 321 mdecorde
package org.txm.importer.xtz
2 321 mdecorde
3 321 mdecorde
import java.io.File;
4 321 mdecorde
import java.util.ArrayList;
5 321 mdecorde
6 321 mdecorde
import org.txm.objects.BaseParameters
7 321 mdecorde
import org.w3c.dom.Element
8 321 mdecorde
import org.txm.importer.*
9 321 mdecorde
import org.txm.stat.utils.ConsoleProgressBar
10 321 mdecorde
import org.txm.utils.BundleUtils;
11 479 mdecorde
import org.txm.utils.io.FileCopy;
12 321 mdecorde
import org.txm.utils.i18n.*
13 321 mdecorde
14 321 mdecorde
import javax.xml.stream.*
15 321 mdecorde
16 321 mdecorde
class XTZPager extends Pager {
17 321 mdecorde
18 321 mdecorde
        BaseParameters params;
19 321 mdecorde
20 321 mdecorde
        Element corpusElem;
21 321 mdecorde
        String lang;
22 321 mdecorde
        String page_element;
23 321 mdecorde
        String wordTag;
24 321 mdecorde
        int wordsPerPage;
25 321 mdecorde
26 321 mdecorde
        File cssDirectory, jsDirectory, imagesDirectory;
27 321 mdecorde
28 321 mdecorde
        /**
         * Creates the pager and caches the import parameters read by the edition steps.
         *
         * "default" is passed to the Pager constructor and is also the key used to
         * read the words-per-page and page-element settings.
         *
         * @param module the import module providing the import parameters and the source directory
         */
        public XTZPager(ImportModule module) {
                super(module, "default");

                // resource directories looked up under the source directory; they may not exist
                this.cssDirectory = new File(module.getSourceDirectory(), "css")
                this.jsDirectory = new File(module.getSourceDirectory(), "js")
                this.imagesDirectory = new File(module.getSourceDirectory(), "images")

                this.params = module.getParameters()

                // corpus declaration and the settings read from it
                this.corpusElem = this.params.corpora.get(corpusname);
                this.lang = this.corpusElem.getAttribute("lang");
                this.wordsPerPage = this.params.getWordsPerPage("default")
                this.page_element = this.params.getPageElement("default")
                this.wordTag = module.getParameters().getWordElement().getTextContent()
        }
43 321 mdecorde
44 321 mdecorde
        /**
         * Runs the three edition steps in order: "default" edition, "facs" edition,
         * then the post-edition XSL step. The chain stops at the first step that
         * returns false; <code>isSuccessFul</code> is set only when all three succeed.
         *
         * @param files the files to process, or null to list <code>inputDirectory</code>
         */
        public void process(ArrayList<File> files) {
                super.process(files);

                // NOTE(review): this fallback only reassigns the local parameter, after
                // super.process() has already run; the step methods read a 'files' that is
                // not this parameter - confirm the parent class handles a null list.
                if (files == null) {
                        files = inputDirectory.listFiles();
                        if (files != null) {
                                Collections.sort(files);
                        }
                }

                // '&&' short-circuits, so a failing step prevents the following ones from running
                boolean everyStepSucceeded = doDefaultEditionStep() && doFacsEditionStep() && doPostEditionXSLStep()
                if (everyStepSucceeded) {
                        isSuccessFul = true;
                        println ""
                }
        }
59 321 mdecorde
60 321 mdecorde
        /**
         * Builds the "default" HTML edition of each file of <code>files</code> and
         * declares its pages in the import parameters.
         *
         * A text is skipped when its first page (<code>{textname}_1.html</code>) exists
         * in <code>outputDirectory</code> and is not older than the source file.
         * After the texts are processed, the TXM stylesheet and the css/js/images
         * directories of the sources (when present) are copied next to the pages.
         *
         * @return true when the edition is disabled or once the step has run; the
         *         visible code has no path returning false, errors surface as exceptions
         */
        public boolean doDefaultEditionStep() {

                if (!module.getParameters().getDoEdition("default")) {
                        return true;
                }

                println "-- Building 'default' edition of  ${files.size()} texts..."

                def css = ["css/txm.css", "css/${corpusname}.css"] // default CSS inclusion

                // scan existing css files that must be declared in each HTML page
                if (cssDirectory.exists()) {
                        def cssFiles = cssDirectory.listFiles();
                        if (cssFiles != null) {
                                for (File cssFile : cssFiles) {
                                        if (cssFile.isFile() && !cssFile.isHidden() && cssFile.getName().endsWith(".css")) {
                                                css << "css/"+cssFile.getName();
                                        }
                                }
                        }
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        cpb.tick()

                        // text name = file name without its last extension
                        String textname = txmFile.getName();
                        int dot = textname.lastIndexOf(".");
                        if (dot > 0) textname = textname.substring(0, dot);

                        // up-to-date check: skip texts whose first page is at least as recent as the source
                        File firstHTMLPageFile = new File(outputDirectory, textname+"_1.html");
                        if (firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()) continue;

                        List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                        List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                        Element texts = params.getTextsElement(corpusElem);
                        Element text = params.getTextElement(texts, textname);
                        if (text == null) { // just in case
                                text = params.addText(corpusElem, textname, txmFile)
                        }

                        // NOTE(review): no process() call follows, unlike XTZFacsPagerStep -
                        // presumably the constructor builds the pages; confirm.
                        def ed = new XTZDefaultPagerStep(this, txmFile, textname, NoSpaceBefore, NoSpaceAfter, css);

                        Element edition = params.getEditionElement(text, "default");
                        if (edition == null) {
                                edition = params.addEdition(text, "default", outputDirectory.getAbsolutePath(), "html");
                        } else { // remove existing pages if any
                                def children = edition.getChildNodes();
                                while (children.getLength() > 0 ) {
                                        edition.removeChild(children.item(0))
                                }
                        }

                        // declare the pages: 1-based page name, paired with the entry of getIdx() at the same position
                        for (int p = 0 ; p < ed.getPageFiles().size() ; p++) {
                                String wordid = ed.getIdx().get(p);
                                params.addPage(edition, ""+(p + 1), wordid);
                        }
                }

                // copy default TXM css file in the "default" edition directory
                File csshtmlDirectory = new File(outputDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "default" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        File jshtmlDirectory = new File(outputDirectory, "js")
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(outputDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                println ""
                return true;
        }
140 321 mdecorde
141 321 mdecorde
        /**
         * Builds the "facs" (facsimile) edition of each file of <code>files</code> in
         * the "facs" sub-directory of <code>htmlDirectory</code> and declares its pages
         * in the import parameters.
         *
         * The image directory parameter is either a value starting with "http", which
         * is kept as is, or a local directory. A local value that is empty or is not an
         * existing directory is discarded (null is then given to the pager step and
         * no image is copied).
         *
         * @return true when the facs edition is disabled or was built; false when an
         *         exception was raised while processing a text (<code>reason</code> is set)
         */
        public boolean doFacsEditionStep() {

                if (!module.getParameters().getDoFacsEdition()) return true;

                // a missing parameter is handled like an empty one
                String imageDirectoryPath = module.getParameters().getFacsEditionImageDirectory();
                imageDirectoryPath = (imageDirectoryPath == null) ? "" : imageDirectoryPath.trim();
                boolean remoteImages = imageDirectoryPath.startsWith("http");

                File imageDirectory = new File(imageDirectoryPath);
                // a local path is usable only if it is non-empty and points to an existing
                // directory (isDirectory() is false for a path that does not exist)
                if (!remoteImages && (imageDirectoryPath.length() == 0 || !imageDirectory.isDirectory())) {
                        imageDirectory = null;
                }

                println "-- Building 'facs' edition of ${files.size()} texts..."
                File newEditionDirectory = new File(htmlDirectory, "facs");
                newEditionDirectory.mkdir();

                // declare the "facs" edition
                Element editionDefinitionElement = params.getEditionDefinitionElement(corpusElem, "facs");
                if (editionDefinitionElement == null) {
                        editionDefinitionElement = params.addEditionDefinition(corpusElem, "facs", "groovy", "XTZPager.groovy")
                }
                Element textsElement = params.getTextsElement(corpusElem)

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        cpb.tick()

                        // text name = file name without its last extension
                        String txtname = txmFile.getName();
                        int dot = txtname.lastIndexOf(".");
                        if (dot > 0) txtname = txtname.substring(0, dot);

                        // up-to-date check: skip texts whose first page is at least as recent as the source
                        File firstHTMLPageFile = new File(newEditionDirectory, txtname+"_1.html");
                        if (firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()) continue;

                        // the text should already exists
                        Element text = params.getTextElement(textsElement, txtname)
                        if (text == null) { // just in case
                                text = params.addText(corpusElem, txtname, txmFile)
                        }
                        Element edition = params.getEditionElement(text, "facs");
                        if (edition == null) {
                                edition = params.addEdition(text, "facs", newEditionDirectory.getAbsolutePath(), "html");
                        } else { // remove existing pages if any
                                def children = edition.getChildNodes();
                                while (children.getLength() > 0 ) {
                                        edition.removeChild(children.item(0))
                                }
                        }

                        try {
                                def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug);
                                if (!ed.process()) {
                                        println "Fail to build edition for text: $txmFile"
                                        continue;
                                }

                                // each entry of getPageFiles() is indexed here as [page file, word id]
                                def pages = ed.getPageFiles()
                                for (int p = 0 ; p < pages.size() ; p++) {
                                        String wordid = pages[p][1]
                                        //TODO replace '""+(p + 1)' with something that fetch/findout the page 'name'
                                        // TODO or move the Edition and Page corpus declaration in the XTZDefaultPagerStep
                                        params.addPage(edition, ""+(p + 1), wordid);
                                }
                        } catch (Exception e) {
                                println "Error while processing $txmFile text: "+e
                                e.printStackTrace();
                                // keep the failure cause available, as the XSL step does
                                reason = "Error while processing $txmFile text: "+e
                                return false;
                        }
                }

                if (!remoteImages && imageDirectory != null) { // copy files only if local
                        File editionImagesDirectory = new File(newEditionDirectory, "res/images/"+corpusname+"/facs");
                        editionImagesDirectory.mkdirs();
                        FileCopy.copyFiles(imageDirectory, editionImagesDirectory);
                }

                // copy SimpleViewer files in the "facs" edition directory
                File jshtmlDirectory = new File(newEditionDirectory, "js")
                jshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/js", "viewer", jshtmlDirectory);
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/", "images", newEditionDirectory);

                // copy default TXM css file in the "facs" edition directory
                File csshtmlDirectory = new File(newEditionDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "facs" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                // set the "default" attribute of the editions element to both editions
                Element editionsElement = params.getEditionsElement(corpusElem);
                editionsElement.setAttribute("default","default,facs");

                println ""
                return true;
        }
252 321 mdecorde
253 321 mdecorde
        /**
         * Applies the stylesheets found in the "xsl/4-edition" directory of the
         * sources to build extra editions, then declares the resulting pages.
         *
         * Only non-hidden files named <code>{digit}-{editionName}-{step}.xsl</code>
         * are used, in sorted order. The first stylesheet of an edition is applied to
         * the files of <code>inputDirectory</code> and (re)creates the edition directory
         * under <code>htmlDirectory</code>; the following ones are applied to the files
         * already present in that edition directory. Finally, for each edition created
         * and each file of <code>inputDirectory</code>, the pages are declared with the
         * word id read from each page.
         *
         * @return true when there is no "xsl/4-edition" directory or when all the
         *         stylesheets were applied; false when one of them failed
         *         (<code>reason</code> is set)
         */
        public boolean doPostEditionXSLStep() {

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/4-edition")
                if (!xslDirectory.exists()) return true;

                // prepare XSL parameters
                def xslParams = module.getParameters().getXsltParams(corpusElem);

                // shared XSL parameters
                String s = module.getParameters().getWordsPerPage("default");
                if (s != null && s.length() > 0) {
                        xslParams["number-words-per-page"] = Integer.parseInt(s);
                }
                xslParams["pagination-element"] = module.getParameters().getPageElement("default")
                xslParams["import-xml-path"] = module.getParameters().paramFile.toURI().toString()
                //println "XSL PARAMS: "+xslParams

                // listFiles() returns null when the path is not a directory or cannot be read
                File[] xslFiles = xslDirectory.listFiles() ?: new File[0]
                Arrays.sort(xslFiles)

                def editionsCreated = [:] // edition name -> first stylesheet applied for this edition
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
                        if (!xslFile.getName().matches("[1-9]-.+")) continue;

                        String xslName = xslFile.getName().substring(2); // remove the "1-", "2-", etc.
                        int idx2 = xslName.indexOf(".")
                        if (idx2 > 0) {
                                xslName = xslName.substring(0, idx2) // cut at the first dot
                        } else {
                                println "$xslFile is not a '.xsl' file"
                                continue;
                        }
                        int idx3 = xslName.indexOf("-")
                        if (idx3 < 0) {
                                println "$xslFile file does not follow the '{Number}-{editionName}-{step}.xsl' name pattern"
                                continue;
                        }
                        String pagerStep = xslName.substring(idx3 + 1);
                        // xslName was cut at its first dot, so the edition name contains no dot
                        String editionName = xslName.substring(0, idx3);
                        println "-- Building '$editionName' XSL edition with step '$pagerStep'..."

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        xslParams["output-directory"] = newEditionDirectory.toURI().toString()

                        if (editionsCreated[editionName] == null) { // first XSL, replace an edition
                                editionsCreated[editionName] = xslFile
                                //if (!importModule.isUpdatingCorpus()) {
                                //TODO: optimisation if update is enable,
                                newEditionDirectory.deleteDir(); // delete previous edition if any
                                //}
                                newEditionDirectory.mkdir()

                                boolean deleteOutputFiles = "pager" == pagerStep;
                                if (ApplyXsl2.processImportSources(xslFile, inputDirectory, newEditionDirectory, xslParams, deleteOutputFiles)) {
                                        println ""
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }

                                // copy CSS/JS/Images sources files in the new edition directory
                                if (cssDirectory.exists()) {
                                        File csshtmlDirectory = new File(newEditionDirectory, "css")
                                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                                }
                                if (jsDirectory.exists()) {
                                        File jshtmlDirectory = new File(newEditionDirectory, "js")
                                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                                }
                                if (imagesDirectory.exists()) {
                                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                                }
                        } else { // N+1 XSL working with HTML files
                                // NOTE(review): listFiles() also returns the css/js/images
                                // sub-directories copied by the first stylesheet of this edition -
                                // confirm ApplyXsl2 ignores directories.
                                def htmlFiles = newEditionDirectory.listFiles()
                                htmlFiles.sort()

                                if (ApplyXsl2.processImportSources(xslFile, htmlFiles, xslParams)) {
                                        if ("pager".equals(pagerStep)) {
                                                // delete the one page HTML files only if the XSL step is "pager"
                                                for (File f : htmlFiles) f.delete();
                                        }
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }
                        }
                }

                // UPDATE import.xml: for each XML-TXM file, we must retrieve the first word ID from the XSL output files
                println "-- Fetching page word IDs..."
                File[] txmFiles = inputDirectory.listFiles() ?: new File[0]
                ConsoleProgressBar cpb = new ConsoleProgressBar(editionsCreated.keySet().size())
                for (String editionName : editionsCreated.keySet()) {
                        cpb.tick()

                        // ensure edition definition is declared
                        // NOTE(review): the lookup is given the editions element here but the
                        // corpus element in doFacsEditionStep - confirm which one is expected.
                        def editionsElement = params.getEditionsElement(corpusElem)
                        def e = params.getEditionDefinitionElement(editionsElement, editionName);
                        if (e == null) {
                                params.addEditionDefinition(corpusElem, editionName, "xsl", editionsCreated[editionName].toString());
                        }

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        File xslFile = editionsCreated[editionName]
                        for (File txmFile : txmFiles) {
                                if (txmFile.isDirectory()) continue;

                                // text name = file name without its last extension, as in the
                                // "default" and "facs" steps
                                String textName = txmFile.getName()
                                int idx4 = textName.lastIndexOf(".")
                                if (idx4 > 0) textName = textName.substring(0, idx4);

                                getFirstWordIDs(textName, editionName, newEditionDirectory, xslFile);
                        }
                }
                println ""
                return true;
        }
376 321 mdecorde
377 321 mdecorde
        /**
         * Declares, in the import parameters, the pages of one text for one XSL edition.
         *
         * The text and edition elements are created when missing; the pages of an
         * existing edition are removed first. The pages are the regular files of
         * <code>newEditionDirectory</code> named <code>{textName}_{page}.html</code>,
         * taken in increasing order of the number found after the last "_". Each page
         * is declared with the word id returned by <code>getMetaContent</code>.
         *
         * Files of the directory that are not pages of this text are ignored; a page
         * whose suffix is not a number is sorted after the numbered ones.
         *
         * @param textName the text name (source file name without extension)
         * @param editionName the edition name
         * @param newEditionDirectory the directory containing the HTML pages of the edition
         * @param xslFile the stylesheet recorded in the "script" attribute of the edition
         */
        private void getFirstWordIDs(String textName, String editionName, File newEditionDirectory, File xslFile) {
                Element textsElement = params.getTextsElement(corpusElem)
                Element textElement = params.getTextElement(textsElement, textName)
                if (textElement == null) { // just in case
                        textElement = params.addText(corpusElem, textName, new File(inputDirectory, textName+".xml"))
                }

                Element editionElement = params.getEditionElement(textElement, editionName);
                if (editionElement == null) { // the edition does not exist, declare it
                        editionElement = params.addEdition(textElement, editionName, newEditionDirectory.getAbsolutePath(), "html");
                } else { // the edition already exists, remove pages from the existing edition
                        def children = editionElement.getChildNodes();
                        while (children.getLength() > 0 ) {
                                editionElement.removeChild(children.item(0))
                        }
                }
                editionElement.setAttribute("mode", "xsl");
                editionElement.setAttribute("script", xslFile.getName());

                String prefix = textName+"_"
                String suffix = ".html"

                // keep only the pages of this text BEFORE sorting: the directory also holds
                // the pages of the other texts and possibly unrelated files, whose names
                // must not reach the page number parsing below
                // NOTE(review): when a text name is a prefix of another one ("a" and "a_b"),
                // the pages of "a_b" also match the "a_" prefix.
                def pages = []
                File[] candidates = newEditionDirectory.listFiles() ?: new File[0]
                for (File candidate : candidates) {
                        String name = candidate.getName()
                        if (candidate.isFile() && name.startsWith(prefix) && name.endsWith(suffix)
                                        && name.length() > prefix.length() + suffix.length()) {
                                pages << candidate
                        }
                }

                // page number = digits between the last "_" and the ".html" suffix
                def pageNumber = { File page ->
                        String name = page.getName()
                        String digits = name.substring(name.lastIndexOf("_") + 1, name.length() - suffix.length())
                        return digits.isInteger() ? digits.toInteger() : Integer.MAX_VALUE
                }
                pages.sort() { File f1, File f2 ->
                        int byNumber = Integer.compare(pageNumber(f1), pageNumber(f2))
                        return (byNumber != 0) ? byNumber : f1.getName().compareTo(f2.getName())
                }

                for (File page : pages) {
                        String firstWordID = getMetaContent(page);
                        String pagename = page.getName()
                        pagename = pagename.substring(prefix.length(), pagename.length() - suffix.length())
                        params.addPage(editionElement, pagename, firstWordID)
                }
        }
418 321 mdecorde
419 321 mdecorde
        /**
         * Reads the first word id declared in an HTML page, that is the "content"
         * attribute of the first <code>&lt;meta name="txm:first-word-id"&gt;</code>
         * element found before the <code>&lt;body&gt;</code> element.
         *
         * The file is read with a StAX parser, so it must be well-formed XML. The
         * parser and the file stream are closed in all cases.
         *
         * @param f the HTML page to read
         * @return the word id, or "" when the meta element is not found before
         *         <code>&lt;body&gt;</code> or has no "content" attribute
         */
        public static String getMetaContent(File f) {
                String META = "meta"
                String BODY = "body"
                String NAME = "name"
                String FIRST_WORD_ID = "txm:first-word-id"
                String CONTENT = "content"

                def factory = XMLInputFactory.newInstance();
                factory.setProperty("javax.xml.stream.supportDTD", false); // ignore the DTD declared in doctype
                factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); // never fetch external entities

                InputStream inputData = new FileInputStream(f)
                def parser = null
                try {
                        parser = factory.createXMLStreamReader(inputData);
                        while (parser.hasNext()) {
                                if (parser.next() != XMLStreamConstants.START_ELEMENT) continue;

                                String localName = parser.getLocalName()
                                if (BODY.equals(localName)) {
                                        // no need to go further, meta@name="txm:first-word-id" not found :(
                                        break;
                                }
                                if (!META.equals(localName)) continue;

                                // the values are local to this element, so that the content of a
                                // previous meta element is never returned by mistake
                                String name = null
                                String content = ""
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) { // scan attributes
                                        String attribute = parser.getAttributeLocalName(i)
                                        if (NAME.equals(attribute)) { // found @name
                                                name = parser.getAttributeValue(i)
                                        } else if (CONTENT.equals(attribute)) { // found @content
                                                content = parser.getAttributeValue(i)
                                        }
                                }
                                if (FIRST_WORD_ID.equals(name)) { // stop now
                                        return content;
                                }
                        }
                        return "";
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream
                        try {
                                if (parser != null) parser.close()
                        } finally {
                                inputData.close()
                        }
                }
        }
460 321 mdecorde
461 321 mdecorde
        /**
         * Manual check of <code>getMetaContent</code>: prints the first word id of an HTML page.
         *
         * @param args optional; when a first argument is given it is used as the path of
         *        the page to read, otherwise the original developer test page is used
         */
        public static void main(def args) {
                File page
                if (args != null && args.size() > 0) {
                        page = new File(args[0].toString())
                } else {
                        page = new File("/home/mdecorde/TXM/corpora/QGRAALXTZ/HTML/QGRAALXTZ/default", "qgraal_cm_test201510_page_160_2.html")
                }
                println "RESULT: "+getMetaContent(page)
        }
464 321 mdecorde
}