Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xtz / XTZPager.groovy @ 187

History | View | Annotate | Download (16.2 kB)

1
package org.txm.importer.xtz
2

    
3
import java.io.File;
4
import java.util.ArrayList;
5

    
6
import org.txm.objects.BaseParameters
7
import org.w3c.dom.Element
8
import org.txm.importer.*
9
import org.txm.stat.utils.ConsoleProgressBar
10
import org.txm.utils.BundleUtils;
11
import org.txm.utils.FileCopy;
12
import org.txm.utils.i18n.*
13

    
14
import javax.xml.stream.*
15

    
16
class XTZPager extends Pager {
17

    
18
        /** Import parameters, read from the import module by the constructor. */
        BaseParameters params

        /** Entry of params.corpora registered under the corpus name. */
        Element corpusElem
        /** Value of the "lang" attribute of the corpus element. */
        String lang
        /** Pagination element configured for the "default" edition. */
        String page_element
        /** Text content of the word element declared in the parameters. */
        String wordTag
        /** Words-per-page setting of the "default" edition. */
        int wordsPerPage

        /** "css" directory located in the import source directory (may not exist). */
        File cssDirectory
        /** "js" directory located in the import source directory (may not exist). */
        File jsDirectory
        /** "images" directory located in the import source directory (may not exist). */
        File imagesDirectory
27

    
28
        /**
         * Creates the pager of the "default" edition and caches the import settings
         * used by the edition steps.
         *
         * @param module the import module giving access to the parameters and to the source directory
         */
        public XTZPager(ImportModule module) {
                super(module, "default");

                params = module.getParameters()

                // corpus declaration and its language
                corpusElem = params.corpora.get(corpusname);
                lang = corpusElem.getAttribute("lang");

                // pagination settings of the "default" edition
                wordsPerPage = params.getWordsPerPage("default")
                page_element = params.getPageElement("default")
                wordTag = module.getParameters().getWordElement().getTextContent()

                // optional resource directories stored next to the sources
                File sourceDirectory = module.getSourceDirectory()
                cssDirectory = new File(sourceDirectory, "css")
                jsDirectory = new File(sourceDirectory, "js")
                imagesDirectory = new File(sourceDirectory, "images")
        }
43

    
44
        /**
         * Runs the pager: delegates to the parent implementation, then builds the "default"
         * edition, the "facs" edition and the XSL editions, in that order.
         * The method returns as soon as one step fails; isSuccessFul is only set to true
         * when the three steps succeed.
         *
         * @param files the files to paginate; when null, the sorted content of
         *        inputDirectory is used instead
         */
        public void process(ArrayList<File> files) {
                super.process(files);

                if (files == null) {
                        File[] listed = inputDirectory.listFiles();
                        if (listed != null) { // listFiles() yields null when the directory cannot be read
                                files = new ArrayList<File>(Arrays.asList(listed));
                                Collections.sort(files);
                                // The edition steps read the inherited 'files' property, not this
                                // parameter: publish the fallback list, otherwise it is never used.
                                // NOTE(review): assumes the parent process(null) does not already
                                // populate 'files' differently -- confirm against Pager.
                                this.files = files;
                        }
                }

                if (!doDefaultEditionStep()) return;
                if (!doFacsEditionStep()) return;
                if (!doPostEditionXSLStep()) return;

                isSuccessFul = true;
                println ""
        }
59

    
60
        /**
         * Builds the "default" HTML edition of every file of 'files' and declares the
         * produced pages in the import parameters, then copies the CSS, JS and image
         * resources into the edition output directory.
         *
         * A text is skipped when its first page ("{text}_1.html") already exists in
         * outputDirectory and is at least as recent as its source file.
         *
         * @return true in every case, including when the "default" edition is disabled
         */
        public boolean doDefaultEditionStep() {

                // nothing to build when the "default" edition is disabled
                if (!module.getParameters().getDoEdition("default")) {
                        return true;
                }

                println "-- Building 'default' edition of  ${files.size()} texts..."

                // default CSS inclusion, completed below with the CSS files found in the sources
                def css = ["css/txm.css", "css/${corpusname}.css"]

                // every visible "*.css" file must be declared in each HTML page
                if (cssDirectory.exists()) {
                        File[] candidates = cssDirectory.listFiles();
                        if (candidates != null) {
                                for (File candidate : candidates) {
                                        boolean declarable = candidate.isFile() && !candidate.isHidden() && candidate.getName().endsWith(".css")
                                        if (declarable) {
                                                css << "css/" + candidate.getName();
                                        }
                                }
                        }
                }

                ConsoleProgressBar progress = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        progress.tick()

                        // text name = file name without its last extension
                        String textname = txmFile.getName();
                        int dot = textname.lastIndexOf(".");
                        if (dot > 0) {
                                textname = textname.substring(0, dot);
                        }

                        // skip the text when its first page is up to date
                        File firstHTMLPageFile = new File(outputDirectory, textname + "_1.html");
                        boolean upToDate = firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()
                        if (upToDate) {
                                continue;
                        }

                        List<String> noSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                        List<String> noSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                        // make sure the text is declared in the parameters
                        Element texts = params.getTextsElement(corpusElem);
                        Element text = params.getTextElement(texts, textname);
                        if (text == null) { // just in case
                                text = params.addText(corpusElem, textname, txmFile)
                        }

                        // NOTE(review): no explicit process() call is made on this step (unlike the
                        // facs step) -- presumably its constructor does the pagination; confirm.
                        def ed = new XTZDefaultPagerStep(this, txmFile, textname, noSpaceBefore, noSpaceAfter, css);

                        // declare the edition, or empty the existing declaration
                        Element edition = params.getEditionElement(text, "default");
                        if (edition == null) {
                                edition = params.addEdition(text, "default", outputDirectory.getAbsolutePath(), "html");
                        } else {
                                def children = edition.getChildNodes();
                                while (children.getLength() > 0) {
                                        edition.removeChild(children.item(0))
                                }
                        }

                        // one page declaration per produced page file; pages are numbered from 1
                        int pageIndex = 0
                        while (pageIndex < ed.getPageFiles().size()) {
                                String wordid = ed.getIdx().get(pageIndex);
                                pageIndex++
                                params.addPage(edition, "" + pageIndex, wordid);
                        }
                }

                // copy the default TXM css file in the edition output directory
                File csshtmlDirectory = new File(outputDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.toolbox", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy the CSS/JS/images source files in the edition output directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, new File(outputDirectory, "js"))
                }
                if (imagesDirectory.exists()) {
                        FileCopy.copyFiles(imagesDirectory, new File(outputDirectory, "images"))
                }

                println ""
                return true;
        }
140

    
141
        /**
         * Builds the "facs" (facsimile) edition of every file of 'files' in the "facs"
         * sub-directory of htmlDirectory, declares its pages in the import parameters and
         * copies the viewer, CSS, JS and image resources next to the pages.
         *
         * A text is skipped when its first page ("{text}_1.html") already exists and is
         * at least as recent as its source file. A text whose pager step reports a failure
         * is skipped with a message; an exception aborts the whole step.
         *
         * @return true when the edition is disabled or was built, false when an exception
         *         occurred while processing a text
         */
        public boolean doFacsEditionStep() {

                boolean mustBuildFacsEdition = module.getParameters().getDoFacsEdition();
                if (!mustBuildFacsEdition) return true;

                String imageDirectoryPath = module.getParameters().getFacsEditionImageDirectory().trim();
                boolean remoteImages = imageDirectoryPath.startsWith("http");
                File imageDirectory = new File(imageDirectoryPath);
                // A local image directory is only usable when the path is set and designates an
                // existing directory (isDirectory() is false for a missing path). The previous test
                // chained these conditions with '&&' and therefore only rejected the empty path.
                if (!remoteImages && (imageDirectoryPath.length() == 0 || !imageDirectory.isDirectory())) {
                        imageDirectory = null;
                }

                println "-- Building 'facs' edition of ${files.size()} texts..."
                File newEditionDirectory = new File(htmlDirectory, "facs");
                newEditionDirectory.mkdir();

                // declare the "facs" edition
                Element editionDefinitionElement = params.getEditionDefinitionElement(corpusElem, "facs");
                if (editionDefinitionElement == null) {
                        editionDefinitionElement = params.addEditionDefinition(corpusElem, "facs", "groovy", "XTZPager.groovy")
                }
                Element textsElement = params.getTextsElement(corpusElem)

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        cpb.tick()

                        // text name = file name without its last extension
                        String txtname = txmFile.getName();
                        int dot = txtname.lastIndexOf(".");
                        if (dot > 0) txtname = txtname.substring(0, dot);

                        // skip the text when its first page is up to date
                        File firstHTMLPageFile = new File(newEditionDirectory, txtname+"_1.html");
                        if (firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()) continue;

                        // the text should already exist
                        Element text = params.getTextElement(textsElement, txtname)
                        if (text == null) { // just in case
                                text = params.addText(corpusElem, txtname, txmFile)
                        }

                        // declare the edition of the text, or empty the existing declaration
                        Element edition = params.getEditionElement(text, "facs");
                        if (edition == null) {
                                edition = params.addEdition(text, "facs", newEditionDirectory.getAbsolutePath(), "html");
                        } else {
                                def children = edition.getChildNodes();
                                while (children.getLength() > 0 ) {
                                        edition.removeChild(children.item(0))
                                }
                        }

                        try {
                                def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug);
                                if (!ed.process()) {
                                        println "Fail to build edition for text: $txmFile"
                                        continue;
                                }

                                // each entry of getPageFiles() is indexed as [page file, first word id];
                                // pages are numbered from 1
                                def pages = ed.getPageFiles()
                                for (int p = 0 ; p < pages.size() ; p++) {
                                        String wordid = pages[p][1]
                                        params.addPage(edition, ""+(p + 1), wordid);
                                }
                        } catch (Exception e) {
                                println "Error while processing $txmFile text: "+e
                                e.printStackTrace();
                                return false;
                        }
                }

                if (!remoteImages && imageDirectory != null) { // copy files only if local
                        File editionImagesDirectory = new File(newEditionDirectory, "res/images/"+corpusname+"/facs");
                        editionImagesDirectory.mkdirs();
                        FileCopy.copyFiles(imageDirectory, editionImagesDirectory);
                }

                // copy SimpleViewer files in the "facs" edition directory
                File jshtmlDirectory = new File(newEditionDirectory, "js")
                jshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.toolbox", "res", "org/txm/js", "viewer", jshtmlDirectory);
                BundleUtils.copyFiles("org.txm.toolbox", "res", "org/txm/", "images", newEditionDirectory);

                // copy default TXM css file in the "facs" edition directory
                File csshtmlDirectory = new File(newEditionDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.toolbox", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "facs" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                // both editions are now listed in the "default" attribute of the editions element
                Element editionsElement = params.getEditionsElement(corpusElem);
                editionsElement.setAttribute("default","default,facs");

                println ""
                return true;
        }
249

    
250
        /**
         * Builds the XSL editions described by the stylesheets of the "xsl/4-edition"
         * source directory, then declares the created editions and their pages in the
         * import parameters.
         *
         * Stylesheets are processed in sorted order and must follow the
         * '{Number}-{editionName}-{step}.xsl' name pattern, Number being one digit from 1 to 9.
         * The first stylesheet met for an edition recreates the edition directory and is
         * applied to inputDirectory; the next ones are applied to the files already present
         * in the edition directory.
         *
         * @return false when a stylesheet could not be applied ('reason' is set), true
         *         otherwise, including when there is no "xsl/4-edition" directory
         */
        public boolean doPostEditionXSLStep() {

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/4-edition")
                if (!xslDirectory.exists()) return true;

                // prepare XSL parameters
                def xslParams = module.getParameters().getXsltParams(corpusElem);
                String s = module.getParameters().getWordsPerPage("default");
                if (s != null && s.length() > 0) {
                        xslParams["number-words-per-page"] = Integer.parseInt(s);
                }
                // shared XSL parameters
                xslParams["pagination-element"] = module.getParameters().getPageElement("default")
                xslParams["import-xml-path"] = module.getParameters().paramFile.toURI().toString()

                File[] xslFiles = xslDirectory.listFiles()
                if (xslFiles == null) return true; // listFiles() yields null when the path is not a readable directory
                xslFiles.sort()

                def editionsCreated = [:] // edition name -> first stylesheet met for this edition
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
                        if (!xslFile.getName().matches("[1-9]-.+")) continue;

                        String xslName = xslFile.getName().substring(2); // remove the "1-", "2-", etc.
                        int idx2 = xslName.indexOf(".")
                        if (idx2 > 0) {
                                xslName = xslName.substring(0, idx2)
                        } else {
                                println "$xslFile is not a '.xsl' file"
                                continue;
                        }
                        int idx3 = xslName.indexOf("-")
                        if (idx3 < 0) {
                                println "$xslFile file does not follow the '{Number}-{editionName}-{step}.xsl' name pattern"
                                continue;
                        }
                        // xslName was cut at its first '.', so neither part can contain a '.'
                        String pagerStep = xslName.substring(idx3 + 1);
                        String editionName = xslName.substring(0, idx3);
                        println "-- Building '$editionName' XSL edition with step '$pagerStep'..."

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        xslParams["output-directory"] = newEditionDirectory.toURI().toString()

                        if (editionsCreated[editionName] == null) { // first XSL, replace an edition
                                editionsCreated[editionName] = xslFile
                                //TODO: optimisation if update is enable
                                newEditionDirectory.deleteDir(); // delete previous edition if any
                                newEditionDirectory.mkdir()

                                boolean deleteOutputFiles = "pager" == pagerStep;
                                if (ApplyXsl2.processImportSources(xslFile, inputDirectory, newEditionDirectory, xslParams, deleteOutputFiles)) {
                                        println ""
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }

                                // copy CSS/JS/images source files in the new edition directory
                                if (cssDirectory.exists()) {
                                        File csshtmlDirectory = new File(newEditionDirectory, "css")
                                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                                }
                                if (jsDirectory.exists()) {
                                        File jshtmlDirectory = new File(newEditionDirectory, "js")
                                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                                }
                                if (imagesDirectory.exists()) {
                                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                                }
                        } else { // N+1 XSL working with HTML files
                                // NOTE(review): this listing also contains the css/js/images
                                // sub-directories copied above -- confirm ApplyXsl2 ignores directories.
                                def htmlFiles = newEditionDirectory.listFiles()
                                htmlFiles.sort()

                                if (ApplyXsl2.processImportSources(xslFile, htmlFiles, xslParams)) {
                                        if ("pager".equals(pagerStep)) {
                                                // delete the one page HTML files only if the XSL step is "pager"
                                                for (File f : htmlFiles) f.delete();
                                        }
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }
                        }
                }

                // UPDATE import.xml: for each XML-TXM file, we must retrieve the first word ID from the XSL output files
                println "-- Fetching page word IDs..."
                File[] txmFiles = inputDirectory.listFiles()
                if (txmFiles == null) txmFiles = new File[0] // unreadable input directory: no text to declare

                ConsoleProgressBar cpb = new ConsoleProgressBar(editionsCreated.keySet().size())
                for (String editionName : editionsCreated.keySet()) {
                        cpb.tick()
                        File xslFile = editionsCreated[editionName]

                        // ensure edition definition is declared
                        // NOTE(review): the lookup is given the editions element here, whereas
                        // doFacsEditionStep gives the corpus element to the same method -- confirm.
                        def editionsElement = params.getEditionsElement(corpusElem)
                        def e = params.getEditionDefinitionElement(editionsElement, editionName);
                        if (e == null) {
                                params.addEditionDefinition(corpusElem, editionName, "xsl", xslFile.toString());
                        }

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        for (File txmFile : txmFiles) {
                                if (txmFile.isDirectory()) continue;

                                // text name = file name without its last extension, as in the other edition steps
                                String textName = txmFile.getName()
                                int idx4 = textName.lastIndexOf(".")
                                if (idx4 > 0) textName = textName.substring(0, idx4);

                                getFirstWordIDs(textName, editionName, newEditionDirectory, xslFile);
                        }
                }
                println ""
                return true;
        }
373

    
374
        /**
         * Declares, in the import parameters, the pages of one text for one XSL edition.
         * The pages are the files of the edition directory named "{textName}_{page}.html";
         * the first word id of each page is read with getMetaContent().
         *
         * @param textName name of the text (file name without extension)
         * @param editionName name of the XSL edition
         * @param newEditionDirectory directory containing the HTML pages of the edition
         * @param xslFile stylesheet recorded as the "script" of the edition
         */
        private void getFirstWordIDs(String textName, String editionName, File newEditionDirectory, File xslFile) {
                Element textsElement = params.getTextsElement(corpusElem)
                Element textElement = params.getTextElement(textsElement, textName)
                if (textElement == null) { // just in case
                        textElement = params.addText(corpusElem, textName, new File(inputDirectory, textName+".xml"))
                }

                Element editionElement = params.getEditionElement(textElement, editionName);
                if (editionElement == null) { // the edition does not exist, declare it
                        editionElement = params.addEdition(textElement, editionName, newEditionDirectory.getAbsolutePath(), "html");
                } else { // the edition already exists, remove pages from the existing edition
                        def children = editionElement.getChildNodes();
                        while (children.getLength() > 0 ) {
                                editionElement.removeChild(children.item(0))
                        }
                }
                editionElement.setAttribute("mode", "xsl");
                editionElement.setAttribute("script", xslFile.getName());

                // Keep only the page files of this text BEFORE sorting: the directory also holds
                // the pages of the other texts and possibly unrelated files, whose names must
                // not be parsed as page numbers.
                String pagePrefix = textName + "_"
                List<File> pageFiles = []
                File[] editionFiles = newEditionDirectory.listFiles()
                if (editionFiles != null) {
                        for (File candidate : editionFiles) {
                                String name = candidate.getName()
                                if (candidate.isFile() && name.startsWith(pagePrefix) && name.indexOf(".html", pagePrefix.length()) >= 0) {
                                        pageFiles << candidate
                                }
                        }
                }

                // page number = digits between the last '_' and the last '.' of the file name;
                // names without a readable number are sorted after the numbered pages
                def pageNumberOf = { File page ->
                        String name = page.getName()
                        int from = name.lastIndexOf("_") + 1
                        int to = name.lastIndexOf(".")
                        if (to < from) return Integer.MAX_VALUE
                        try {
                                return Integer.parseInt(name.substring(from, to))
                        } catch (NumberFormatException ignored) {
                                return Integer.MAX_VALUE
                        }
                }
                // '<=>' instead of a subtraction: no overflow on extreme values
                pageFiles.sort { File f1, File f2 -> pageNumberOf(f1) <=> pageNumberOf(f2) }

                for (File f : pageFiles) {
                        String pagename = f.getName();
                        String firstWordID = getMetaContent(f);
                        // page name = what lies between "{textName}_" and ".html"
                        pagename = pagename.substring(pagePrefix.length(), pagename.indexOf(".html", pagePrefix.length()))
                        params.addPage(editionElement, pagename, firstWordID)
                }
        }
415

    
416
        /**
         * Reads the first word id stored in an HTML page, i.e. the "content" attribute of
         * the first <meta> element whose "name" attribute is "txm:first-word-id".
         * The file is read with a StAX parser (DTD support disabled) and the search stops
         * at the first <body> element.
         *
         * @param f the HTML page to read; it must be well-formed XML
         * @return the content of the meta element, or "" when the meta element is not found
         *         before <body> / the end of the document, or has no "content" attribute
         */
        public static String getMetaContent(File f) {
                String META = "meta"
                String BODY = "body"
                String NAME = "name"
                String DESCRIPTION = "txm:first-word-id"
                String CONTENT = "content"

                def factory = XMLInputFactory.newInstance();
                factory.setProperty("javax.xml.stream.supportDTD", false); // ignore the DTD declared in doctype

                def inputData = f.toURI().toURL().openStream();
                def parser = null
                try {
                        parser = factory.createXMLStreamReader(inputData);
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue;

                                String localName = parser.getLocalName()
                                if (META.equals(localName)) {
                                        // the values are local to this <meta>: nothing leaks from a previous one
                                        String desc = "";
                                        String content = "";
                                        for (int i = 0 ; i < parser.getAttributeCount(); i++) { // scan attributes
                                                if (NAME.equals(parser.getAttributeLocalName(i))) { // found @name
                                                        desc = parser.getAttributeValue(i)
                                                } else if (CONTENT.equals(parser.getAttributeLocalName(i))) { // found @content
                                                        content = parser.getAttributeValue(i)
                                                }
                                        }
                                        if (DESCRIPTION.equals(desc)) { // stop now
                                                return content;
                                        }
                                } else if (BODY.equals(localName)) { // no need to go further, the meta was not found
                                        break;
                                }
                        }
                        return "";
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream: close both
                        try {
                                if (parser != null) parser.close()
                        } finally {
                                inputData.close()
                        }
                }
        }
457

    
458
        /**
         * Developer smoke test: prints the first word id read from a hard-coded HTML page.
         *
         * @param args unused
         */
        public static void main(def args) {
                File editionDirectory = new File("/home/mdecorde/TXM/corpora/QGRAALXTZ/HTML/QGRAALXTZ/default")
                File page = new File(editionDirectory, "qgraal_cm_test201510_page_160_2.html")
                String firstWordId = getMetaContent(page)
                println "RESULT: "+firstWordId
        }
461
}