Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZPager.groovy @ 2288

History | View | Annotate | Download (16.3 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import java.io.File;
4
import java.util.ArrayList;
5

    
6
import org.txm.objects.BaseParameters
7
import org.w3c.dom.Element
8

    
9
import org.txm.scripts.importer.*
10
import org.txm.utils.ConsoleProgressBar
11
import org.txm.utils.BundleUtils;
12
import org.txm.utils.io.FileCopy;
13
import org.txm.utils.i18n.*
14
import org.txm.importer.xtz.*
15
import javax.xml.stream.*
16

    
17
import org.txm.objects.*
18
import org.txm.importer.ApplyXsl2
19
import org.txm.utils.logger.Log
20

    
21
class XTZPager extends Pager {
22

    
23
        /** TXM project being imported; assigned in the constructor from the import module. */
        Project project;

        /** Corpus DOM element; not assigned by the code of this class. */
        Element corpusElem;

        /** Project language, passed to LangFormater when building the 'default' edition. */
        String lang;

        /** Page element name read from the "default" edition definition. */
        String page_element;

        /** Word element name read from the project tokenizer settings. */
        String wordTag;

        /** Number of words per page read from the "default" edition definition. */
        int wordsPerPage;

        /** "css" sub-directory of the import source directory. */
        File cssDirectory;

        /** "js" sub-directory of the import source directory. */
        File jsDirectory;

        /** "images" sub-directory of the import source directory. */
        File imagesDirectory;
32

    
33
        /**
         * Creates the pager of the "default" output and reads the settings used by the
         * edition steps from the module's project.
         *
         * @param module the import module giving access to the project and to the source directory
         */
        public XTZPager(ImportModule module) {
                super(module, "default");

                this.project = module.getProject()
                this.lang = this.project.getLang();

                // pagination settings are stored in the "default" edition definition
                def defaultEditionDefinition = this.project.getEditionDefinition("default")
                this.wordsPerPage = defaultEditionDefinition.getWordsPerPage()
                this.page_element = defaultEditionDefinition.getPageElement()
                this.wordTag = this.project.getTokenizerWordElement()

                // resource directories that may be present next to the sources
                File sourceDirectory = module.getSourceDirectory()
                this.cssDirectory = new File(sourceDirectory, "css")
                this.jsDirectory = new File(sourceDirectory, "js")
                this.imagesDirectory = new File(sourceDirectory, "images")
        }
47

    
48
        /**
         * Runs the edition steps in order: 'default' edition, 'facs' edition, then the
         * extra XSL editions. Stops at the first step that returns false;
         * {@code isSuccessFul} is only set when all three steps succeeded.
         *
         * @param orderedTextIDs ids of the texts to process; when null, the ids of all
         *        the texts of the project are used
         */
        @Override
        public void process(List<String> orderedTextIDs) {
                super.process(orderedTextIDs);

                // The previous code called getTextsID() here and threw the result away.
                // The edition steps read the inherited 'orderedTextIDs' property, so fill
                // it in only if the parent class left it null.
                // NOTE(review): assumes the inherited property is writable from this class — confirm against Pager.
                if (this.orderedTextIDs == null) {
                        this.orderedTextIDs = module.getProject().getTextsID()
                }

                if (!doDefaultEditionStep()) return;
                if (!doFacsEditionStep()) return;

                // remove extra XSL editions -> they will be recreated by the doPostEditionXSLStep call
                // iterate over a snapshot: delete() may modify the collection returned by the project
                def editionDefinitions = project.getEditionDefinitions().collect()
                for (EditionDefinition eDef : editionDefinitions) {
                        String editionName = eDef.getName()
                        if (!"facs".equals(editionName) && !"default".equals(editionName)) {
                                eDef.delete();
                        }
                }
                if (!doPostEditionXSLStep()) return;

                isSuccessFul = true;
                println ""
        }
68

    
69
        /**
         * Builds the 'default' HTML edition of the texts that need it, then copies the
         * CSS, JS and image resources into the output directory.
         *
         * A text is (re)built when its first HTML page is missing, when that page is
         * not newer than the XML-TXM file, or when the text is dirty. A text whose
         * edition fails is reported on the console and skipped; it does not stop the step.
         *
         * @return always true (also when the 'default' edition is disabled)
         */
        public boolean doDefaultEditionStep() {

                boolean build_edition = project.getEditionDefinition("default").getBuildEdition()
                if (!build_edition) {
                        return true;
                }

                def texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
                def textsToProcess = texts.findAll() { text ->
                        File txmFile = text.getXMLTXMFile()
                        File firstHTMLPageFile = new File(outputDirectory, text.getName()+"_1.html");
                        if (!firstHTMLPageFile.exists() || txmFile.lastModified() >= firstHTMLPageFile.lastModified()) {
                                return true // no edition yet, or the edition is older than its source
                        }

                        if (!text.isDirty()) {
                                Log.finer("skipping 'default html' step of $text");
                                return false
                        }

                        return true
                }
                println "-- Building 'default' edition of ${textsToProcess.size()}/${texts.size()} texts..."

                def css = ["css/txm.css", "css/${corpusname}.css"] // default CSS inclusion

                // scan existing css files that must be declared in each HTML page
                if (cssDirectory.exists()) {
                        def cssFiles = cssDirectory.listFiles();
                        if (cssFiles != null) {
                                for (File cssFile : cssFiles) {
                                        if (cssFile.isFile() && !cssFile.isHidden() && cssFile.getName().endsWith(".css")) {
                                                css << "css/"+cssFile.getName();
                                        }
                                }
                        }
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(textsToProcess.size())
                for (Text text : textsToProcess) {

                        File txmFile = text.getXMLTXMFile()
                        try {
                                cpb.tick()

                                String textname = text.getName()

                                // replace the previous 'default' edition of the text, if any
                                Edition edition = text.getEdition("default")
                                if (edition != null) {
                                        edition.delete()
                                }

                                edition = new Edition(text);

                                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                                def ed = new XTZDefaultPagerStep(this, txmFile, textname, NoSpaceBefore, NoSpaceAfter, css);
                                if (!ed.process()) {
                                        println "Fail to build 'default' edition for text: $txmFile"
                                        continue;
                                }

                                edition.setName("default");
                                edition.setIndex(outputDirectory.getAbsolutePath());

                                // declare one page per generated file; pages are named "1", "2", ...
                                def pageFiles = ed.getPageFiles()
                                def firstWordIds = ed.getIdx()
                                for (int i = 0 ; i < pageFiles.size() ; i++) {
                                        // "w_0" is used when no first word id was recorded for the page
                                        String wordid = (i < firstWordIds.size()) ? firstWordIds.get(i) : "w_0";
                                        edition.addPage(""+(i + 1), wordid);
                                }
                        } catch(Exception e) {
                                println "Error: could not create $txmFile 'default' edition: "+e
                                e.printStackTrace()
                        }
                }

                // copy default TXM css file in the "default" edition directory
                File csshtmlDirectory = new File(outputDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images source files in the "default" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        File jshtmlDirectory = new File(outputDirectory, "js")
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(outputDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                return true;
        }
170

    
171
        /**
         * Builds the 'facs' (facsimile) edition of the texts that need it in the
         * "facs" sub-directory of the HTML directory, then copies the images, the
         * viewer files and the CSS/JS/image resources next to the generated pages.
         *
         * The images location comes from the "facs" edition definition. A value
         * starting with "http" is treated as remote and nothing is copied for it. A
         * local value is only used when it points to an existing directory.
         *
         * @return true when the step is disabled or completed; false when a text
         *         raised an exception ({@code reason} is then set)
         */
        public boolean doFacsEditionStep() {

                boolean mustBuildFacsEdition = project.getEditionDefinition("facs").getBuildEdition()
                if (!mustBuildFacsEdition) return true;

                String imageDirectoryPath = project.getEditionDefinition("facs").getImagesDirectory();
                File imageDirectory = null
                boolean remoteImages = false

                if (imageDirectoryPath != null) {
                        imageDirectoryPath = imageDirectoryPath.trim()
                        remoteImages = imageDirectoryPath.startsWith("http")
                        imageDirectory = new File(imageDirectoryPath)
                        // A local path must be non-empty AND an existing directory. The previous
                        // test chained every condition with '&&' and so only rejected the empty path.
                        if (!remoteImages && (imageDirectoryPath.length() == 0 || !imageDirectory.isDirectory())) {
                                imageDirectory = null;
                        }
                }

                def texts = orderedTextIDs.collect() { id -> module.getProject().getText(id) }
                println "-- Building 'facs' edition of ${texts.size()} texts..."
                File newEditionDirectory = new File(htmlDirectory, "facs");
                newEditionDirectory.mkdir();

                ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
                for (Text text : texts) {
                        cpb.tick()

                        File txmFile = text.getXMLTXMFile()
                        String txtname = text.getName()

                        // rebuild when the first page is missing or not newer than the XML-TXM file
                        File firstHTMLPageFile = new File(newEditionDirectory, txtname+"_1.html");
                        boolean mustBuild = !firstHTMLPageFile.exists() || txmFile.lastModified() >= firstHTMLPageFile.lastModified()

                        if (!text.isDirty() && !mustBuild) {
                                Log.finer("skipping 'facs html' step of $text");
                                continue
                        }

                        // replace the previous 'facs' edition of the text, if any
                        Edition edition = text.getEdition("facs")
                        if (edition != null) {
                                edition.delete()
                        }

                        edition = new Edition(text);
                        edition.setName("facs");
                        edition.setIndex(outputDirectory.getAbsolutePath());

                        try {
                                def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug);
                                if (!ed.process()) {
                                        println "Fail to build 'facs' edition for text: $txmFile"
                                        continue;
                                }

                                // each entry is read as [page file, first word id]
                                def pages = ed.getPageFiles()
                                for (int i = 0 ; i < pages.size() ; i++) {
                                        String wordid = pages[i][1]
                                        //TODO replace '""+(i + 1)' with something that fetch/findout the page 'name'
                                        // TODO or move the Edition and Page corpus declaration in the XTZDefaultPagerStep
                                        edition.addPage(""+(i + 1), wordid);
                                }
                        } catch (Exception e) {
                                println "Error while processing $txmFile text: "+e
                                e.printStackTrace();
                                reason = "Error while processing $txmFile text: $e"
                                return false;
                        }
                }

                // copy files only if local; the null check comes first because the path itself may be null
                if (imageDirectory != null && !remoteImages) {
                        File editionImagesDirectory = new File(newEditionDirectory, "res/images/"+corpusname+"/facs");
                        editionImagesDirectory.mkdirs();
                        FileCopy.copyFiles(imageDirectory, editionImagesDirectory);
                }

                // copy SimpleViewer files in the "facs" edition directory
                File jshtmlDirectory = new File(newEditionDirectory, "js")
                jshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/js", "viewer", jshtmlDirectory);
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/", "images", newEditionDirectory);

                // copy default TXM css file in the "facs" edition directory
                File csshtmlDirectory = new File(newEditionDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "facs" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                project.setDefaultEditionName("default,facs");

                println ""
                return true;
        }
283

    
284
        /**
         * Builds the extra editions described by the stylesheets of the
         * "xsl/4-edition" sub-directory of the source directory. Does nothing when
         * that directory does not exist.
         *
         * Stylesheets must be named '{Number}-{editionName}-{step}.xsl' and are applied
         * in increasing {Number} order. The first stylesheet of an edition is applied
         * to the files of the input directory and writes into a fresh
         * '{html directory}/{editionName}' directory. The following stylesheets of the
         * same edition are applied to the files already present in that directory.
         * Finally the pages of each created edition are declared with their first word id.
         *
         * @return false when a stylesheet could not be applied ({@code reason} is
         *         then set), true otherwise
         */
        public boolean doPostEditionXSLStep() {

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/4-edition")
                if (!xslDirectory.exists()) {
                        return true;
                }

                // prepare the XSL parameters shared by all the stylesheets
                def xslParams = project.getXsltParameters()
                String s = project.getEditionDefinition("default").getWordsPerPage();
                if (s != null && s.length() > 0) {
                        xslParams["number-words-per-page"] = Integer.parseInt(s);
                }
                xslParams["pagination-element"] = project.getEditionDefinition("default").getPageElement();
                xslParams["import-xml-path"] = project.getProjectDirectory()

                def xslFiles = xslDirectory.listFiles()
                if (xslFiles == null) { // not a directory, or it could not be read
                        xslFiles = new File[0]
                }
                xslFiles = xslFiles.sort() { f ->
                        try {
                                return Integer.parseInt(f.getName().substring(0, f.getName().indexOf("-")))
                        } catch(Exception e) {
                                return -1; // no numeric prefix: sorted first, then skipped by the name check below
                        }
                }

                def editionsCreated = [:] // edition name -> first stylesheet of the edition
                for (File xslFile : xslFiles) {
                        String fileName = xslFile.getName()
                        if (xslFile.isDirectory() || xslFile.isHidden() || !fileName.endsWith(".xsl")) continue;
                        // 1 to 3 digits without leading zero; the previous "[1-9]{1,3}" silently ignored 10, 20, 100...
                        if (!fileName.matches("[1-9][0-9]{0,2}-.+")) continue;

                        // remove the "1-", "12-", etc. prefix, whatever its number of digits
                        String xslName = fileName.substring(fileName.indexOf("-") + 1);
                        int idx2 = xslName.indexOf(".")
                        if (idx2 > 0) {
                                xslName = xslName.substring(0, idx2)
                        } else {
                                println "$xslFile is not a '.xsl' file"
                                continue;
                        }
                        int idx3 = xslName.indexOf("-")
                        if (idx3 < 0) {
                                println "$xslFile file does not follow the '{Number}-{editionName}-{step}.xsl' name pattern"
                                continue;
                        }
                        String pagerStep = xslName.substring(idx3 + 1);
                        String editionName = xslName.substring(0, idx3);
                        println "-- Building '$editionName' XSL edition with step '$pagerStep'..."

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        xslParams["output-directory"] = newEditionDirectory.toURI().toString()

                        if (editionsCreated[editionName] == null) { // first XSL, replace an edition
                                editionsCreated[editionName] = xslFile

                                newEditionDirectory.deleteDir(); // delete previous edition if any
                                newEditionDirectory.mkdir()

                                boolean deleteOutputFiles = "pager" == pagerStep;

                                if (ApplyXsl2.processImportSources(xslFile, inputDirectory, newEditionDirectory, xslParams, deleteOutputFiles)) {
                                        println ""
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }

                                // copy CSS/JS/Images source files in the new edition directory
                                if (cssDirectory.exists()) {
                                        File csshtmlDirectory = new File(newEditionDirectory, "css")
                                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                                }
                                if (jsDirectory.exists()) {
                                        File jshtmlDirectory = new File(newEditionDirectory, "js")
                                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                                }
                                if (imagesDirectory.exists()) {
                                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                                }
                        } else { // N+1 XSL working with HTML files
                                def htmlFiles = newEditionDirectory.listFiles()
                                htmlFiles.sort()

                                if (ApplyXsl2.processImportSources(xslFile, htmlFiles, xslParams)) {
                                        if ("pager".equals(pagerStep)) {
                                                // delete the one page HTML files only if the XSL step is "pager"
                                                for (File f : htmlFiles) f.delete();
                                        }
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }
                        }
                }

                // for each file of the input directory, retrieve the first word ID of its pages from the XSL output files
                println "-- Fetching page word IDs..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(editionsCreated.keySet().size())
                for (String editionName : editionsCreated.keySet()) {
                        cpb.tick()

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        File xslFile = editionsCreated[editionName]
                        for (File txmFile : inputDirectory.listFiles()) {
                                if (txmFile.isDirectory()) continue;
                                // the text name is the file name up to its first "."
                                String textName = txmFile.getName()
                                int idx4 = textName.indexOf(".")
                                if (idx4 > 0) textName = textName.substring(0, idx4);

                                getFirstWordIDs(textName, editionName, newEditionDirectory, xslFile, txmFile);
                        }

                        // the edition definition reuses the pagination settings of the "default" edition
                        def editionDeclaration = project.getEditionDefinition(editionName);
                        editionDeclaration.setBuildEdition(true)
                        editionDeclaration.setPageBreakTag(project.getEditionDefinition("default").getPageElement())
                        editionDeclaration.setWordsPerPage(project.getEditionDefinition("default").getWordsPerPage())
                }
                println ""
                return true;
        }
410

    
411
        /**
         * Declares the pages of an XSL-built edition of a text.
         *
         * The text is fetched from the project by name, or created when missing, and
         * its source and TXM files are set to {@code txmFile}. The edition is created,
         * or its pages reset when it already exists. Every regular file of
         * {@code newEditionDirectory} named '{textName}_{page}.html' then becomes a
         * page called {page}, with the first word id read from the file by getMetaContent.
         * Pages are added in increasing order of the number found after the last "_".
         *
         * @param textName name of the text; also the prefix of its page files
         * @param editionName name of the edition to create or replace
         * @param newEditionDirectory directory containing the HTML pages of the edition
         * @param xslFile not used by this method
         * @param txmFile file registered as source and TXM file of the text
         */
        private void getFirstWordIDs(String textName, String editionName, File newEditionDirectory, File xslFile, File txmFile) {
                Text t = project.getText(textName);
                if (t == null) {
                        t = new Text(project);
                }
                t.setName(textName);
                t.setSourceFile(txmFile)
                t.setTXMFile(txmFile)

                Edition edition = t.getEdition(editionName)
                if (edition == null) { // new edition
                        edition = new Edition(t);
                } else { // replacing existing edition
                        edition.resetPages()
                }
                edition.setName(editionName);
                edition.setIndex(outputDirectory.getAbsolutePath());

                String prefix = textName + "_"
                String suffix = ".html"

                // Keep only the pages of this text BEFORE sorting. The previous code parsed
                // a number out of every file of the directory and failed on any other file name.
                def pageFiles = []
                newEditionDirectory.eachFile() { file ->
                        String name = file.getName()
                        if (file.isFile() && name.startsWith(prefix) && name.endsWith(suffix)) pageFiles << file
                }

                // number found between the last "_" and the ".html" extension
                def pageNumber = { File file ->
                        String name = file.getName()
                        try {
                                return Integer.parseInt(name.substring(name.lastIndexOf("_") + 1, name.length() - suffix.length()))
                        } catch (NumberFormatException e) {
                                return Integer.MAX_VALUE // pages without a number go last
                        }
                }
                // '<=>' instead of a subtraction, which can overflow
                pageFiles.sort() { f1, f2 -> pageNumber(f1) <=> pageNumber(f2) }

                for (File pageFile : pageFiles) {
                        String firstWordID = getMetaContent(pageFile);
                        String fileName = pageFile.getName()
                        String pagename = fileName.substring(prefix.length(), fileName.length() - suffix.length())
                        edition.addPage(pagename, firstWordID)
                }
        }
451

    
452
        /**
         * Reads the first word id declared in the header of an HTML page, i.e. the
         * {@code content} attribute of the first
         * {@code <meta name="txm:first-word-id" content="..."/>} element.
         *
         * Parsing stops at the first {@code body} element. The DTD declared in the
         * doctype is ignored.
         *
         * @param f the HTML page; it must be readable by an XML stream parser
         * @return the value of the {@code content} attribute, or "" when the meta
         *         element is not found before the body or has no {@code content} attribute
         */
        public static String getMetaContent(File f) {
                String META = "meta"
                String BODY = "body"
                String NAME = "name"
                String FIRST_WORD_ID = "txm:first-word-id"
                String CONTENT = "content"

                def factory = XMLInputFactory.newInstance();
                factory.setProperty("javax.xml.stream.supportDTD", false); // ignore the DTD declared in doctype

                def inputData = f.toURI().toURL().openStream();
                def parser = null
                try {
                        parser = factory.createXMLStreamReader(inputData);

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue;

                                String localName = parser.getLocalName()
                                if (BODY.equals(localName)) {
                                        break; // the header is over: the meta element was not found
                                }
                                if (!META.equals(localName)) continue;

                                // Attribute values are local to the current meta element, so the
                                // content of an unrelated meta element can never be returned.
                                String metaName = ""
                                String metaContent = ""
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                        String attributeName = parser.getAttributeLocalName(i)
                                        if (NAME.equals(attributeName)) {
                                                metaName = parser.getAttributeValue(i)
                                        } else if (CONTENT.equals(attributeName)) {
                                                metaContent = parser.getAttributeValue(i)
                                        }
                                }
                                if (FIRST_WORD_ID.equals(metaName)) {
                                        return metaContent;
                                }
                        }
                        return "";
                } finally {
                        // always release the parser and the stream, also when parsing fails
                        try {
                                if (parser != null) parser.close();
                        } finally {
                                inputData.close();
                        }
                }
        }
494

    
495
        /**
         * Manual test entry point: prints the first word id found in an HTML page.
         *
         * @param args optional; when a first argument is given it is used as the path
         *        of the HTML page, otherwise the historical hard-coded test page is read
         */
        public static void main(def args) {
                File page = args ? new File(args[0].toString()) : new File("/home/mdecorde/TXM/corpora/QGRAALXTZ/HTML/QGRAALXTZ/default", "qgraal_cm_test201510_page_160_2.html")
                println "RESULT: "+getMetaContent(page)
        }
498
}