/*
 * Exported from a repository browser page (navigation labels: Statistics, Revision, History, View, Annotate, Download).
 * File: root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZPager.groovy @ revision 2246 (16.1 kB)
 */

1
package org.txm.scripts.importer.xtz
2

    
3
import java.io.File;
4
import java.util.ArrayList;
5

    
6
import org.txm.objects.BaseParameters
7
import org.w3c.dom.Element
8

    
9
import org.txm.scripts.importer.*
10
import org.txm.utils.ConsoleProgressBar
11
import org.txm.utils.BundleUtils;
12
import org.txm.utils.io.FileCopy;
13
import org.txm.utils.i18n.*
14
import org.txm.importer.xtz.*
15
import javax.xml.stream.*
16

    
17
import org.txm.objects.*
18
import org.txm.importer.ApplyXsl2
19
import org.txm.utils.logger.Log
20

    
21
class XTZPager extends Pager {
22

    
23
        Project project;
24

    
25
        Element corpusElem;
26
        String lang;
27
        String page_element;
28
        String wordTag;
29
        int wordsPerPage;
30

    
31
        File cssDirectory, jsDirectory, imagesDirectory;
32

    
33
        /**
         * Creates the pager in charge of the "default" edition of the given import module.
         *
         * Stores the module project, its language, the pagination settings of the
         * "default" edition definition, the tokenizer word element and the "css",
         * "js" and "images" directories of the module source directory.
         *
         * @param module the import module this pager works for
         */
        public XTZPager(ImportModule module) {
                super(module, "default");

                this.project = module.getProject()
                this.lang = this.project.getLang();

                // pagination settings come from the "default" edition definition
                this.wordsPerPage = this.project.getEditionDefinition("default").getWordsPerPage()
                this.page_element = this.project.getEditionDefinition("default").getPageElement()
                this.wordTag = this.project.getTokenizerWordElement()

                // resource directories that may be provided next to the sources
                this.cssDirectory = new File(module.getSourceDirectory(), "css")
                this.jsDirectory = new File(module.getSourceDirectory(), "js")
                this.imagesDirectory = new File(module.getSourceDirectory(), "images")
        }
47

    
48
        /**
         * Runs the edition steps in order: the "default" edition, the "facs"
         * edition, then the XSL editions. Stops at the first step that returns
         * false; the success flag is only set when every step succeeded.
         *
         * @param orderedTextIDs the text IDs, forwarded to the parent implementation
         */
        @Override
        public void process(List<String> orderedTextIDs) {
                super.process(orderedTextIDs);

                // NOTE(review): the returned IDs are not used; the call is kept in case it has side effects — confirm and remove
                if (orderedTextIDs == null) { module.getProject().getTextsID() }

                if (!doDefaultEditionStep()) return;
                if (!doFacsEditionStep()) return;

                // remove extra XSL editions -> they will be recreated by the doPostEditionXSLStep call.
                // The definitions to delete are collected first so that delete() never
                // modifies the collection while it is being iterated.
                def extraEditionDefinitions = project.getEditionDefinitions()?.findAll { eDef ->
                        eDef.getName() != "facs" && eDef.getName() != "default"
                } ?: []
                for (EditionDefinition eDef : extraEditionDefinitions) {
                        eDef.delete();
                }
                if (!doPostEditionXSLStep()) return;

                isSuccessFul = true;
                println ""
        }
68

    
69
        /**
         * Builds the "default" HTML edition of the project texts, then copies the
         * TXM default CSS and the source "css", "js" and "images" directories into
         * the edition output directory.
         *
         * A text is skipped when it is not dirty and its first HTML page is newer
         * than its XML-TXM file. A text whose edition fails to build is reported
         * and skipped; it does not stop the step.
         *
         * @return true in every case (also when the "default" edition must not be built)
         */
        public boolean doDefaultEditionStep() {

                boolean build_edition = project.getEditionDefinition("default").getBuildEdition()
                if (!build_edition) {
                        return true;
                }

                def texts = module.getProject().getTexts()
                println "-- Building 'default' edition of  ${texts.size()} texts..."

                def css = ["css/txm.css", "css/${corpusname}.css"] // default CSS inclusion

                // scan existing css files that must be declared in each HTML page
                if (cssDirectory.exists()) {
                        def cssFiles = cssDirectory.listFiles();
                        if (cssFiles != null) {
                                for (File cssFile : cssFiles) {
                                        if (cssFile.isFile() && !cssFile.isHidden() && cssFile.getName().endsWith(".css")) {
                                                css << "css/"+cssFile.getName();
                                        }
                                }
                        }
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
                for (Text text : texts) {

                        File txmFile = text.getXMLTXMFile()
                        try {
                                cpb.tick()

                                String textname = text.getName()

                                // rebuild when the first page is missing or older than the XML-TXM file
                                File firstHTMLPageFile = new File(outputDirectory, textname+"_1.html");
                                boolean mustBuild = !firstHTMLPageFile.exists() || txmFile.lastModified() >= firstHTMLPageFile.lastModified()

                                if (!text.isDirty() && !mustBuild) {
                                        Log.finer("skipping 'default html' step of $text");
                                        continue
                                }

                                // replace the previous "default" edition, if any
                                Edition edition = text.getEdition("default")
                                if (edition != null) {
                                        edition.delete()
                                }

                                edition = new Edition(text);

                                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                                def ed = new XTZDefaultPagerStep(this, txmFile, textname, NoSpaceBefore, NoSpaceAfter, css);
                                if (!ed.process()) {
                                        println "Fail to build 'default' edition for text: $txmFile"
                                        continue;
                                }

                                edition.setName("default");
                                edition.setIndex(outputDirectory.getAbsolutePath());

                                // declare one page per HTML file, named after its 1-based position;
                                // "w_0" is used when no first word ID is available for the page
                                def pageFiles = ed.getPageFiles()
                                def firstWordIds = ed.getIdx()
                                for (int i = 0 ; i < pageFiles.size() ; i++) {
                                        String wordid = (i < firstWordIds.size()) ? firstWordIds.get(i) : "w_0"
                                        edition.addPage(""+(i + 1), wordid);
                                }
                        } catch(Exception e) {
                                println "Error: could not create $txmFile 'default' edition: "+e
                                e.printStackTrace()
                        }
                }

                // copy default TXM css file in the "default" edition directory
                File csshtmlDirectory = new File(outputDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "default" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        File jshtmlDirectory = new File(outputDirectory, "js")
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(outputDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                println ""
                return true;
        }
167

    
168
        /**
         * Builds the "facs" (facsimile) edition of the project texts in the "facs"
         * sub-directory of the HTML directory, then copies the images (when they
         * are local), the viewer files, the TXM default CSS and the source "css",
         * "js" and "images" directories into it.
         *
         * The images location is read from the "facs" edition definition. It may be
         * unset, an "http..." location, or a local directory; a local path that is
         * empty or is not an existing directory is ignored.
         *
         * @return true when the edition was built or must not be built,
         *         false when an exception occurred while processing a text
         */
        public boolean doFacsEditionStep() {

                boolean mustBuildFacsEdition = project.getEditionDefinition("facs").getBuildEdition()
                if (!mustBuildFacsEdition) return true;

                String imageDirectoryPath = project.getEditionDefinition("facs").getImagesDirectory();
                File imageDirectory = null
                boolean remoteImages = false

                if (imageDirectoryPath != null) {
                        imageDirectoryPath = imageDirectoryPath.trim()
                        remoteImages = imageDirectoryPath.startsWith("http")
                        if (imageDirectoryPath.length() > 0) {
                                imageDirectory = new File(imageDirectoryPath)
                                // a local images location is only usable if it is an existing directory
                                if (!remoteImages && !imageDirectory.isDirectory()) {
                                        println "Warning: 'facs' images directory not found: $imageDirectoryPath"
                                        imageDirectory = null;
                                }
                        }
                }

                def texts = module.getProject().getTexts()
                println "-- Building 'facs' edition of ${texts.size()} texts..."
                File newEditionDirectory = new File(htmlDirectory, "facs");
                newEditionDirectory.mkdir();

                ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size())
                for (Text text : texts) {
                        cpb.tick()

                        File txmFile = text.getXMLTXMFile()
                        String txtname = text.getName()

                        // rebuild when the first page is missing or older than the XML-TXM file
                        File firstHTMLPageFile = new File(newEditionDirectory, txtname+"_1.html");
                        boolean mustBuild = !firstHTMLPageFile.exists() || txmFile.lastModified() >= firstHTMLPageFile.lastModified()

                        if (!text.isDirty() && !mustBuild) {
                                Log.finer("skipping 'facs html' step of $text");
                                continue
                        }

                        // replace the previous "facs" edition, if any
                        Edition edition = text.getEdition("facs")
                        if (edition != null) {
                                edition.delete()
                        }

                        edition = new Edition(text);
                        edition.setName("facs");
                        edition.setIndex(outputDirectory.getAbsolutePath());

                        try {
                                def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug);
                                if (!ed.process()) {
                                        println "Fail to build 'facs' edition for text: $txmFile"
                                        continue;
                                }

                                // each entry of pages is read as [page file, first word ID]
                                def pages = ed.getPageFiles()
                                for (int i = 0 ; i < pages.size() ; i++) {
                                        String wordid = pages[i][1]
                                        //TODO replace '""+(i + 1)' with something that fetch/findout the page 'name'
                                        // TODO or move the Edition and Page corpus declaration in the XTZDefaultPagerStep
                                        edition.addPage(""+(i + 1), wordid);
                                }
                        } catch (Exception e) {
                                println "Error while processing $txmFile text: "+e
                                e.printStackTrace();
                                return false;
                        }
                }

                // copy files only if local; imageDirectory is null when no usable path was configured
                if (imageDirectory != null && !remoteImages) {
                        File editionImagesDirectory = new File(newEditionDirectory, "res/images/"+corpusname+"/facs");
                        editionImagesDirectory.mkdirs();
                        FileCopy.copyFiles(imageDirectory, editionImagesDirectory);
                }

                // copy SimpleViewer files in the "facs" edition directory
                File jshtmlDirectory = new File(newEditionDirectory, "js")
                jshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/js", "viewer", jshtmlDirectory);
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/", "images", newEditionDirectory);

                // copy default TXM css file in the "facs" edition directory
                File csshtmlDirectory = new File(newEditionDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "facs" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                project.setDefaultEditionName("default,facs");

                println ""
                return true;
        }
280

    
281
        /**
         * Applies the edition stylesheets found in the "xsl/4-edition" directory of
         * the sources, if that directory exists.
         *
         * Stylesheets must be named '{Number}-{editionName}-{step}.xsl' and are
         * applied in increasing {Number} order. The first stylesheet of an edition
         * reads the files of the input directory and replaces the edition
         * directory; the following ones are applied to the files already present
         * in the edition directory. When {step} is "pager" the files the stylesheet
         * was applied to are deleted afterwards. Finally the pages of each created
         * edition are declared with their first word ID and the edition definition
         * is updated.
         *
         * @return false when a stylesheet could not be applied (the reason is
         *         stored in 'reason'), true otherwise
         */
        public boolean doPostEditionXSLStep() {

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/4-edition")
                if (xslDirectory.exists()) {

                        // prepare XSL parameters
                        def xslParams = project.getXsltParameters()

                        // shared XSL parameters
                        String s = project.getEditionDefinition("default").getWordsPerPage();
                        if (s != null && s.length() > 0) {
                                xslParams["number-words-per-page"] = Integer.parseInt(s);
                        }
                        xslParams["pagination-element"] = project.getEditionDefinition("default").getPageElement();
                        xslParams["import-xml-path"] = project.getProjectDirectory()
                        //println "XSL PARAMS: "+xslParams

                        // listFiles() returns null when the path is not a readable directory
                        def xslFiles = xslDirectory.listFiles() ?: new File[0]
                        xslFiles = xslFiles.sort() { f ->
                                try {
                                        return Integer.parseInt(f.getName().substring(0, f.getName().indexOf("-")))
                                } catch(Exception ignored) {
                                        // no numeric prefix: ordered first, the file is skipped below anyway
                                }
                                return -1;
                        }
                        def editionsCreated = [:]
                        for (File xslFile : xslFiles) {
                                String xslFileName = xslFile.getName()
                                if (xslFile.isDirectory() || xslFile.isHidden() || !xslFileName.endsWith(".xsl")) continue;
                                // 1 to 3 digits without leading zero, so that 10, 20, 100... are accepted too
                                if (!xslFileName.matches("[1-9][0-9]{0,2}-.+")) continue;

                                // remove the whole "{Number}-" prefix, whatever its number of digits
                                String xslName = xslFileName.substring(xslFileName.indexOf("-") + 1);
                                int idx2 = xslName.indexOf(".")
                                if (idx2 <= 0) {
                                        println "$xslFile is not a '.xsl' file"
                                        continue;
                                }
                                xslName = xslName.substring(0, idx2)

                                // an empty edition name would resolve to the HTML directory itself,
                                // which is deleted below: reject it
                                int idx3 = xslName.indexOf("-")
                                if (idx3 <= 0) {
                                        println "$xslFile file does not follow the '{Number}-{editionName}-{step}.xsl' name pattern"
                                        continue;
                                }
                                String pagerStep = xslName.substring(idx3 + 1);
                                String editionName = xslName.substring(0, idx3);

                                println "-- Building '$editionName' XSL edition with step '$pagerStep'..."

                                File newEditionDirectory = new File(htmlDirectory, editionName);
                                xslParams["output-directory"] = newEditionDirectory.toURI().toString()

                                if (editionsCreated[editionName] == null) { // first XSL, replace an edition
                                        editionsCreated[editionName] = xslFile

                                        newEditionDirectory.deleteDir(); // delete previous edition if any
                                        newEditionDirectory.mkdir()

                                        boolean deleteOutputFiles = "pager" == pagerStep;

                                        if (ApplyXsl2.processImportSources(xslFile, inputDirectory, newEditionDirectory, xslParams, deleteOutputFiles)) {
                                                println ""
                                        } else {
                                                reason = "Fail to apply edition XSL: $xslFile"
                                                return false;
                                        }

                                        // copy CSS/JS/Images sources files in the new edition directory
                                        if (cssDirectory.exists()) {
                                                File csshtmlDirectory = new File(newEditionDirectory, "css")
                                                FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                                        }
                                        if (jsDirectory.exists()) {
                                                File jshtmlDirectory = new File(newEditionDirectory, "js")
                                                FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                                        }
                                        if (imagesDirectory.exists()) {
                                                File imageshtmlDirectory = new File(newEditionDirectory, "images")
                                                FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                                        }
                                } else { // N+1 XSL working with HTML files
                                        def htmlFiles = newEditionDirectory.listFiles() ?: new File[0]
                                        htmlFiles.sort()

                                        if (ApplyXsl2.processImportSources(xslFile, htmlFiles, xslParams)) {
                                                if ("pager".equals(pagerStep)) {
                                                        // delete the one page HTML files only if the XSL step is "pager"
                                                        for (File f : htmlFiles) f.delete();
                                                }
                                        } else {
                                                reason = "Fail to apply edition XSL: $xslFile"
                                                return false;
                                        }
                                }
                        }

                        // UPDATE import.xml: for each XML-TXM file, we must retrieve the first word ID from the XSL output files
                        println "-- Fetching page word IDs..."
                        ConsoleProgressBar cpb = new ConsoleProgressBar(editionsCreated.keySet().size())
                        for (String editionName : editionsCreated.keySet()) {
                                cpb.tick()

                                File newEditionDirectory = new File(htmlDirectory, editionName);
                                File xslFile = editionsCreated[editionName]
                                for (File txmFile : inputDirectory.listFiles()) {
                                        if (txmFile.isDirectory()) continue;

                                        // the text name is the file name up to its first "."
                                        String textName = txmFile.getName()
                                        int idx4 = textName.indexOf(".")
                                        if (idx4 > 0) textName = textName.substring(0, idx4);

                                        getFirstWordIDs(textName, editionName, newEditionDirectory, xslFile, txmFile);
                                }

                                // the edition shares the pagination settings of the "default" edition
                                def editionDeclaration = project.getEditionDefinition(editionName);
                                editionDeclaration.setBuildEdition(true)
                                editionDeclaration.setPageBreakTag(project.getEditionDefinition("default").getPageElement())
                                editionDeclaration.setWordsPerPage(project.getEditionDefinition("default").getWordsPerPage())
                        }
                        println ""
                }
                return true;
        }
407

    
408
        /**
         * Declares the pages of one edition of one text from the HTML files of the
         * edition directory.
         *
         * The text is fetched from the project, or created when it does not exist,
         * and its name, source file and TXM file are set. The edition is created,
         * or its pages are reset when it already exists. Every file named
         * '{textName}_{page}.html...' becomes a page called {page}, whose first
         * word ID is read with getMetaContent. Pages are added by increasing number,
         * the number being the digits between the last "_" and the last "." of the
         * file name; files without such a number come last.
         *
         * @param textName the text name, also the prefix of its page files
         * @param editionName the name of the edition to declare
         * @param newEditionDirectory the directory containing the edition HTML files
         * @param xslFile not used by this method
         * @param txmFile the XML-TXM file of the text
         */
        private void getFirstWordIDs(String textName, String editionName, File newEditionDirectory, File xslFile, File txmFile) {
                Text t = project.getText(textName);
                if (t == null) {
                        t = new Text(project);
                }
                t.setName(textName);
                t.setSourceFile(txmFile)
                t.setTXMFile(txmFile)

                Edition edition = t.getEdition(editionName)
                if (edition == null) { // new edition
                        edition = new Edition(t);
                } else { // replacing existing edition
                        edition.resetPages()
                }
                edition.setName(editionName);
                edition.setIndex(outputDirectory.getAbsolutePath());

                // keep only the page files of this text: the directory also contains
                // the pages of the other texts and possibly unrelated files
                String pagePrefix = textName + "_"
                def pageFiles = []
                newEditionDirectory.eachFile() { File file ->
                        String name = file.getName()
                        if (file.isFile() && name.startsWith(pagePrefix) && name.indexOf(".html", pagePrefix.length()) > pagePrefix.length()) {
                                pageFiles << file
                        }
                }

                // page number of a file, or Integer.MAX_VALUE when the name holds no number
                def pageNumber = { File file ->
                        String name = file.getName()
                        try {
                                return Integer.parseInt(name.substring(name.lastIndexOf("_") + 1, name.lastIndexOf(".")))
                        } catch (RuntimeException ignored) { // not a number or unexpected name shape
                                return Integer.MAX_VALUE
                        }
                }
                pageFiles.sort() { f1, f2 -> pageNumber(f1) <=> pageNumber(f2) }

                for (File f : pageFiles) {
                        String pagename = f.getName();
                        String firstWordID = getMetaContent(f);
                        pagename = pagename.substring(pagePrefix.length(), pagename.indexOf(".html", pagePrefix.length()))
                        edition.addPage(pagename, firstWordID)
                }
        }
448

    
449
        /**
         * Reads the first word ID declared in an HTML page, i.e. the value of the
         * 'content' attribute of the first 'meta' element whose 'name' attribute is
         * "txm:first-word-id".
         *
         * The file is parsed as XML with DTD support and external entities
         * disabled. Parsing stops at the matching 'meta' element, at the 'body'
         * element or at the end of the document, whichever comes first.
         *
         * @param f the HTML page file
         * @return the first word ID, or "" when the meta element is not found
         *         before the body or has no 'content' attribute
         */
        public static String getMetaContent(File f) {
                String META = "meta"
                String BODY = "body"
                String NAME = "name"
                String DESCRIPTION = "txm:first-word-id"
                String CONTENT = "content"

                def inputData = f.toURI().toURL().openStream();
                try {
                        def factory = XMLInputFactory.newInstance();
                        factory.setProperty("javax.xml.stream.supportDTD", false); // ignore the DTD declared in doctype
                        factory.setProperty("javax.xml.stream.isSupportingExternalEntities", false); // never resolve external entities

                        def parser = factory.createXMLStreamReader(inputData);
                        try {
                                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                        if (event != XMLStreamConstants.START_ELEMENT) continue;

                                        String localName = parser.getLocalName()
                                        if (BODY.equals(localName)) { // no need to go further, the meta element was not found
                                                break;
                                        }
                                        if (!META.equals(localName)) continue;

                                        // the values are local to this meta element so that the content
                                        // of a previous meta element is never returned by mistake
                                        String desc = "";
                                        String content = "";
                                        for (int i = 0 ; i < parser.getAttributeCount(); i++) { // scan attributes
                                                if (NAME.equals(parser.getAttributeLocalName(i))) { // found @name
                                                        desc = parser.getAttributeValue(i)
                                                } else if (CONTENT.equals(parser.getAttributeLocalName(i))) { // found @content
                                                        content = parser.getAttributeValue(i)
                                                }
                                        }
                                        if (DESCRIPTION.equals(desc)) { // stop now
                                                return content;
                                        }
                                }
                                return "";
                        } finally {
                                parser.close();
                        }
                } finally {
                        // closing the parser does not close the underlying stream
                        inputData.close();
                }
        }
491

    
492
        /**
         * Manual test entry point: prints the first word ID declared in an HTML page.
         *
         * @param args optional; args[0] is the path of the HTML page to read. Without
         *        argument the original hard-coded developer test page is used.
         */
        public static void main(def args) {
                File page = args ? new File(args[0].toString()) : new File("/home/mdecorde/TXM/corpora/QGRAALXTZ/HTML/QGRAALXTZ/default", "qgraal_cm_test201510_page_160_2.html")
                println "RESULT: "+getMetaContent(page)
        }
495
}