Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZPager.groovy @ 2113

History | View | Annotate | Download (16.1 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import java.io.File;
4
import java.util.ArrayList;
5

    
6
import org.txm.objects.BaseParameters
7
import org.w3c.dom.Element
8

    
9
import org.txm.scripts.importer.*
10
import org.txm.utils.ConsoleProgressBar
11
import org.txm.utils.BundleUtils;
12
import org.txm.utils.io.FileCopy;
13
import org.txm.utils.i18n.*
14
import org.txm.importer.xtz.*
15
import javax.xml.stream.*
16
import org.txm.objects.*
17
import org.txm.importer.ApplyXsl2
18

    
19
class XTZPager extends Pager {
20

    
21
        Project project;
22

    
23
        Element corpusElem;
24
        String lang;
25
        String page_element;
26
        String wordTag;
27
        int wordsPerPage;
28

    
29
        File cssDirectory, jsDirectory, imagesDirectory;
30

    
31
        /**
         * Creates the pager of the "default" edition for the given import module.
         *
         * Reads from the module's project the language, the number of words per page,
         * the pagination element and the word element name, and locates the "css",
         * "js" and "images" folders of the module's source directory.
         *
         * @param module the import module this pager works for
         */
        public XTZPager(ImportModule module) {
                super(module, "default");

                project = module.getProject()
                lang = project.getLang();

                // pagination settings come from the "default" edition definition
                wordsPerPage = project.getEditionDefinition("default").getWordsPerPage()
                page_element = project.getEditionDefinition("default").getPageElement()
                wordTag = project.getTokenizerWordElement()

                // optional resource folders stored next to the sources
                File sourceDirectory = module.getSourceDirectory()
                cssDirectory = new File(sourceDirectory, "css")
                jsDirectory = new File(sourceDirectory, "js")
                imagesDirectory = new File(sourceDirectory, "images")
        }
45

    
46
        /**
         * Runs the edition steps in order: 'default' edition, 'facs' edition, then the
         * XSL editions. Stops silently at the first step that returns false and only
         * sets isSuccessFul when every step succeeded.
         *
         * @param files the files to paginate; when null the content of inputDirectory is listed
         */
        public void process(ArrayList<File> files) {
                super.process(files);

                if (files == null) {
                        files = inputDirectory.listFiles();
                        if (files != null) {
                                Collections.sort(files);
                        }
                }

                // the 'facs' step is only attempted when the 'default' step succeeded
                if (!(doDefaultEditionStep() && doFacsEditionStep())) {
                        return;
                }

                // remove extra XSL editions -> they will be recreated by the doPostEditionXSLStep call
                for (EditionDefinition definition : project.getEditionDefinitions()) {
                        String definitionName = definition.getName()
                        boolean builtIn = definitionName == "facs" || definitionName == "default"
                        if (!builtIn) {
                                definition.delete();
                        }
                }

                if (!doPostEditionXSLStep()) {
                        return;
                }

                isSuccessFul = true;
                println ""
        }
68

    
69
        /**
         * Builds the 'default' HTML edition of each file of 'files' and declares its
         * pages in the project, then copies the CSS, JS and image resources next to
         * the pages.
         *
         * A text whose first page is at least as recent as its source file is skipped.
         * A text that cannot be paginated is reported on the console and skipped.
         *
         * @return always true (also when the 'default' edition is not to be built)
         */
        public boolean doDefaultEditionStep() {

                if (!project.getEditionDefinition("default").getBuildEdition()) {
                        return true;
                }

                println "-- Building 'default' edition of  ${files.size()} texts..."

                def css = ["css/txm.css", "css/${corpusname}.css"] // default CSS inclusion

                // scan existing css files that must be declared in each HTML page
                if (cssDirectory.exists()) {
                        File[] candidates = cssDirectory.listFiles();
                        if (candidates != null) {
                                for (File candidate : candidates) {
                                        boolean isStyleSheet = candidate.isFile() && !candidate.isHidden() && candidate.getName().endsWith(".css")
                                        if (isStyleSheet) {
                                                css << "css/"+candidate.getName();
                                        }
                                }
                        }
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        try {
                                cpb.tick()

                                // the text name is the file name without its last extension
                                String textname = txmFile.getName();
                                int dot = textname.lastIndexOf(".");
                                if (dot > 0) {
                                        textname = textname.substring(0, dot);
                                }

                                // nothing to do when the first page is not older than the source
                                File firstHTMLPageFile = new File(outputDirectory, textname+"_1.html");
                                boolean upToDate = firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()
                                if (upToDate) {
                                        continue;
                                }

                                List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                                List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                                // fetch or create the text declaration
                                Text t = project.getText(textname);
                                if (t == null) {
                                        t = new Text(project);
                                }
                                t.setName(textname);
                                t.setSourceFile(txmFile)
                                t.setTXMFile(txmFile)

                                def ed = new XTZDefaultPagerStep(this, txmFile, textname, NoSpaceBefore, NoSpaceAfter, css);
                                if (!ed.process()) {
                                        println "Fail to build 'default' edition for text: $txmFile"
                                        continue;
                                }

                                // fetch or create the edition declaration
                                Edition edition = t.getEdition("default")
                                if (edition == null) {
                                        edition = new Edition(t);
                                }
                                edition.setName("default");
                                edition.setIndex(outputDirectory.getAbsolutePath());

                                // pages are named "1", "2", ...; "w_0" is used when no word id is known for a page
                                for (int page = 0 ; page < ed.getPageFiles().size() ; page++) {
                                        String wordid = "w_0";
                                        if (page < ed.getIdx().size()) {
                                                wordid = ed.getIdx().get(page);
                                        }
                                        edition.addPage(""+(page + 1), wordid);
                                }
                        } catch(Exception e) {
                                println "Error: could not create $txmFile 'default' edition: "+e
                                e.printStackTrace()
                        }
                }

                // copy default TXM css file in the "default" edition directory
                File csshtmlDirectory = new File(outputDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy the CSS/JS/images source files in the "default" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, new File(outputDirectory, "js"))
                }
                if (imagesDirectory.exists()) {
                        FileCopy.copyFiles(imagesDirectory, new File(outputDirectory, "images"))
                }

                println ""
                return true;
        }
160

    
161
        /**
         * Builds the 'facs' (facsimile) HTML edition of each file of 'files' in the
         * "facs" sub-directory of htmlDirectory, declares its pages in the project and
         * copies the viewer, CSS, JS and image resources next to the pages.
         *
         * The images directory configured in the 'facs' edition definition is only
         * used as a local directory when it is a non-blank path that does not start
         * with "http" and designates an existing directory; otherwise no local image
         * directory is handed to the pager step and no image is copied.
         *
         * @return true when the edition is built (or is not to be built), false as soon
         *         as an exception is raised while paginating a text
         */
        public boolean doFacsEditionStep() {

                boolean mustBuildFacsEdition = project.getEditionDefinition("facs").getBuildEdition()
                if (!mustBuildFacsEdition) return true;

                // resolve the local images directory; it stays null for remote (http...) or invalid paths
                String imageDirectoryPath = project.getEditionDefinition("facs").getImagesDirectory();
                File imageDirectory = null
                if (imageDirectoryPath != null) {
                        imageDirectoryPath = imageDirectoryPath.trim()
                        if (!imageDirectoryPath.startsWith("http") && imageDirectoryPath.length() > 0) {
                                File localDirectory = new File(imageDirectoryPath)
                                if (localDirectory.isDirectory()) { // implies exists()
                                        imageDirectory = localDirectory
                                }
                        }
                }

                println "-- Building 'facs' edition of ${files.size()} texts..."
                File newEditionDirectory = new File(htmlDirectory, "facs");
                newEditionDirectory.mkdir();

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        cpb.tick()

                        // the text name is the file name without its last extension
                        String txtname = txmFile.getName();
                        int dot = txtname.lastIndexOf(".");
                        if (dot > 0) txtname = txtname.substring(0, dot);

                        // nothing to do when the first page is not older than the source
                        File firstHTMLPageFile = new File(newEditionDirectory, txtname+"_1.html");
                        if (firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()) continue;

                        // fetch or create the text declaration
                        Text t = project.getText(txtname);
                        if (t == null) {
                                t = new Text(project);
                        }
                        t.setName(txtname);
                        t.setSourceFile(txmFile)
                        t.setTXMFile(txmFile)

                        // fetch or create the edition declaration
                        Edition edition = t.getEdition("facs")
                        if (edition == null) {
                                edition = new Edition(t);
                        }
                        edition.setName("facs");
                        edition.setIndex(outputDirectory.getAbsolutePath());

                        try {
                                def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug);
                                if (!ed.process()) {
                                        println "Fail to build 'facs' edition for text: $txmFile"
                                        continue;
                                }

                                // each entry is read as [page file, first word id]; pages are named "1", "2", ...
                                //TODO replace the page number with something that fetch/findout the page 'name'
                                // TODO or move the Edition and Page corpus declaration in the XTZDefaultPagerStep
                                def pages = ed.getPageFiles()
                                for (int page = 0 ; page < pages.size() ; page++) {
                                        String wordid = pages[page][1]
                                        edition.addPage(""+(page + 1), wordid);
                                }
                        } catch (Exception e) {
                                println "Error while processing $txmFile text: "+e
                                e.printStackTrace();
                                return false;
                        }
                }

                // copy the images only if they are local (imageDirectory is null for remote or missing images)
                if (imageDirectory != null) {
                        File editionImagesDirectory = new File(newEditionDirectory, "res/images/"+corpusname+"/facs");
                        editionImagesDirectory.mkdirs();
                        FileCopy.copyFiles(imageDirectory, editionImagesDirectory);
                }

                // copy SimpleViewer files in the "facs" edition directory
                File jshtmlDirectory = new File(newEditionDirectory, "js")
                jshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/js", "viewer", jshtmlDirectory);
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/", "images", newEditionDirectory);

                // copy default TXM css file in the "facs" edition directory
                File csshtmlDirectory = new File(newEditionDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "facs" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                project.setDefaultEditionName("default,facs");

                println ""
                return true;
        }
268

    
269
        /**
         * Applies the edition stylesheets of the "xsl/4-edition" folder of the source
         * directory and declares the editions they produce.
         *
         * Stylesheets must be named '{Number}-{editionName}-{step}.xsl' where Number is
         * 1 to 999 without leading zero; they are applied in ascending Number order.
         * The first stylesheet of an edition reads the files of inputDirectory and
         * writes in a fresh "{htmlDirectory}/{editionName}" directory; the next ones
         * are applied to the files already present in that directory. When the step is
         * "pager" the intermediate files are deleted.
         *
         * @return true when there is nothing to do or when all stylesheets were applied;
         *         false (with 'reason' set) when a stylesheet could not be applied
         */
        public boolean doPostEditionXSLStep() {

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/4-edition")
                if (!xslDirectory.exists()) {
                        return true; // no edition stylesheet declared
                }

                // prepare the XSL parameters shared by all the stylesheets
                def xslParams = project.getXsltParameters()
                String s = project.getEditionDefinition("default").getWordsPerPage();
                if (s != null && s.length() > 0) {
                        xslParams["number-words-per-page"] = Integer.parseInt(s);
                }
                xslParams["pagination-element"] = project.getEditionDefinition("default").getPageElement();
                xslParams["import-xml-path"] = project.getProjectDirectory()

                // listFiles() returns null when the path is not a readable directory
                def xslFiles = xslDirectory.listFiles()
                if (xslFiles == null) {
                        xslFiles = new File[0]
                }
                // order by numeric prefix; names without a numeric prefix come first (-1) and are skipped below
                xslFiles = xslFiles.sort() { f ->
                        try {
                                return Integer.parseInt(f.getName().substring(0, f.getName().indexOf("-")))
                        } catch(Exception ignored) {
                                return -1;
                        }
                }

                def editionsCreated = [:] // edition name -> first stylesheet applied for this edition
                for (File xslFile : xslFiles) {
                        String fileName = xslFile.getName()
                        if (xslFile.isDirectory() || xslFile.isHidden() || !fileName.endsWith(".xsl")) continue;
                        if (!fileName.matches("[1-9][0-9]{0,2}-.+")) continue;

                        // remove the "{Number}-" prefix, whatever its number of digits
                        String xslName = fileName.substring(fileName.indexOf("-") + 1);
                        int idx2 = xslName.indexOf(".")
                        if (idx2 > 0) xslName = xslName.substring(0, idx2)
                        else {
                                println "$xslFile is not a '.xsl' file"
                                continue;
                        }
                        int idx3 = xslName.indexOf("-")
                        if (idx3 < 0) {
                                println "$xslFile file does not follow the '{Number}-{editionName}-{step}.xsl' name pattern"
                                continue;
                        }
                        String pagerStep = xslName.substring(idx3 + 1);
                        String editionName = xslName.substring(0, idx3);
                        println "-- Building '$editionName' XSL edition with step '$pagerStep'..."

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        xslParams["output-directory"] = newEditionDirectory.toURI().toString()

                        if (editionsCreated[editionName] == null) { // first XSL, replace an edition
                                editionsCreated[editionName] = xslFile
                                //TODO: optimisation if update is enable,
                                newEditionDirectory.deleteDir(); // delete previous edition if any
                                newEditionDirectory.mkdir()

                                boolean deleteOutputFiles = "pager" == pagerStep;
                                if (ApplyXsl2.processImportSources(xslFile, inputDirectory, newEditionDirectory, xslParams, deleteOutputFiles)) {
                                        println ""
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }

                                // copy the CSS/JS/images source files in the new edition directory
                                if (cssDirectory.exists()) {
                                        File csshtmlDirectory = new File(newEditionDirectory, "css")
                                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                                }
                                if (jsDirectory.exists()) {
                                        File jshtmlDirectory = new File(newEditionDirectory, "js")
                                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                                }
                                if (imagesDirectory.exists()) {
                                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                                }
                        } else { // N+1 XSL working with HTML files
                                // NOTE(review): this listing also contains the css/js/images sub-directories
                                // copied above; confirm that ApplyXsl2 and the delete loop cope with them
                                def htmlFiles = newEditionDirectory.listFiles()
                                if (htmlFiles == null) {
                                        htmlFiles = new File[0]
                                }
                                htmlFiles.sort()

                                if (ApplyXsl2.processImportSources(xslFile, htmlFiles, xslParams)) {
                                        if ("pager".equals(pagerStep)) {
                                                // delete the one page HTML files only if the XSL step is "pager"
                                                for (File f : htmlFiles) f.delete();
                                        }
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }
                        }
                }

                // UPDATE import.xml: for each XML-TXM file, we must retrieve the first word ID from the XSL output files
                println "-- Fetching page word IDs..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(editionsCreated.keySet().size())
                for (String editionName : editionsCreated.keySet()) {
                        cpb.tick()

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        File xslFile = editionsCreated[editionName]
                        for (File txmFile : inputDirectory.listFiles()) {
                                if (txmFile.isDirectory()) continue;
                                // NOTE(review): the name is cut at the FIRST "." here whereas the 'default' and
                                // 'facs' steps cut at the LAST one; names differ for files such as "a.b.xml"
                                String textName = txmFile.getName()
                                int idx4 = textName.indexOf(".")
                                if (idx4 > 0) textName = textName.substring(0, idx4);

                                getFirstWordIDs(textName, editionName, newEditionDirectory, xslFile, txmFile);
                        }

                        // declare the edition with the pagination settings of the "default" edition
                        def editionDeclaration = project.getEditionDefinition(editionName);
                        editionDeclaration.setBuildEdition(true)
                        editionDeclaration.setPageBreakTag(project.getEditionDefinition("default").getPageElement())
                        editionDeclaration.setWordsPerPage(project.getEditionDefinition("default").getWordsPerPage())
                }
                println ""
                return true;
        }
396

    
397
        /**
         * Declares in the project the pages of one text for one XSL edition.
         *
         * The pages are the regular files of newEditionDirectory named
         * '{textName}_{page name}.html'. They are registered in ascending order of the
         * number found between the last "_" and the extension; pages without such a
         * number are registered last, in name order. The first word id of a page is
         * read from the page itself with getMetaContent().
         *
         * @param textName the name of the text
         * @param editionName the name of the edition to create or to reset
         * @param newEditionDirectory the directory containing the HTML pages of the edition
         * @param xslFile the stylesheet that created the edition (not used by this method)
         * @param txmFile the XML-TXM file of the text, set as source and TXM file of the text
         */
        private void getFirstWordIDs(String textName, String editionName, File newEditionDirectory, File xslFile, File txmFile) {
                // fetch or create the text declaration
                Text t = project.getText(textName);
                if (t == null) {
                        t = new Text(project);
                }
                t.setName(textName);
                t.setSourceFile(txmFile)
                t.setTXMFile(txmFile)

                Edition edition = t.getEdition(editionName)
                if (edition == null) { // new edition
                        edition = new Edition(t);
                } else { // replacing existing edition
                        edition.resetPages()
                }
                edition.setName(editionName);
                edition.setIndex(outputDirectory.getAbsolutePath());

                String prefix = textName + "_"
                String suffix = ".html"

                // keep only the pages of this text BEFORE sorting: the directory also holds the
                // pages of the other texts and possibly files without any page number
                // NOTE(review): a text named "a" also matches the pages of a text named "a_b"
                def pageFiles = []
                File[] children = newEditionDirectory.listFiles() // null if the directory cannot be listed
                if (children != null) {
                        for (File child : children) {
                                String childName = child.getName()
                                if (child.isFile() && childName.startsWith(prefix) && childName.endsWith(suffix)) {
                                        pageFiles << child
                                }
                        }
                }

                // number between the last "_" and the extension, or MAX_VALUE when it is not a number
                def pageNumber = { File f ->
                        String n = f.getName()
                        try {
                                return Integer.parseInt(n.substring(n.lastIndexOf("_") + 1, n.lastIndexOf(".")))
                        } catch (NumberFormatException ignored) {
                                return Integer.MAX_VALUE
                        }
                }
                pageFiles.sort { f1, f2 ->
                        int byNumber = pageNumber(f1) <=> pageNumber(f2)
                        return byNumber != 0 ? byNumber : (f1.getName() <=> f2.getName())
                }

                for (File f : pageFiles) {
                        String fileName = f.getName()
                        String firstWordID = getMetaContent(f);
                        // the page name is what stands between '{textName}_' and '.html'
                        String pagename = fileName.substring(prefix.length(), fileName.length() - suffix.length())
                        edition.addPage(pagename, firstWordID)
                }
        }
437

    
438
        /**
         * Reads the first word id declared in an HTML page, i.e. the value of the
         * 'content' attribute of the first 'meta' element whose 'name' attribute is
         * "txm:first-word-id".
         *
         * The search stops at the 'body' start tag. DTD support is disabled on the
         * parser so the DTD declared in the doctype is ignored.
         *
         * @param f the HTML page to read; must be parsable as XML
         * @return the first word id, or "" when the meta element is not found before
         *         'body' (or before the end of the document) or has no 'content' attribute
         * @throws IOException if the file cannot be opened
         * @throws XMLStreamException if the file cannot be parsed
         */
        public static String getMetaContent(File f) {
                String META = "meta"
                String BODY = "body"
                String NAME = "name"
                String DESCRIPTION = "txm:first-word-id"
                String CONTENT = "content"

                def factory = XMLInputFactory.newInstance();
                factory.setProperty("javax.xml.stream.supportDTD", false); // ignore the DTD declared in doctype

                def inputData = f.toURI().toURL().openStream();
                def parser = null
                try {
                        parser = factory.createXMLStreamReader(inputData);

                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) continue;

                                String localName = parser.getLocalName()
                                if (META.equals(localName)) {
                                        // both values are local to this element so that nothing leaks from a previous meta
                                        String desc = "";
                                        String content = "";
                                        for (int i = 0 ; i < parser.getAttributeCount(); i++) { // scan attributes
                                                if (NAME.equals(parser.getAttributeLocalName(i))) { // found @name
                                                        desc = parser.getAttributeValue(i)
                                                } else if (CONTENT.equals(parser.getAttributeLocalName(i))) { // found @content
                                                        content = parser.getAttributeValue(i)
                                                }
                                        }
                                        if (DESCRIPTION.equals(desc)) { // stop now
                                                return content;
                                        }
                                } else if (BODY.equals(localName)) {
                                        break; // no need to go further, the meta element was not found
                                }
                        }
                        return "";
                } finally {
                        // always release the parser and the stream, also when parsing fails
                        try {
                                if (parser != null) parser.close();
                        } finally {
                                inputData.close();
                        }
                }
        }
480

    
481
        /**
         * Manual check of getMetaContent(): prints the first word id read from a
         * hard-coded sample page of a developer workstation.
         *
         * @param args not used
         */
        public static void main(def args) {
                File samplePage = new File("/home/mdecorde/TXM/corpora/QGRAALXTZ/HTML/QGRAALXTZ/default", "qgraal_cm_test201510_page_160_2.html")
                String firstWordId = getMetaContent(samplePage)
                println "RESULT: "+firstWordId
        }
484
}