Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZPager.groovy @ 1196

History | View | Annotate | Download (15.1 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import java.io.File;
4
import java.util.ArrayList;
5

    
6
import org.txm.objects.BaseParameters
7
import org.w3c.dom.Element
8
import org.txm.scripts.importer.*
9
import org.txm.stat.utils.ConsoleProgressBar
10
import org.txm.utils.BundleUtils;
11
import org.txm.utils.io.FileCopy;
12
import org.txm.utils.i18n.*
13
import org.txm.importer.xtz.*
14
import javax.xml.stream.*
15
import org.txm.objects.*
16
import org.txm.importer.ApplyXsl2
17

    
18
class XTZPager extends Pager {
19

    
20
        // project being imported; read from the import module in the constructor
        Project project;
21

    
22
        // NOTE(review): not assigned or read anywhere in the visible part of this class - confirm it is still needed
        Element corpusElem;
23
        // language of the project; used to fetch the "no space before/after" lists
        String lang;
24
        // pagination element name of the "default" edition definition
        String page_element;
25
        // word element name given by the project tokenizer settings; passed to the 'facs' pager step
        String wordTag;
26
        // words-per-page setting of the "default" edition definition
        int wordsPerPage;
27

    
28
        // "css", "js" and "images" sub-directories of the source directory (they may not exist)
        File cssDirectory, jsDirectory, imagesDirectory;
29

    
30
        /**
         * Creates the pager of the "default" edition and caches the project
         * settings and the source resource directories used by the edition steps.
         *
         * @param module the import module giving access to the project and to the source directory
         */
        public XTZPager(ImportModule module) {
                super(module, "default");

                this.project = module.getProject()
                this.lang = this.project.getLang();

                // pagination settings are read from the "default" edition definition
                def defaultEditionDefinition = this.project.getEditionDefinition("default")
                this.wordsPerPage = defaultEditionDefinition.getWordsPerPage()
                this.page_element = defaultEditionDefinition.getPageElement()
                this.wordTag = this.project.getTokenizerWordElement()

                // optional resource directories located next to the sources
                File sourceDirectory = module.getSourceDirectory()
                this.cssDirectory = new File(sourceDirectory, "css")
                this.jsDirectory = new File(sourceDirectory, "js")
                this.imagesDirectory = new File(sourceDirectory, "images")
        }
44

    
45
        /**
         * Runs the edition steps in order: 'default' edition, 'facs' edition, then
         * the post-edition XSL stylesheets. The pager is flagged as successful only
         * when the three steps return true; the first failing step stops the chain.
         *
         * @param files the files to paginate; when null the input directory is listed and sorted
         */
        public void process(ArrayList<File> files) {
                super.process(files);

                // NOTE(review): this fallback only rebinds the local parameter, after
                // super.process() already ran; the steps below presumably read an
                // inherited 'files' member - confirm the fallback has any effect.
                if (files == null) {
                        files = inputDirectory.listFiles();
                        if (files != null) {
                                Collections.sort(files);
                        }
                }

                // '&&' short-circuits: a failing step prevents the following ones from running
                boolean allStepsDone = doDefaultEditionStep() && doFacsEditionStep() && doPostEditionXSLStep()
                if (allStepsDone) {
                        isSuccessFul = true;
                        println ""
                }
        }
60

    
61
        /**
         * Builds the 'default' HTML edition of each input text and declares its pages.
         *
         * Nothing is done when the 'default' edition definition does not ask for the
         * edition to be built. A text whose first HTML page is at least as recent as
         * its input file is skipped. A failure on one text is reported on the console
         * and the next text is processed. Finally the TXM stylesheet and the css, js
         * and images directories of the sources are copied in the output directory.
         *
         * @return true; per-text failures are only reported, they do not fail the step
         */
        public boolean doDefaultEditionStep() {

                if (!project.getEditionDefinition("default").getBuildEdition()) {
                        return true; // edition not requested
                }

                println "-- Building 'default' edition of  ${files.size()} texts..."

                def css = ["css/txm.css", "css/${corpusname}.css"] // default CSS inclusion

                // declare the source CSS files in each HTML page; they are sorted because
                // File.listFiles() gives no ordering guarantee and the declaration order matters
                if (cssDirectory.exists()) {
                        def cssFiles = cssDirectory.listFiles();
                        if (cssFiles != null) {
                                Arrays.sort(cssFiles)
                                for (File cssFile : cssFiles) {
                                        if (cssFile.isFile() && !cssFile.isHidden() && cssFile.getName().endsWith(".css")) {
                                                css << "css/"+cssFile.getName();
                                        }
                                }
                        }
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        try {
                                cpb.tick()

                                // text name = file name without its last extension
                                String textname = txmFile.getName();
                                int dot = textname.lastIndexOf(".");
                                if (dot > 0) textname = textname.substring(0, dot);

                                // skip the text if its first page is already up to date
                                File firstHTMLPageFile = new File(outputDirectory, textname+"_1.html");
                                if (firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()) continue;

                                List<String> noSpaceBefore = LangFormater.getNoSpaceBefore(lang);
                                List<String> noSpaceAfter = LangFormater.getNoSpaceAfter(lang);

                                // get or create the text
                                Text t = project.getText(textname);
                                if (t == null) {
                                        t = new Text(project);
                                }
                                t.setName(textname);
                                t.setSourceFile(txmFile)
                                t.setTXMFile(txmFile)

                                def ed = new XTZDefaultPagerStep(this, txmFile, textname, noSpaceBefore, noSpaceAfter, css);
                                if (!ed.process()) {
                                        println "Fail to build 'default' edition for text: $txmFile"
                                        continue;
                                }

                                // get or create the edition
                                Edition edition = t.getEdition("default")
                                if (edition == null) {
                                        edition = new Edition(t);
                                }
                                edition.setName("default");
                                edition.setIndex(outputDirectory.getAbsolutePath());

                                // pages are named "1", "2", ...; "w_0" is used when no first word ID is known for a page
                                def pageFiles = ed.getPageFiles()
                                def firstWordIds = ed.getIdx()
                                for (int p = 0 ; p < pageFiles.size() ; p++) {
                                        String wordid = "w_0";
                                        if (p < firstWordIds.size()) wordid = firstWordIds.get(p);
                                        edition.addPage(""+(p + 1), wordid);
                                }
                        } catch(Exception e) {
                                println "Error: could not create $txmFile 'default' edition: "+e
                        }
                }

                // copy default TXM css file in the "default" edition directory
                File csshtmlDirectory = new File(outputDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "default" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        File jshtmlDirectory = new File(outputDirectory, "js")
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(outputDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                println ""
                return true;
        }
151

    
152
        /**
         * Builds the 'facs' (facsimile) edition of each input text and declares its pages.
         *
         * Nothing is done when the 'facs' edition definition does not ask for the
         * edition to be built. The images location is either a "http..." address or a
         * local directory; a local location that is empty or is not an existing
         * directory is ignored (null is given to the pager step and nothing is copied).
         * A text whose first HTML page is at least as recent as its input file is skipped.
         *
         * @return false when an exception is raised while processing a text, true otherwise
         */
        public boolean doFacsEditionStep() {

                def facsDefinition = project.getEditionDefinition("facs")
                if (!facsDefinition.getBuildEdition()) return true;

                String imageDirectoryPath = facsDefinition.getImagesDirectory()
                imageDirectoryPath = (imageDirectoryPath == null) ? "" : imageDirectoryPath.trim()
                boolean remoteImages = imageDirectoryPath.startsWith("http")

                // a local images location is usable only if it is set AND is an existing directory
                File imageDirectory = new File(imageDirectoryPath);
                if (!remoteImages && (imageDirectoryPath.length() == 0 || !imageDirectory.isDirectory())) {
                        imageDirectory = null;
                }

                println "-- Building 'facs' edition of ${files.size()} texts..."
                File newEditionDirectory = new File(htmlDirectory, "facs");
                newEditionDirectory.mkdirs();

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File txmFile : files) {
                        cpb.tick()

                        // text name = file name without its last extension
                        String txtname = txmFile.getName();
                        int dot = txtname.lastIndexOf(".");
                        if (dot > 0) txtname = txtname.substring(0, dot);

                        // skip the text if its first page is already up to date
                        File firstHTMLPageFile = new File(newEditionDirectory, txtname+"_1.html");
                        if (firstHTMLPageFile.exists() && firstHTMLPageFile.lastModified() >= txmFile.lastModified()) continue;

                        // get or create the text
                        Text t = project.getText(txtname);
                        if (t == null) {
                                t = new Text(project);
                        }
                        t.setName(txtname);
                        t.setSourceFile(txmFile)
                        t.setTXMFile(txmFile)

                        // get or create the edition
                        Edition edition = t.getEdition("facs")
                        if (edition == null) {
                                edition = new Edition(t);
                        }
                        edition.setName("facs");
                        edition.setIndex(outputDirectory.getAbsolutePath());

                        try {
                                def ed = new XTZFacsPagerStep(txmFile, newEditionDirectory, imageDirectory, txtname, corpusname, "pb", "facs", wordTag, debug);
                                if (!ed.process()) {
                                        println "Fail to build 'facs' edition for text: $txmFile"
                                        continue;
                                }

                                // each entry of 'pages' is read as [page file, first word ID]
                                def pages = ed.getPageFiles()
                                for (int p = 0 ; p < pages.size() ; p++) {
                                        String wordid = pages[p][1]
                                        //TODO replace '""+(p + 1)' with something that fetch/findout the page 'name'
                                        // TODO or move the Edition and Page corpus declaration in the XTZDefaultPagerStep
                                        edition.addPage(""+(p + 1), wordid);
                                }
                        } catch (Exception e) {
                                println "Error while processing $txmFile text: "+e
                                e.printStackTrace();
                                return false;
                        }
                }

                if (!remoteImages && imageDirectory != null) { // copy files only if local
                        File editionImagesDirectory = new File(newEditionDirectory, "res/images/"+corpusname+"/facs");
                        editionImagesDirectory.mkdirs();
                        FileCopy.copyFiles(imageDirectory, editionImagesDirectory);
                }

                // copy SimpleViewer files in the "facs" edition directory
                File jshtmlDirectory = new File(newEditionDirectory, "js")
                jshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/js", "viewer", jshtmlDirectory);
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/", "images", newEditionDirectory);

                // copy default TXM css file in the "facs" edition directory
                File csshtmlDirectory = new File(newEditionDirectory, "css")
                csshtmlDirectory.mkdirs()
                BundleUtils.copyFiles("org.txm.core", "res", "org/txm/css", "txm.css", csshtmlDirectory);

                // copy CSS/JS/Images sources files in the "facs" edition directory
                if (cssDirectory.exists()) {
                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                }
                if (jsDirectory.exists()) {
                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                }
                if (imagesDirectory.exists()) {
                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                }

                project.setDefaultEditionName("default,facs");

                println ""
                return true;
        }
255

    
256
        /**
         * Applies the stylesheets of the "xsl/4-edition" source directory to build
         * XSL editions, then declares the pages of each edition created.
         *
         * Only the files named '{Number}-{editionName}-{step}.xsl' are used, in sorted
         * order. The first stylesheet of an edition recreates the edition directory
         * from the content of the input directory; the following stylesheets of the
         * same edition transform the files already present in the edition directory.
         * Nothing is done when the stylesheet directory does not exist.
         *
         * @return false when a stylesheet could not be applied ('reason' is set), true otherwise
         */
        public boolean doPostEditionXSLStep() {

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/4-edition")
                if (!xslDirectory.isDirectory()) return true; // no edition stylesheet

                // prepare the shared XSL parameters
                def xslParams = module.getProject().getXsltParameters();
                String s = module.getProject().getEditionDefinition("default").getWordsPerPage()
                if (s != null && s.length() > 0) {
                        xslParams["number-words-per-page"] = Integer.parseInt(s);
                }
                xslParams["pagination-element"] = module.getProject().getEditionDefinition("default").getPageElement()
                xslParams["import-xml-path"] = module.getProject().getProjectDirectory()
                //println "XSL PARAMS: "+xslParams

                def xslFiles = xslDirectory.listFiles()
                if (xslFiles == null) xslFiles = new File[0] // listing failed
                xslFiles.sort()

                def editionsCreated = [:] // edition name -> first stylesheet applied
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
                        if (!xslFile.getName().matches("[1-9]-.+")) continue;

                        String xslName = xslFile.getName().substring(2); // remove the "1-", "2-", etc.
                        int idx2 = xslName.indexOf(".")
                        if (idx2 > 0) xslName = xslName.substring(0, idx2)
                        else {
                                println "$xslFile is not a '.xsl' file"
                                continue;
                        }
                        int idx3 = xslName.indexOf("-")
                        if (idx3 < 0) {
                                println "$xslFile file does not follow the '{Number}-{editionName}-{step}.xsl' name pattern"
                                continue;
                        }
                        String pagerStep = xslName.substring(idx3 + 1);
                        String editionName = xslName.substring(0, idx3);

                        int idx = editionName.indexOf(".")
                        if (idx > 0) editionName = editionName.substring(0, idx);
                        println "-- Building '$editionName' XSL edition with step '$pagerStep'..."

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        xslParams["output-directory"] = newEditionDirectory.toURI().toString()

                        if (editionsCreated[editionName] == null) { // first XSL, replace an edition
                                editionsCreated[editionName] = xslFile
                                //TODO: optimisation if update is enable,
                                newEditionDirectory.deleteDir(); // delete previous edition if any
                                newEditionDirectory.mkdir()

                                boolean deleteOutputFiles = "pager" == pagerStep;
                                if (ApplyXsl2.processImportSources(xslFile, inputDirectory, newEditionDirectory, xslParams, deleteOutputFiles)) {
                                        println ""
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }

                                // copy CSS/JS/Images sources files in the new edition directory
                                if (cssDirectory.exists()) {
                                        File csshtmlDirectory = new File(newEditionDirectory, "css")
                                        FileCopy.copyFiles(cssDirectory, csshtmlDirectory)
                                }
                                if (jsDirectory.exists()) {
                                        File jshtmlDirectory = new File(newEditionDirectory, "js")
                                        FileCopy.copyFiles(jsDirectory, jshtmlDirectory)
                                }
                                if (imagesDirectory.exists()) {
                                        File imageshtmlDirectory = new File(newEditionDirectory, "images")
                                        FileCopy.copyFiles(imagesDirectory, imageshtmlDirectory)
                                }
                        } else { // N+1 XSL working with HTML files
                                // keep regular files only: the css/js/images sub-directories
                                // copied by the first XSL step are neither transformed nor deleted
                                def listed = newEditionDirectory.listFiles()
                                File[] htmlFiles = (listed == null) ? new File[0] : (listed.findAll { it.isFile() } as File[])
                                htmlFiles.sort()

                                if (ApplyXsl2.processImportSources(xslFile, htmlFiles, xslParams)) {
                                        if ("pager".equals(pagerStep)) {
                                                // delete the one page HTML files only if the XSL step is "pager"
                                                for (File f : htmlFiles) f.delete();
                                        }
                                } else {
                                        reason = "Fail to apply edition XSL: $xslFile"
                                        return false;
                                }
                        }
                }

                // for each input file, retrieve the first word ID of its pages from the XSL output files
                println "-- Fetching page word IDs..."
                def txmFiles = inputDirectory.listFiles()
                if (txmFiles == null) txmFiles = new File[0] // listing failed

                ConsoleProgressBar cpb = new ConsoleProgressBar(editionsCreated.keySet().size())
                for (String editionName : editionsCreated.keySet()) {
                        cpb.tick()

                        File newEditionDirectory = new File(htmlDirectory, editionName);
                        File xslFile = editionsCreated[editionName]
                        for (File txmFile : txmFiles) {
                                if (txmFile.isDirectory()) continue;

                                // same rule as the 'default' and 'facs' steps: strip the last extension only
                                String textName = txmFile.getName()
                                int idx4 = textName.lastIndexOf(".")
                                if (idx4 > 0) textName = textName.substring(0, idx4);

                                getFirstWordIDs(textName, editionName, newEditionDirectory, xslFile, txmFile);
                        }
                }
                println ""
                return true;
        }
373

    
374
        /**
         * Declares the pages of one edition of one text from the HTML files found in
         * the edition directory.
         *
         * A file is a page of the text when its name starts with "{textName}_" and
         * contains ".html" after that prefix; the page name is the part in between.
         * Pages are declared in the numeric order of the number following the last
         * '_' of the page name (so page 10 comes after page 9); pages without such a
         * number come last, in name order. The first word ID of a page is read from
         * its "txm:first-word-id" meta element.
         *
         * NOTE(review): a text whose name is a prefix of another text name followed
         * by '_' also matches the pages of that other text - confirm this cannot happen.
         *
         * @param textName the text name
         * @param editionName the edition name
         * @param newEditionDirectory the directory containing the HTML pages of the edition
         * @param xslFile the first stylesheet of the edition (not used here)
         * @param txmFile the input file of the text
         */
        private void getFirstWordIDs(String textName, String editionName, File newEditionDirectory, File xslFile, File txmFile) {
                // get or create the text
                Text t = project.getText(textName);
                if (t == null) {
                        t = new Text(project);
                }
                t.setName(textName);
                t.setSourceFile(txmFile)
                t.setTXMFile(txmFile)

                // get or create the edition
                Edition edition = t.getEdition(editionName)
                if (edition == null) {
                        edition = new Edition(t);
                }
                edition.setName(editionName);
                edition.setIndex(outputDirectory.getAbsolutePath());

                // select the pages of this text BEFORE sorting, so that the other
                // files of the directory cannot break the page number parsing
                String prefix = textName+"_"
                def pages = []
                def listed = newEditionDirectory.listFiles()
                if (listed != null) {
                        for (File f : listed) {
                                String fileName = f.getName()
                                int end = fileName.indexOf(".html")
                                if (!f.isFile() || !fileName.startsWith(prefix) || end < prefix.length()) continue;

                                String pagename = fileName.substring(prefix.length(), end)
                                String digits = pagename.substring(pagename.lastIndexOf("_") + 1)
                                pages << [name: pagename, number: (digits.isInteger() ? digits.toInteger() : null), file: f]
                        }
                }

                pages.sort { p1, p2 ->
                        if (p1.number != null && p2.number != null) {
                                int c = Integer.compare(p1.number, p2.number) // no overflow, unlike a subtraction
                                return (c != 0) ? c : (p1.name <=> p2.name)
                        }
                        if (p1.number != null) return -1 // numbered pages first
                        if (p2.number != null) return 1
                        return p1.name <=> p2.name
                }

                for (def page : pages) {
                        String firstWordID = getMetaContent(page.file);
                        edition.addPage(page.name, firstWordID)
                }
        }
412

    
413
        /**
         * Reads the first word ID declared in the head of an HTML page, i.e. the
         * 'content' attribute of the first &lt;meta name="txm:first-word-id" .../&gt; element.
         *
         * The page is parsed as XML (DTD support disabled) and the reading stops at
         * the first matching meta element, or at the 'body' element.
         *
         * @param f the HTML page file
         * @return the first word ID, or "" when the meta element is not found before
         *         the body or has no 'content' attribute
         * @throws IOException if the file cannot be opened
         * @throws XMLStreamException if the file is not well-formed
         */
        public static String getMetaContent(File f) {
                String META = "meta"
                String BODY = "body"
                String NAME = "name"
                String FIRST_WORD_ID = "txm:first-word-id"
                String CONTENT = "content"

                def factory = XMLInputFactory.newInstance();
                factory.setProperty("javax.xml.stream.supportDTD", false); // ignore the DTD declared in doctype

                def inputData = f.toURI().toURL().openStream();
                def parser = null
                try {
                        parser = factory.createXMLStreamReader(inputData);
                        while (parser.hasNext()) {
                                if (parser.next() != XMLStreamConstants.START_ELEMENT) continue;

                                String localName = parser.getLocalName()
                                if (BODY.equals(localName)) { // no need to go further, the meta element was not found
                                        return "";
                                }
                                if (!META.equals(localName)) continue;

                                // values are local to this meta element: nothing leaks from a previous one
                                String name = "";
                                String content = "";
                                for (int i = 0 ; i < parser.getAttributeCount(); i++) { // scan attributes
                                        if (NAME.equals(parser.getAttributeLocalName(i))) { // found @name
                                                name = parser.getAttributeValue(i)
                                        } else if (CONTENT.equals(parser.getAttributeLocalName(i))) { // found @content
                                                content = parser.getAttributeValue(i)
                                        }
                                }
                                if (FIRST_WORD_ID.equals(name)) { // stop now
                                        return content;
                                }
                        }
                        return "";
                } finally {
                        // closing the parser does not close the underlying stream: close both
                        try {
                                if (parser != null) parser.close()
                        } finally {
                                inputData.close()
                        }
                }
        }
454

    
455
        /**
         * Developer check: prints the first word ID declared in a hard-coded sample page.
         *
         * @param args not used
         */
        public static void main(def args) {
                File samplePage = new File("/home/mdecorde/TXM/corpora/QGRAALXTZ/HTML/QGRAALXTZ/default", "qgraal_cm_test201510_page_160_2.html")
                String firstWordId = getMetaContent(samplePage)
                println "RESULT: "+firstWordId
        }
458
}