Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xtz / XTZImporter.groovy @ 479

History | View | Annotate | Download (14.8 kB)

1
package org.txm.importer.xtz
2

    
3
import filters.Tokeniser.SimpleTokenizerXml
4

    
5
import java.io.File;
6

    
7
import net.sf.saxon.style.XSLParam;
8

    
9
import org.txm.importer.ApplyXsl2;
10
import org.txm.importer.CleanFile;
11
import org.txm.importer.MileStoneProjection;
12
import org.txm.importer.ValidateXml;
13
import org.txm.*;
14

    
15
import javax.xml.parsers.SAXParser
16
import javax.xml.parsers.SAXParserFactory
17
import javax.xml.stream.*;
18

    
19
import org.w3c.dom.Element
20
import org.xml.sax.XMLReader
21
import org.txm.utils.xml.DomUtils;
22
import org.txm.metadatas.Metadatas
23
import org.txm.objects.BaseParameters
24
import org.txm.scripts.teitxm.Xml2Ana
25
import org.txm.stat.utils.ConsoleProgressBar
26
import org.txm.utils.*;
27

    
28
class XTZImporter extends Importer {
29

    
30
        boolean cancelNow = false;
31
        def filesToProcess = []
32

    
33
        File frontXSLResultDirectory;
34
        File tokenizedDirectory;
35
        File sentencedDirectory;
36

    
37
        String xslPath;
38
        def xslParams;
39
        String lang;
40
        String wordTag;
41
        boolean skipDoTokenizeStep = false;
42

    
43
        Metadatas metadata = null; // text metadata
44

    
45
        /**
         * Creates an importer bound to the given import module.
         *
         * @param module the import module, forwarded to the parent constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
48

    
49
        /**
         * @return the text metadata built by process() from "metadata.csv",
         *         or null when no metadata has been loaded
         */
        public Metadatas getMetadata() {
                return metadata
        }
52

    
53
        /**
         * Runs the whole import chain on the files of the input directory.
         *
         * The steps are, in order: surrogate fix, split-merge XSL, front XSL,
         * XML check, tokenization, post-tokenization XSL, milestone encoding,
         * XML-TXM conversion and metadata injection. The chain stops at the
         * first step that returns false.
         *
         * On an early exit detected here, the "reason" field is set to a message
         * describing the failure. "isSuccessFul" is only set when all the steps
         * have run, and is true when at least one file was produced.
         */
        public void process() {

                BaseParameters params = module.getParameters()
                File binDir = module.getBinaryDirectory()

                String corpusname = params.getCorpusName()
                Element corpusElem = params.corpora.get(corpusname)

                def files = inputDirectory.listFiles()
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return // no file to process in the directory
                }

                // the import needs at least one file that is not a configuration,
                // metadata, stylesheet, backup or hidden file
                def ignoredSuffixes = [".properties", ".csv", ".dtd", ".xsl", "~"]
                def srcFiles = files.findAll { File f ->
                        String name = f.getName()
                        !f.isHidden() && !f.isDirectory() &&
                                        !name.equals("import.xml") &&
                                        !name.startsWith(".") &&
                                        !ignoredSuffixes.any { String suffix -> name.endsWith(suffix) }
                }

                if (srcFiles.size() == 0) {
                        reason = "No suitable file to process in "+inputDirectory
                        return // no file to process in the directory
                }

                // prepare front XSL if any
                xslPath = params.getXsltElement(corpusElem).getAttribute("xsl")
                xslParams = params.getXsltParams(corpusElem)

                lang = corpusElem.getAttribute("lang")

                wordTag = params.getWordElement().getTextContent()
                this.skipDoTokenizeStep = params.getSkipTokenization()

                // prepare metadata if any
                File allmetadatafile = new File(inputDirectory, "metadata.csv")
                if (allmetadatafile.exists()) {
                        File copy = new File(binDir, "metadata.csv")
                        if (!FileCopy.copy(allmetadatafile, copy)) {
                                println "Error: could not create a copy of the metadata file "+allmetadatafile.getAbsoluteFile()
                                // also record the failure, like the other early exits do
                                reason = "Could not create a copy of the metadata file "+allmetadatafile.getAbsoluteFile()
                                return
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getParam(Toolbox.METADATA_ENCODING),
                                        Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR),
                                        Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR), 1)
                }

                String cleanDirectories = params.getKeyValueParameters().get(ImportKeys.CLEAN)

                if (!doFixSurrogates()) return
                if (!doSplitMergeXSLStep()) return
                if (!doFrontXSLStep()) return
                if (!doCheckXMLFileStep()) return
                if (!doTokenizeStep()) return
                if (!doPostTokenizeXSLStep()) return
                if (!doEncodeMileStonesStep()) return
                if (!doToXMLTXMStep()) return
                if (!doInjectMetadataStep()) return

                // filesToProcess comes from File.listFiles() and may therefore be null
                isSuccessFul = filesToProcess != null && filesToProcess.size() > 0

                // intermediate directories are only removed when the whole chain has run
                if ("true".equals(cleanDirectories)) {
                        new File(binDir, "tokenized").deleteDir()
                        new File(binDir, "src").deleteDir()
                }
        }
136

    
137
        /**
         * Mirrors the content of the input directory into $bindir/src.
         *
         * Files whose lower-cased name ends with ".xml" go through
         * CleanFile.removeSurrogateFromXmlFile; every other entry is passed to
         * FileCopy.copy. The input directory is then switched to $bindir/src.
         *
         * @return false (with "reason" set) when the input directory cannot be
         *         listed, true otherwise
         */
        public boolean doFixSurrogates() {
                File cleanedDirectory = new File(module.getBinaryDirectory(), "src")
                cleanedDirectory.mkdir()

                def sources = inputDirectory.listFiles()
                if (sources == null) {
                        reason = "No file to process in $inputDirectory"
                        return false
                }

                sources.each { File source ->
                        File target = new File(cleanedDirectory, source.getName())
                        boolean isXml = source.getName().toLowerCase().endsWith(".xml")
                        if (isXml) {
                                CleanFile.removeSurrogateFromXmlFile(source, target)
                        } else {
                                FileCopy.copy(source, target)
                        }
                }

                // the files to process are now in the "src" directory
                inputDirectory = cleanedDirectory
                return true
        }
165

    
166
        /**
         * Applies the split-merge stylesheet, if any, to the files of the
         * input directory; the "output-directory" XSL parameter is set to
         * $bindir/src.
         *
         * Only one stylesheet is applied: the first non-hidden "*.xsl" file,
         * in sorted order, of the "xsl/1-split-merge" directory of the module
         * source directory. When that directory is missing or empty the step
         * does nothing.
         *
         * @return false (with "reason" set) when the stylesheet could not be
         *         applied, true otherwise
         */
        public boolean doSplitMergeXSLStep() {
                File srcDirectory = new File(module.getBinaryDirectory(), "src")
                srcDirectory.mkdir()
                xslParams["output-directory"] = srcDirectory.getAbsoluteFile().toURI().toString()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"
                def xslFiles = xslDirectory.listFiles()
                if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles = xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams, false)) {
                                // report the stylesheet that actually failed and stop the import,
                                // as the front XSL step does
                                reason = "Fail to apply split-merge XSL: $xslFile"
                                return false
                        }
                        println ""
                        break // THERE IS ONLY ONE XSL FILE TO APPLY
                }

                // the files to process are now in the "src" directory
                inputDirectory = srcDirectory
                return true
        }
200

    
201
        /**
         * Projects each configured milestone element onto the files of
         * $bindir/tokenized, using MileStoneProjection.
         *
         * The milestone names are read from the comma-separated content of the
         * milestones parameter element; blank names are ignored. Each file is
         * processed into a temporary file of the same directory, which then
         * replaces the original.
         *
         * @return true when there is nothing to encode or when every file was
         *         processed and replaced; false (with "reason" set) at the
         *         first failure
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getParameters().getMilestonesElement().getTextContent().trim()
                if (milestonesString.length() == 0) return true

                // trim the names and drop the blank ones (e.g. "lb,,pb" or "lb, ")
                def milestones = milestonesString.split(",").collect { it.trim() }.findAll { it.length() > 0 }
                if (milestones.size() == 0) return true

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (String milestone : milestones) {
                        cpb.tick()

                        for (File inputFile : filesToProcess) {
                                File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile())
                                println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        println "Fail to encode $milestone in $inputFile"
                                        reason = "Fail to encode $milestone in $inputFile"
                                        // do not leave the temporary file among the tokenized files
                                        outputFile.delete()
                                        return false
                                }

                                if (!inputFile.delete()) {
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        reason = "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        outputFile.delete()
                                        return false
                                }

                                if (!outputFile.renameTo(inputFile)) {
                                        // the original is already deleted: keep the temporary file, it is the only copy left
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        reason = "Fail to encode $milestone in ${inputFile}: could not rename $outputFile"
                                        return false
                                }
                        }
                }
                println ""
                return true
        }
237

    
238
        /**
         * Applies the front stylesheets, if any, reading from the input
         * directory and writing into the output directory.
         *
         * Every non-hidden "*.xsl" file of the "xsl/2-front" directory of the
         * module source directory is applied, in sorted order. After each
         * successful stylesheet the input directory becomes the output
         * directory. When the stylesheet directory is missing or empty the
         * step does nothing.
         *
         * @return false (with "reason" set) at the first stylesheet that
         *         could not be applied, true otherwise
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with $frontXSLdirectory"
                def xslFiles = frontXSLdirectory.listFiles()
                if (!frontXSLdirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles = xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
                                // name the stylesheet that actually failed
                                reason = "Fail to apply front XSL: $xslFile"
                                return false
                        }
                        // the files to process are now in the output directory
                        inputDirectory = outputDirectory
                        println ""
                }
                return true
        }
269

    
270
        /**
         * Rebuilds "filesToProcess" with the XML files of the input directory
         * that pass ValidateXml.test.
         *
         * A file is kept when its lower-cased name ends with ".xml", it is a
         * readable, non-hidden regular entry, its name does not start with
         * "metadata", ValidateXml.test accepts it and it is not "import.xml".
         *
         * @return false when the input directory is empty or cannot be listed,
         *         or when no file was kept (in which case "reason" is set);
         *         true otherwise
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Check XML files for well-formedness."
                def candidates = inputDirectory.listFiles()

                if (candidates == null || candidates.size() == 0) {
                        println "No XML file (*.xml) to process. Aborting"
                        return false
                }
                candidates = candidates.sort()

                ConsoleProgressBar progress = new ConsoleProgressBar(candidates.size())
                for (File candidate : candidates) {
                        progress.tick()

                        String name = candidate.getName()
                        if (!name.toLowerCase().endsWith(".xml")) continue

                        boolean rejected = candidate.isHidden() || !candidate.canRead() ||
                                        name.endsWith(".properties") ||
                                        name.startsWith("metadata") || candidate.isDirectory()
                        if (rejected) continue

                        // the file is validated before the "import.xml" exclusion, as before
                        if (!ValidateXml.test(candidate)) continue
                        if (name.equals("import.xml")) continue

                        filesToProcess << candidate
                }
                println ""

                if (filesToProcess.size() == 0) {
                        reason = "No file suitable to process in "+inputDirectory
                        return false
                }
                return true
        }
305

    
306

    
307
        /**
         * Tokenizes the files of "filesToProcess" into $bindir/tokenized.
         *
         * When tokenization is skipped the files are only copied to
         * $bindir/tokenized. Otherwise each file goes through
         * SimpleTokenizerXml, configured with the word tag and, when the
         * textual plans element is present, with the "outside text",
         * "outside text and keep content" and "note" element names. A file the
         * tokenizer fails on is reported and its output is deleted; the step
         * continues with the next file.
         *
         * @return true in all cases
         */
        public boolean doTokenizeStep() {

                println "-- Tokenizing "+filesToProcess.size()+" files"
                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                tokenizedDir.mkdir()

                String outSideTextTagsRegex = ""
                String outSideTextTagsAndKeepContentRegex = ""
                String noteRegex = ""

                // get the element names to ignore
                if (module.getParameters().getTextualPlans() != null) {
                        outSideTextTagsRegex = buildTagAlternation(module.getParameters().getOutSideTextTagsElement())
                        outSideTextTagsAndKeepContentRegex = buildTagAlternation(module.getParameters().getOutSideTextTagsAndKeepContentElement())
                        noteRegex = buildTagAlternation(module.getParameters().getNoteElement())
                }

                if (skipDoTokenizeStep) {
                        println "No tokenization do to."
                        for (File f : filesToProcess) {
                                File outfile = new File(tokenizedDir, f.getName())
                                if (!FileCopy.copy(f, outfile)) {
                                        println "Warning: could not copy "+f+" to "+outfile
                                }
                        }
                        return true
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File f : filesToProcess) {
                        cpb.tick()
                        File outfile = new File(tokenizedDir, f.getName())
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)
                        if (wordTag.length() > 0) {
                                tokenizer.setWord_tags(wordTag)
                        }
                        tokenizer.setWord_element_to_create(wordTag)

                        // an empty alternation means "not configured": keep the tokenizer defaults
                        if (outSideTextTagsRegex.length() > 0) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }
                        if (outSideTextTagsAndKeepContentRegex.length() > 0) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }
                        if (noteRegex.length() > 0) {
                                tokenizer.setNote(noteRegex)
                        }

                        // tokenize ! \o/
                        if (!tokenizer.process()) {
                                println("Failed to process "+f)
                                outfile.delete()
                        }
                }

                filesToProcess = tokenizedDir.listFiles()
                println ""

                return true
        }

        /**
         * Builds a "a|b|c" alternation from the comma-separated text content
         * of an element. Names are trimmed and blank names are dropped, so the
         * result never contains an empty alternative.
         *
         * @param element the element holding the comma-separated names, may be null
         * @return the alternation, or "" when the element is null or holds no name
         */
        private static String buildTagAlternation(Element element) {
                if (element == null) return ""
                String content = element.getTextContent()
                if (content == null) return ""
                return content.split(",").collect { it.trim() }.findAll { it.length() > 0 }.join("|")
        }
392

    
393
        /**
         * Resets "filesToProcess" to the content of $bindir/tokenized and, when
         * the "xsl/3-posttok" directory of the module source directory exists
         * and is not empty, hands these files to
         * ApplyXsl2.processWithMultipleXSL with that directory.
         *
         * @return true when there is no stylesheet directory content to apply,
         *         otherwise the result of ApplyXsl2.processWithMultipleXSL
         */
        public boolean doPostTokenizeXSLStep() {

                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                filesToProcess = tokenizedDir.listFiles()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = xslDirectory.listFiles()
                boolean hasStylesheets = xslDirectory.exists() && stylesheets != null && stylesheets.size() > 0
                if (!hasStylesheets) {
                        return true
                }

                println "-- Posttokenize XSL Step with $xslDirectory"
                return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams)
        }
407

    
408
        /**
         * Injects the loaded metadata, if any, into each file of
         * "filesToProcess".
         *
         * Each file is rewritten into a temporary file of the same directory by
         * Metadatas.injectMetadatasInXml; on success the temporary file
         * replaces the original, otherwise the temporary file is deleted and
         * the original is left as is.
         *
         * @return false when a file no longer exists after the replacement
         *         attempt, true otherwise (including when there is no metadata)
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) {
                        return true
                }

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar progress = new ConsoleProgressBar(filesToProcess.size())
                for (File source : filesToProcess) {
                        progress.tick()
                        File injected = File.createTempFile("temp", ".xml", source.getParentFile())

                        if (!metadata.injectMetadatasInXml(source, injected, "text", null)) {
                                injected.delete() // fail
                                continue
                        }

                        boolean replaced = source.delete() && injected.renameTo(source)
                        if (!replaced) {
                                println "Warning can't rename file "+injected+" to "+source
                        }
                        if (!source.exists()) {
                                println "Error: could not replace $source by $injected"
                                return false
                        }
                }
                println ""
                return true
        }
429

    
430
        /**
         * Converts each file of $bindir/tokenized with Xml2Ana, writing a file
         * of the same name into the output directory.
         *
         * A file the builder fails on is reported and its output is deleted;
         * the step continues with the next file. "filesToProcess" finally
         * holds the content of the output directory.
         *
         * @return true in all cases
         */
        public boolean doToXMLTXMStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                ConsoleProgressBar progress = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")

                for (File tokenizedFile : filesToProcess) {
                        progress.tick()
                        File txmFile = new File(outputDirectory, tokenizedFile.getName())

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()

                        // the ids of all the respStmt must be listed
                        def respId = []
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>()
                        // maps a respId to the type attributes of the ana property of the txm w,
                        // used to build the refs to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>()
                        // associates an item id with its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>()
                        // respStmt information:
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>()

                        // run the conversion
                        def converter = new Xml2Ana(tokenizedFile)
                        converter.setWordTag(module.getParameters().getWordElement().getTextContent())
                        converter.setConvertAllAtrtibutes(true)
                        converter.setCorrespondances(correspRef, correspType)
                        converter.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
                        //converter.setAddTEIHeader();

                        boolean converted = converter.process(txmFile)
                        if (!converted) {
                                println("Failed to process "+tokenizedFile)
                                txmFile.delete()
                        }
                }

                filesToProcess = outputDirectory.listFiles()
                println ""
                return true
        }
470

    
471
        /**
         * Does nothing: the check of the output directory is disabled and
         * only kept as a comment in the body.
         */
        @Override
        public void checkFiles() {
                //isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
        }
475

    
476
        /**
         * Records a cancellation request by raising the "cancelNow" flag.
         * NOTE(review): none of the steps of this class reads the flag.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
480
}