Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xtz / XTZImporter.groovy @ 966

History | View | Annotate | Download (14.9 kB)

1
package org.txm.importer.xtz
2

    
3
import filters.Tokeniser.SimpleTokenizerXml
4

    
5
import java.io.File;
6

    
7
import net.sf.saxon.style.XSLParam;
8

    
9
import org.txm.core.preferences.TBXPreferences;
10
import org.txm.importer.ApplyXsl2
11
import org.txm.importer.CleanFile
12
import org.txm.importer.MileStoneProjection
13
import org.txm.importer.ValidateXml
14
import org.txm.*
15

    
16
import javax.xml.parsers.SAXParser
17
import javax.xml.parsers.SAXParserFactory
18
import javax.xml.stream.*
19

    
20
import org.w3c.dom.Element
21
import org.xml.sax.XMLReader
22
import org.txm.utils.xml.DomUtils
23
import org.txm.metadatas.Metadatas
24
import org.txm.objects.BaseParameters
25
import org.txm.importer.xmltxm.Xml2Ana
26
import org.txm.stat.utils.ConsoleProgressBar
27
import org.txm.utils.io.*
28

    
29
class XTZImporter extends Importer {
30

    
31
        boolean cancelNow = false;
32
        def filesToProcess = []
33

    
34
        File frontXSLResultDirectory;
35
        File tokenizedDirectory;
36
        File sentencedDirectory;
37

    
38
        String xslPath;
39
        def xslParams;
40
        String lang;
41
        String wordTag;
42
        boolean skipDoTokenizeStep = false;
43

    
44
        Metadatas metadata = null; // text metadata
45

    
46
        /**
         * Creates an importer bound to the given import module.
         *
         * @param module the import module, handed as-is to the parent constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
49

    
50
        /**
         * @return the text metadata set by process() when a metadata file was found,
         *         or null otherwise (the field is initialised to null)
         */
        public Metadatas getMetadata() {
                return metadata
        }
53

    
54
        /**
         * Runs the import pipeline on the files of the input directory.
         *
         * The steps are called in this order and the method stops at the first one
         * that returns false: fix surrogates, split-merge XSL, front XSL, XML check,
         * tokenization, post-tokenization XSL, milestone encoding, XML-TXM building
         * and metadata injection.
         *
         * When the method returns early because nothing can be processed, or because
         * the metadata file could not be copied, the "reason" field is set.
         * "isSuccessFul" is only assigned once every step has returned true.
         */
        public void process() {

                BaseParameters params = module.getParameters()
                File binDir = module.getBinaryDirectory()

                String corpusname = params.getCorpusName()
                Element corpusElem = params.corpora.get(corpusname)

                def files = inputDirectory.listFiles()
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return // no file to process in the directory
                }

                // keep only the files that may be source texts: skip hidden files,
                // directories, the import parameters and auxiliary/backup files
                def ignoredSuffixes = [".properties", ".csv", ".dtd", ".xsl", "~"]
                def srcFiles = []
                for (File f : files) {
                        String name = f.getName()
                        if (f.isHidden() || f.isDirectory()) continue
                        if (name.equals("import.xml") || name.startsWith(".")) continue
                        if (ignoredSuffixes.any { String suffix -> name.endsWith(suffix) }) continue

                        srcFiles << f
                }

                if (srcFiles.size() == 0) {
                        reason = "No suitable file to process in "+inputDirectory
                        return // no suitable file to process in the directory
                }

                // prepare front XSL if any
                xslPath = params.getXsltElement(corpusElem).getAttribute("xsl")
                xslParams = params.getXsltParams(corpusElem)

                lang = corpusElem.getAttribute("lang")

                wordTag = params.getWordElement().getTextContent()
                this.skipDoTokenizeStep = params.getSkipTokenization()

                // prepare metadata if any: work on a copy stored in the binary directory
                File allmetadatafile = Metadatas.findMetadataFile(inputDirectory)
                if (allmetadatafile != null && allmetadatafile.exists()) {
                        // the original code referenced an undefined "allmetadatasfile" here
                        File copy = new File(binDir, allmetadatafile.getName())
                        if (!FileCopy.copy(allmetadatafile, copy)) {
                                reason = "Could not create a copy of the metadata file "+allmetadatafile.getAbsoluteFile()
                                println "Error: could not create a copy of the metadata file "+allmetadatafile.getAbsoluteFile();
                                return
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
                                        Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
                                        Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
                }

                String cleanDirectories = params.getKeyValueParameters().get(ImportKeys.CLEAN)

                if (!doFixSurrogates()) return
                if (!doSplitMergeXSLStep()) return
                if (!doFrontXSLStep()) return
                if (!doCheckXMLFileStep()) return
                if (!doTokenizeStep()) return
                if (!doPostTokenizeXSLStep()) return
                if (!doEncodeMileStonesStep()) return
                if (!doToXMLTXMStep()) return
                if (!doInjectMetadataStep()) return

                // filesToProcess may be null when the last directory listing failed
                isSuccessFul = filesToProcess != null && filesToProcess.size() > 0

                // optionally remove the intermediate working directories
                if ("true".equals(cleanDirectories)) {
                        new File(binDir, "tokenized").deleteDir()
                        new File(binDir, "src").deleteDir()
                }
        }
137

    
138
        /**
         * Copies every entry of the input directory into $bindir/src.
         *
         * Entries whose lower-cased name ends with ".xml" go through
         * CleanFile.removeSurrogateFromXmlFile; all others are copied with
         * FileCopy.copyFiles. On success the input directory becomes $bindir/src.
         *
         * @return false (with "reason" set) when the input directory cannot be
         *         listed, true otherwise
         */
        public boolean doFixSurrogates() {
                File cleanedDirectory = new File(module.getBinaryDirectory(), "src")
                cleanedDirectory.mkdir()

                def entries = inputDirectory.listFiles()
                if (entries == null) {
                        reason = "No file to process in $inputDirectory"
                        return false
                }

                for (File entry : entries) {
                        String entryName = entry.getName()
                        File target = new File(cleanedDirectory, entryName)
                        boolean isXml = entryName.toLowerCase().endsWith(".xml")
                        if (!isXml) {
                                FileCopy.copyFiles(entry, target)
                                continue
                        }
                        CleanFile.removeSurrogateFromXmlFile(entry, target)
                }

                // the files to process are now in the "src" directory
                inputDirectory = cleanedDirectory
                return true
        }
166

    
167
        /**
         * Applies the split-merge stylesheet, if any, to the files of the input directory.
         *
         * Stylesheets are looked up in "xsl/1-split-merge" under the module source
         * directory. Only the first ".xsl" file (after sorting) is applied. The
         * "output-directory" XSL parameter is set to the URI of $bindir/src, and
         * the input directory is switched to $bindir/src whenever the stylesheet
         * directory contains at least one entry.
         *
         * @return false (with "reason" set) when the stylesheet could not be applied,
         *         true otherwise, including when there is nothing to apply
         */
        public boolean doSplitMergeXSLStep() {
                File srcDirectory = new File(module.getBinaryDirectory(), "src")
                srcDirectory.mkdir()
                xslParams["output-directory"] = srcDirectory.getAbsoluteFile().toURI().toString()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"
                def xslFiles = xslDirectory.listFiles()
                if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams, false)) {
                                // report the stylesheet that actually failed and stop the import,
                                // as the other XSL steps do
                                reason = "Fail to apply split-merge XSL: $xslFile"
                                return false
                        }
                        println ""
                        break // THERE IS ONLY ONE XSL FILE TO APPLY
                }

                inputDirectory = srcDirectory // the files to process are now in the "src" directory
                return true
        }
201

    
202
        /**
         * Projects the configured milestone elements onto the files of $bindir/tokenized.
         *
         * The milestone names come from the comma-separated text of the milestones
         * parameter element; blank names are ignored. For each milestone and each
         * file, MileStoneProjection writes to a temporary file created next to the
         * input file, which then replaces the input file.
         *
         * @return true when there is no milestone to encode or when every file was
         *         rewritten; false (with "reason" set) as soon as a projection or a
         *         file replacement fails
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getParameters().getMilestonesElement().getTextContent().trim()
                if (milestonesString.length() == 0) return true

                // trim the names and drop blank ones, e.g. the middle entry of "lb,,pb"
                def milestones = milestonesString.split(",").collect { it.trim() }.findAll { it.length() > 0 }
                if (milestones.isEmpty()) return true

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (String milestone : milestones) {
                        cpb.tick()

                        for (File inputFile : filesToProcess) {
                                File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile())
                                println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        outputFile.delete() // do not leave the temporary file among the tokenized files
                                        String message = "Fail to encode $milestone in $inputFile"
                                        reason = message
                                        println message
                                        return false
                                }

                                if (!inputFile.delete()) {
                                        outputFile.delete() // the original file is kept, drop the temporary result
                                        String message = "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        reason = message
                                        println message
                                        return false
                                }

                                if (!outputFile.renameTo(inputFile)) {
                                        // the input file is already deleted: keep the temporary file, it is the only copy left
                                        String message = "Fail to encode $milestone in ${inputFile}: could not rename $outputFile."
                                        reason = message
                                        println message
                                        return false
                                }
                        }
                }
                println ""
                return true
        }
238

    
239
        /**
         * Applies the front stylesheets, if any, to the files of the input directory.
         *
         * Every ".xsl" file of "xsl/2-front" under the module source directory is
         * applied in sorted order. Each one reads from the current input directory
         * and writes to the output directory; after a successful application the
         * input directory becomes the output directory.
         *
         * @return false (with "reason" set) as soon as a stylesheet fails,
         *         true otherwise, including when there is nothing to apply
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with $frontXSLdirectory"
                def xslFiles = frontXSLdirectory.listFiles()
                if (!frontXSLdirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
                                // name the stylesheet that failed; xslPath is not the file applied here
                                reason = "Fail to apply front XSL: $xslFile"
                                return false
                        }
                        inputDirectory = outputDirectory // the files to process are now in the "txm" directory
                        println ""
                }
                return true
        }
270

    
271
        /**
         * Rebuilds filesToProcess from the input directory.
         *
         * A file is kept when its lower-cased name ends with ".xml", it is readable,
         * it is neither hidden nor a directory, its name does not start with
         * "metadata", ValidateXml.test accepts it and it is not "import.xml".
         *
         * @return true when at least one file was kept; false otherwise ("reason"
         *         is set only when the directory had entries but none was kept)
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Check XML files for well-formedness."
                def candidates = inputDirectory.listFiles()

                if (candidates == null || candidates.size() == 0) {
                        println "No XML file (*.xml) to process. Aborting"
                        return false
                }
                candidates = candidates.sort()

                ConsoleProgressBar progress = new ConsoleProgressBar(candidates.size())
                for (File candidate : candidates) {
                        progress.tick()
                        String name = candidate.getName()
                        if (!name.toLowerCase().endsWith(".xml")) continue

                        boolean eligible = !candidate.isHidden() && candidate.canRead() &&
                                        !name.endsWith(".properties") &&
                                        !name.startsWith("metadata") && !candidate.isDirectory()
                        if (!eligible) continue

                        // validation runs before the "import.xml" exclusion, as in the original flow
                        if (!ValidateXml.test(candidate)) continue
                        if (name.equals("import.xml")) continue

                        filesToProcess << candidate
                }
                println ""

                if (filesToProcess.size() > 0) {
                        return true
                }
                reason = "No file suitable to process in "+inputDirectory
                return false
        }
306

    
307

    
308
        /**
         * Writes one file per entry of filesToProcess into $bindir/tokenized.
         *
         * When skipDoTokenizeStep is set the files are copied unchanged and
         * filesToProcess is left as it was. Otherwise each file goes through
         * SimpleTokenizerXml, configured with the word tag and with the tag lists
         * read from the import parameters; the output of a failed tokenization is
         * deleted and the loop continues. filesToProcess then becomes the content
         * of $bindir/tokenized.
         *
         * @return always true
         */
        public boolean doTokenizeStep() {

                println "-- Tokenizing "+filesToProcess.size()+" files"
                File tokenizedDirectory = new File(module.getBinaryDirectory(), "tokenized")
                tokenizedDirectory.mkdir()

                // get the element names to ignore, as "a|b|c" alternations
                String outSideTextTagsRegex = ""
                String outSideTextTagsAndKeepContentRegex = ""
                String noteRegex = ""
                if (module.getParameters().getTextualPlans() != null) {
                        outSideTextTagsRegex = joinTagNames(module.getParameters().getOutSideTextTagsElement())
                        outSideTextTagsAndKeepContentRegex = joinTagNames(module.getParameters().getOutSideTextTagsAndKeepContentElement())
                        noteRegex = joinTagNames(module.getParameters().getNoteElement())
                }

                if (skipDoTokenizeStep) {
                        println "No tokenization do to."
                        for (File f : filesToProcess) {
                                File outfile = new File(tokenizedDirectory, f.getName())
                                if (!FileCopy.copy(f, outfile)) {
                                        // best effort, like a failed tokenization: report and go on
                                        println "Warning: could not copy "+f+" to "+outfile
                                }
                        }
                        return true
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File f : filesToProcess) {
                        cpb.tick()
                        File outfile = new File(tokenizedDirectory, f.getName())
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)
                        if (wordTag.length() > 0) {
                                tokenizer.setWord_tags(wordTag)
                        }
                        tokenizer.setWord_element_to_create(wordTag)

                        if (outSideTextTagsRegex.trim().length() > 0) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }
                        if (outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }
                        if (noteRegex.trim().length() > 0) {
                                tokenizer.setNote(noteRegex)
                        }

                        if (!tokenizer.process()) {
                                println("Failed to process "+f)
                                outfile.delete()
                        }
                }

                filesToProcess = tokenizedDirectory.listFiles()
                println ""

                return true
        }

        /**
         * Turns the comma-separated text of a parameter element into a "|" separated
         * alternation of the trimmed names.
         *
         * @param element the parameter element, may be null
         * @return the alternation, or "" when the element or its text is missing
         */
        private static String joinTagNames(Element element) {
                if (element == null) return ""
                String content = element.getTextContent()
                if (content == null) return ""
                return content.split(",").collect { it.trim() }.join("|")
        }
393

    
394
        /**
         * Applies the stylesheets of "xsl/3-posttok" (under the module source
         * directory) to the files of $bindir/tokenized.
         *
         * filesToProcess is reset to the content of $bindir/tokenized in all cases.
         *
         * @return true when the stylesheet directory is missing or empty, otherwise
         *         the result of ApplyXsl2.processWithMultipleXSL
         */
        public boolean doPostTokenizeXSLStep() {

                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = xslDirectory.listFiles()
                boolean hasStylesheets = xslDirectory.exists() && stylesheets != null && stylesheets.size() > 0
                if (hasStylesheets) {
                        println "-- Posttokenize XSL Step with $xslDirectory"
                        return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams)
                }
                return true
        }
408

    
409
        /**
         * Injects the loaded metadata into every file of filesToProcess.
         *
         * Each file is rewritten through a temporary file created in the same
         * directory, which then takes the place of the original. Nothing is done
         * when no metadata was loaded. A file whose injection fails is left
         * untouched and its temporary file is removed.
         *
         * @return false when an original file no longer exists after the
         *         replacement attempt, true otherwise
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) {
                        return true
                }

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar progress = new ConsoleProgressBar(filesToProcess.size())
                for (File source : filesToProcess) {
                        progress.tick()
                        File injected = File.createTempFile("temp", ".xml", source.getParentFile())

                        if (!metadata.injectMetadatasInXml(source, injected, "text", null)) {
                                injected.delete() // fail
                                continue
                        }

                        boolean replaced = source.delete() && injected.renameTo(source)
                        if (!replaced) {
                                println "Warning can't rename file "+injected+" to "+source
                        }
                        if (!source.exists()) {
                                println "Error: could not replace $source by $injected"
                                return false
                        }
                }
                println ""
                return true
        }
430

    
431
        /**
         * Converts every file of $bindir/tokenized with Xml2Ana and writes the
         * result, under the same file name, in the output directory.
         *
         * The output of a failed conversion is deleted and the loop continues.
         * filesToProcess finally becomes the content of the output directory.
         *
         * @return always true
         */
        public boolean doToXMLTXMStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                ConsoleProgressBar progress = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")

                for (File tokenizedFile : filesToProcess) {
                        progress.tick()
                        File target = new File(outputDirectory, tokenizedFile.getName())

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        // ids of all the respStmt elements must be listed
                        def respIds = []
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>()
                        // maps a respId to the type attributes of the ana property of the TXM w,
                        // used to build the references to the taxonomies
                        def usedTaxonomies = new HashMap<String,String[]>()
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>()
                        // respStmt information:
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>()

                        // run the conversion
                        def builder = new Xml2Ana(tokenizedFile)
                        builder.setWordTag(module.getParameters().getWordElement().getTextContent())
                        builder.setConvertAllAtrtibutes(true)
                        builder.setCorrespondances(correspRef, correspType)
                        builder.setHeaderInfos(respIds, resps, applications, usedTaxonomies, itemsURI)

                        boolean converted = builder.process(target)
                        if (!converted) {
                                println("Failed to process "+tokenizedFile)
                                target.delete()
                        }
                }

                filesToProcess = outputDirectory.listFiles()
                println ""
                return true
        }
471

    
472
        /**
         * Intentionally does nothing: the check on the output directory listing
         * is disabled (kept below for reference).
         */
        @Override
        public void checkFiles() {
                // disabled check:
                // isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
        }
476

    
477
        /**
         * Records a cancellation request by raising the cancelNow flag.
         * NOTE(review): no step visible in this class reads the flag - confirm
         * where it is consulted.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
481
}