Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZImporter.groovy @ 2246

History | View | Annotate | Download (15 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml
4

    
5
import java.io.File;
6

    
7
import net.sf.saxon.style.XSLParam;
8

    
9
import org.txm.core.preferences.TBXPreferences;
10
import org.txm.importer.ApplyXsl2
11
import org.txm.scripts.importer.CleanFile
12
import org.txm.scripts.importer.MileStoneProjection
13
import org.txm.importer.ValidateXml
14
import org.txm.*
15

    
16
import javax.xml.parsers.SAXParser
17
import javax.xml.parsers.SAXParserFactory
18
import javax.xml.stream.*
19

    
20
import org.w3c.dom.Element
21
import org.xml.sax.XMLReader
22
import org.txm.utils.xml.DomUtils
23
import org.txm.metadatas.Metadatas
24
import org.txm.objects.*
25
import org.txm.importer.scripts.xmltxm.Xml2Ana
26
import org.txm.utils.ConsoleProgressBar
27
import org.txm.utils.io.*
28
import org.txm.importer.xtz.*
29

    
30
class XTZImporter extends Importer {
31

    
32
        /** Cancellation flag; set to true by cancel(). */
        public boolean cancelNow = false
        /** Files handed from one import step to the next; most steps reassign it. */
        public def filesToProcess = []

        // Working directories. None of them is assigned by the code of this class.
        public File frontXSLResultDirectory
        public File tokenizedDirectory
        public File sentencedDirectory

        /** Front XSL path, read from project.getFrontXSL() in process(). */
        public String xslPath
        /** XSL parameters, read from project.getXsltParameters(); the XSL steps add an "output-directory" entry. */
        public def xslParams
        /** Corpus language, read from project.getLang(); given to the tokenizer. */
        public String lang
        /** Name of the word element, read from project.getTokenizerWordElement(). */
        public String wordTag
        /** Whether the source files must be tokenized, read from project.getDoTokenizerStep(). */
        public boolean doTokenizeStep = false

        /** Text metadata; stays null when no metadata file exists in the source directory. */
        public Metadatas metadata = null
46

    
47
        /**
         * Creates the importer of the given import module.
         *
         * @param module the import module, passed as is to the Importer constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
50

    
51
        /**
         * @return the text metadata loaded by process(), or null when no metadata was loaded
         */
        public Metadatas getMetadata() {
                return this.metadata
        }
54

    
55
        /**
         * Runs the whole XTZ import workflow:
         * <ol>
         * <li>checks that the source directory contains at least one candidate file,</li>
         * <li>reads the import parameters (front XSL, XSL parameters, language, word element,
         * tokenize flag) from the project,</li>
         * <li>copies the metadata file, if any, to the binary directory and loads it,</li>
         * <li>deletes the previous Text children of the project,</li>
         * <li>runs the import steps in order, stopping at the first step that fails or as soon as
         * cancel() has been called,</li>
         * <li>optionally deletes the "tokenized", "src" and "split" working directories.</li>
         * </ol>
         *
         * On an early exit the "reason" field describes the problem when the failing code set it.
         * "isSuccessFul" is only assigned when all steps succeeded.
         */
        public void process() {

                Project project = module.getProject()
                File binDir = module.getBinaryDirectory()

                def files = inputDirectory.listFiles()
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return // the source directory cannot be listed
                }

                // at least one file must survive the source file filter
                boolean hasCandidate = false
                for (File f : files) {
                        if (isCandidateSourceFile(f)) {
                                hasCandidate = true
                                break
                        }
                }
                if (!hasCandidate) {
                        reason = "No suitable file to process in "+inputDirectory
                        return
                }

                // prepare front XSL if any
                xslPath = project.getFrontXSL()
                xslParams = project.getXsltParameters()

                lang = project.getLang()

                wordTag = project.getTokenizerWordElement()
                this.doTokenizeStep = project.getDoTokenizerStep()

                // prepare metadata if any
                File allMetadataFile = Metadatas.findMetadataFile(inputDirectory)
                if (allMetadataFile != null && allMetadataFile.exists()) {
                        File copy = new File(binDir, allMetadataFile.getName())
                        if (!FileCopy.copy(allMetadataFile, copy)) {
                                // set "reason" like the other failure paths do
                                reason = "Could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile()
                                println "Error: could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile();
                                return
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
                                        Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
                                        Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
                }

                // remove previous Text if any
                project.deleteChildren(Text.class)

                // main workflow of XTZ importer step;
                // cancelNow is tested before each step so that cancel() takes effect between two steps
                if (cancelNow || !doFixSurrogates()) return
                if (cancelNow || !doSplitMergeXSLStep()) return
                if (cancelNow || !doFrontXSLStep()) return
                if (cancelNow || !doCheckXMLFileStep()) return
                if (cancelNow || !doTokenizeStep()) return
                if (cancelNow || !doPostTokenizeXSLStep()) return
                if (cancelNow || !doEncodeMileStonesStep()) return
                if (cancelNow || !doToXMLTXMStep()) return
                if (cancelNow || !doInjectMetadataStep()) return

                // filesToProcess is null when the last directory listing failed
                isSuccessFul = filesToProcess != null && filesToProcess.size() > 0

                if ("true".equals(project.getCleanAfterBuild())) {
                        for (String dirName : ["tokenized", "src", "split"]) {
                                new File(binDir, dirName).deleteDir()
                        }
                }
        }

        /**
         * Tells if a file of the source directory may be a source file to import.
         *
         * Directories, hidden files, "import.xml", backup files ("~" suffix), dot files and
         * files ending with ".properties", ".csv", ".dtd" or ".xsl" are not candidates.
         */
        private static boolean isCandidateSourceFile(File f) {
                if (f.isHidden() || f.isDirectory()) return false

                String name = f.getName()
                if (name.equals("import.xml") || name.startsWith(".") || name.endsWith("~")) return false

                for (String extension : [".properties", ".csv", ".dtd", ".xsl"]) {
                        if (name.endsWith(extension)) return false
                }
                return true
        }
144

    
145
        /**
         * Copies the content of the input directory to a fresh $bindir/src directory.
         *
         * Entries whose name ends with ".xml" (case insensitive) go through
         * CleanFile.removeSurrogateFromXmlFile, the others through FileCopy.copyFiles.
         * Afterwards the input directory of the next steps is $bindir/src.
         *
         * @return false (with "reason" set) if the input directory cannot be listed, true otherwise
         */
        public boolean doFixSurrogates() {
                File cleanedDirectory = new File(module.getBinaryDirectory(), "src")

                def sourceEntries = inputDirectory.listFiles()
                if (sourceEntries == null) {
                        reason = "No file to process in $inputDirectory"
                        return false
                }

                // start from an empty directory
                cleanedDirectory.deleteDir()
                cleanedDirectory.mkdir()

                for (File entry : sourceEntries) {
                        String entryName = entry.getName()
                        File target = new File(cleanedDirectory, entryName)
                        boolean isXml = entryName.toLowerCase().endsWith(".xml")
                        if (!isXml) {
                                FileCopy.copyFiles(entry, target)
                        } else {
                                CleanFile.removeSurrogateFromXmlFile(entry, target)
                        }
                }

                // the next steps read the cleaned copies
                inputDirectory = cleanedDirectory
                return true
        }
175

    
176
        /**
         * Applies the split-merge stylesheet, if any, to the files of the input directory.
         *
         * The stylesheet is the first ".xsl" file (in sorted order) of the
         * "xsl/1-split-merge" directory of the source directory; only one stylesheet is applied.
         * It receives the URI of $bindir/split as "output-directory" parameter, and
         * $bindir/split becomes the input directory of the next steps when it succeeds.
         * When there is no stylesheet the input directory is left untouched.
         *
         * @return false (with "reason" set) if the stylesheet could not be applied, true otherwise
         */
        public boolean doSplitMergeXSLStep() {

                File splitedDirectory = new File(module.getBinaryDirectory(), "split")
                splitedDirectory.deleteDir()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"

                // listFiles() returns null when the directory does not exist
                def xslFiles = xslDirectory.listFiles()
                if (xslFiles == null || xslFiles.size() == 0) return true // nothing to do

                xslFiles.sort()
                File splitMergeXsl = null
                for (File candidate : xslFiles) {
                        if (candidate.isDirectory() || candidate.isHidden() || !candidate.getName().endsWith(".xsl")) continue
                        splitMergeXsl = candidate
                        break // THERE IS ONLY ONE XSL FILE TO APPLY
                }
                // no usable stylesheet: do not redirect the next steps to an empty directory
                if (splitMergeXsl == null) return true

                splitedDirectory.mkdir()
                xslParams["output-directory"] = splitedDirectory.getAbsoluteFile().toURI().toString()

                if (!ApplyXsl2.processImportSources(splitMergeXsl, inputDirectory.listFiles(), xslParams)) {
                        reason = "Fail to apply split-merge XSL: $splitMergeXsl"
                        return false
                }
                println ""

                inputDirectory = splitedDirectory // the files to process are now in the "split" directory
                return true
        }
213

    
214
        /**
         * Encodes the milestone elements declared in the "MileStones" textual plan of the project
         * into the files of $bindir/tokenized.
         *
         * The plan value is a comma separated list of element names. For each name, each
         * tokenized file is processed by a MileStoneProjection into a temporary file of the same
         * directory, which then replaces the tokenized file. The temporary file is removed
         * afterwards so that later listings of $bindir/tokenized only see the texts.
         *
         * @return false as soon as one file cannot be processed or replaced, true otherwise
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getProject().getTextualPlan("MileStones")
                if (milestonesString == null || milestonesString.length() == 0) return true

                def milestones = milestonesString.split(",")
                if (milestones.size() == 0) return true

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (def milestone : milestones) {
                        milestone = milestone.trim() // just in case
                        cpb.tick()
                        if (milestone.length() == 0) continue // blank entry, e.g. "lb,,pb"

                        for (File inputFile : filesToProcess) {
                                File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile())
                                println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        println "Fail to encode $milestone in $inputFile"
                                        outputFile.delete() // do not leave the temporary file among the texts
                                        return false
                                }

                                if (!inputFile.delete()) {
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file with the $outputFile file."
                                        outputFile.delete() // do not leave the temporary file among the texts
                                        return false
                                }

                                if (!FileCopy.copy(outputFile, inputFile)) {
                                        // the temporary file is now the only copy of the text: keep it
                                        println "Fail to encode $milestone in ${inputFile}: could not copy the $outputFile file back."
                                        return false
                                }
                                outputFile.delete()
                        }
                }
                println ""
                return true
        }
251

    
252
        /**
         * Applies the front stylesheets, if any, to the files of the input directory.
         *
         * Every ".xsl" file of the "xsl/2-front" directory of the source directory is applied,
         * in sorted order, from the current input directory to the output directory. After the
         * first successful stylesheet the output directory becomes the input directory, so the
         * following stylesheets and steps work on the transformed files.
         * The "output-directory" XSL parameter is set to the URI of the output directory, even
         * when there is no stylesheet to apply.
         *
         * @return false (with "reason" set) as soon as a stylesheet cannot be applied, true otherwise
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with the $frontXSLdirectory directory."
                def xslFiles = frontXSLdirectory.listFiles()

                xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString()

                // listFiles() returns null when the directory does not exist
                if (xslFiles == null || xslFiles.size() == 0) return true // nothing to do

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
                                // report the stylesheet that failed, not the project front XSL setting
                                reason = "Fail to apply front XSL: $xslFile"
                                return false
                        }
                        inputDirectory = outputDirectory // the files to process are now in the output directory
                        println ""
                }
                return true
        }
286

    
287
        /**
         * Selects the XML files of the input directory that will be imported.
         *
         * A file is kept in "filesToProcess" when its name ends with ".xml" (case insensitive),
         * it is a readable, non hidden, regular entry, its name does not start with "metadata",
         * it is not "import.xml" and ValidateXml.test accepts it.
         *
         * @return false (with "reason" set) when the directory is empty or no file was kept,
         * true otherwise
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Checking XML files for well-formedness."
                def files = inputDirectory.listFiles()

                if (files == null || files.size() == 0) {
                        // set "reason" like the other failure paths do
                        reason = "No file to process in "+inputDirectory
                        println "No XML file (*.xml) to process. Aborting"
                        return false
                }
                files = files.sort()

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File f : files) {
                        cpb.tick()

                        String name = f.getName()
                        if (!name.toLowerCase().endsWith(".xml")) continue
                        if (f.isHidden() || f.isDirectory() || !f.canRead()) continue
                        // the import parameters and the metadata files are not texts: no need to validate them
                        if (name.startsWith("metadata") || name.equals("import.xml")) continue

                        if (ValidateXml.test(f)) {
                                filesToProcess << f
                        }
                }
                println ""

                if (filesToProcess.size() == 0) {
                        reason = "No file suitable to process in "+inputDirectory
                        return false
                }
                return true
        }
322

    
323

    
324
        /**
         * Fills the $bindir/tokenized directory from the files of "filesToProcess".
         *
         * When the "doTokenizeStep" field is false the files are only copied. Otherwise each file
         * is tokenized by a SimpleTokenizerXml configured with the language and with the
         * "OutSideTextTags", "OutSideTextTagsAndKeepContent" and "Note" textual plans of the
         * project (comma separated element names, turned into regex alternations).
         * A file that cannot be tokenized is reported and its output is deleted; the step goes on.
         *
         * After a tokenization "filesToProcess" is the content of $bindir/tokenized.
         *
         * @return always true
         */
        public boolean doTokenizeStep() {

                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                tokenizedDir.mkdir()

                if (!doTokenizeStep) {
                        println "No tokenization do to."
                        for (File f : filesToProcess) {
                                FileCopy.copy(f, new File(tokenizedDir, f.getName()))
                        }
                        return true
                }

                // get the element names to ignore
                def project = module.getProject()
                String outSideTextTagsRegex = tagListToRegex(project.getTextualPlan("OutSideTextTags"))
                String outSideTextTagsAndKeepContentRegex = tagListToRegex(project.getTextualPlan("OutSideTextTagsAndKeepContent"))
                String noteRegex = tagListToRegex(project.getTextualPlan("Note"))

                println "-- Tokenizing "+filesToProcess.size()+" files"
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File f : filesToProcess) {
                        cpb.tick()
                        File outfile = new File(tokenizedDir, f.getName())
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)

                        // an empty regex means "not configured": keep the tokenizer defaults
                        if (outSideTextTagsRegex.length() > 0) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }
                        if (outSideTextTagsAndKeepContentRegex.length() > 0) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }
                        if (noteRegex.length() > 0) {
                                tokenizer.setNote(noteRegex)
                        }

                        // tokenize !
                        if (!tokenizer.process()) {
                                println("Failed to process "+f)
                                outfile.delete()
                        }
                }

                filesToProcess = tokenizedDir.listFiles()
                println ""

                return true
        }

        /**
         * Turns a comma separated list of element names into a regex alternation ("a|b|c").
         *
         * Names are trimmed and blank names are dropped, so that "a,,b" does not produce an
         * empty alternative. Returns "" for a null or blank list.
         */
        private static String tagListToRegex(String tagList) {
                if (tagList == null) return ""

                def names = []
                for (String tag : tagList.split(",")) {
                        String trimmed = tag.trim()
                        if (trimmed.length() > 0) names << trimmed
                }
                return names.join("|")
        }
403

    
404
        /**
         * Applies the stylesheets of the "xsl/3-posttok" directory of the source directory to
         * the files of $bindir/tokenized, which become "filesToProcess".
         *
         * Nothing is applied when the stylesheet directory is missing or empty.
         * NOTE(review): the previous comment states that the results are written back to
         * $bindir/tokenized; this depends on ApplyXsl2.processWithMultipleXSL - to be confirmed.
         *
         * @return true when there is nothing to apply, else the result of
         * ApplyXsl2.processWithMultipleXSL
         */
        public boolean doPostTokenizeXSLStep() {

                File tokenizedDir = new File(module.getBinaryDirectory(),"tokenized")
                filesToProcess = tokenizedDir.listFiles()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = xslDirectory.listFiles()

                boolean hasStylesheets = xslDirectory.exists() && stylesheets != null && stylesheets.size() > 0
                if (hasStylesheets) {
                        println "-- Posttokenize XSL Step with $xslDirectory"
                        return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams)
                }
                return true
        }
418

    
419
        /**
         * Injects the loaded metadata, if any, into each file of "filesToProcess".
         *
         * Each file is rewritten into a temporary file of the same directory by
         * Metadatas.injectMetadatasInXml, and the temporary file then replaces the file.
         * When the injection fails for a file, the temporary file is deleted and the file is
         * left as it is.
         *
         * @return false if a file no longer exists after the replacement attempt, true otherwise
         * (including when there is no metadata)
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) return true // nothing to inject

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File infile : filesToProcess) {
                        cpb.tick()
                        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())

                        if (!metadata.injectMetadatasInXml(infile, outfile, "text", null)) {
                                outfile.delete() // fail
                                continue
                        }

                        boolean replaced = infile.delete() && outfile.renameTo(infile)
                        if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
                        if (!infile.exists()) {
                                println "Error: could not replace $infile by $outfile"
                                return false
                        }
                }
                cpb.done()
                return true
        }
440

    
441
        /**
         * Builds one XML-TXM file in the output directory for each file of $bindir/tokenized,
         * using Xml2Ana.
         *
         * A file that cannot be converted is reported and its output is deleted; the step goes
         * on. Afterwards "filesToProcess" is the content of the output directory.
         *
         * @return always true
         */
        public boolean doToXMLTXMStep() {
                filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")

                for (File tokenizedFile : filesToProcess) {
                        cpb.tick()
                        File txmFile = new File(outputDirectory, tokenizedFile.getName())

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        // the ids of all the respStmt must be listed
                        def respId = []
                        // maps each respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>()
                        // maps each respId to the type attributes of the ana property of the txm w,
                        // used to build the refs to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>()
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>()
                        // respStmt information:
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>()

                        // run the conversion
                        def builder = new Xml2Ana(tokenizedFile)
                        builder.setWordTag(module.getProject().getTokenizerWordElement())
                        builder.setConvertAllAtrtibutes(true)
                        builder.setCorrespondances(correspRef, correspType)
                        builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
                        if (!builder.process(txmFile)) {
                                println("Failed to process "+tokenizedFile)
                                txmFile.delete()
                        }
                }

                filesToProcess = outputDirectory.listFiles()
                cpb.done()
                return true
        }
481

    
482
        /**
         * Does nothing: the success flag is computed at the end of process().
         */
        @Override
        public void checkFiles() {
                // intentionally empty; a former check of outputDirectory.listFiles() is disabled
        }
486

    
487
        /**
         * Requests the cancellation of the import by raising the "cancelNow" flag.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
491
}