Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZImporter.groovy @ 2080

History | View | Annotate | Download (14.9 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml
4

    
5
import java.io.File;
6

    
7
import net.sf.saxon.style.XSLParam;
8

    
9
import org.txm.core.preferences.TBXPreferences;
10
import org.txm.importer.ApplyXsl2
11
import org.txm.scripts.importer.CleanFile
12
import org.txm.scripts.importer.MileStoneProjection
13
import org.txm.importer.ValidateXml
14
import org.txm.*
15

    
16
import javax.xml.parsers.SAXParser
17
import javax.xml.parsers.SAXParserFactory
18
import javax.xml.stream.*
19

    
20
import org.w3c.dom.Element
21
import org.xml.sax.XMLReader
22
import org.txm.utils.xml.DomUtils
23
import org.txm.metadatas.Metadatas
24
import org.txm.objects.*
25
import org.txm.importer.scripts.xmltxm.Xml2Ana
26
import org.txm.utils.ConsoleProgressBar
27
import org.txm.utils.io.*
28
import org.txm.importer.xtz.*
29

    
30
class XTZImporter extends Importer {
31

    
32
        // set to true by cancel(); not read by the steps visible in this class
        public boolean cancelNow = false;
33
        // files handled by the current step; each step reassigns it to its own result files
        public def filesToProcess = []
34

    
35
        // NOTE(review): the three directories below are never assigned nor read in this class -- confirm they are still needed
        public File frontXSLResultDirectory;
36
        public File tokenizedDirectory;
37
        public File sentencedDirectory;
38

    
39
        // value of project.getFrontXSL(), read in process()
        public String xslPath;
40
        // value of project.getXsltParameters(); the XSL steps add the "output-directory" entry to it
        public def xslParams;
41
        // value of project.getLang(), given to the tokenizer
        public String lang;
42
        // value of project.getTokenizerWordElement(), given to the milestone projection
        public String wordTag;
43
        // value of project.getDoTokenizerStep(); when false the files are only copied to the "tokenized" directory
        public boolean doTokenizeStep = false;
44

    
45
        public Metadatas metadata = null; // text metadata, stays null when no metadata file is found by process()
46

    
47
        /**
         * Creates the importer of the given import module.
         *
         * @param module the import module, handed over to the Importer constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
50

    
51
        /**
         * @return the text metadata, or null if none has been loaded
         */
        public Metadatas getMetadata() {
                metadata
        }
54

    
55
        /**
         * Runs the whole import chain on the files of the input directory.
         *
         * The steps are called in this order: surrogate fix, split-merge XSL,
         * front XSL, XML check, tokenization, post-tokenization XSL, milestone
         * encoding, XML-TXM building and metadata injection. The chain stops
         * at the first step that returns false.
         *
         * When every step passed, isSuccessFul tells if at least one file is
         * left in filesToProcess, and the "tokenized", "src" and "split"
         * directories of the binary directory are deleted if the project
         * getCleanAfterBuild() value is "true".
         *
         * When the method gives up before the steps, the cause is stored in "reason".
         */
        public void process() {

                Project project = module.getProject();
                File binDir = module.getBinaryDirectory();

                def files = inputDirectory.listFiles();
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return; // no file to process in the directory
                }

                // at least one file must not be a hidden, backup or import companion file
                boolean hasSourceFile = false
                for (File f : files) {
                        if (f.isHidden() || f.isDirectory()) continue;

                        String name = f.getName()
                        if (name.equals("import.xml") || name.startsWith(".") || name.endsWith("~")) continue;
                        if (name.endsWith(".properties") || name.endsWith(".csv")
                                        || name.endsWith(".dtd") || name.endsWith(".xsl")) continue;

                        hasSourceFile = true
                        break
                }

                if (!hasSourceFile) {
                        reason = "No suitable file to process in "+inputDirectory
                        return; // no file to process in the directory
                }

                // prepare front XSL if any
                xslPath = project.getFrontXSL()
                xslParams = project.getXsltParameters()

                lang = project.getLang();

                wordTag = project.getTokenizerWordElement()
                this.doTokenizeStep = project.getDoTokenizerStep()

                // prepare metadata if any: the metadata file is copied in the binary directory and read from there
                File allMetadataFile = Metadatas.findMetadataFile(inputDirectory);
                if (allMetadataFile != null && allMetadataFile.exists()) {
                        File copy = new File(binDir, allMetadataFile.getName())
                        if (!FileCopy.copy(allMetadataFile, copy)) {
                                // set the reason as well so the failure is not only visible in the console
                                reason = "Could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile()
                                println "Error: could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile();
                                return;
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
                                        Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
                                        Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
                }

                String cleanDirectories = project.getCleanAfterBuild();

                if (!doFixSurrogates()) return;
                if (!doSplitMergeXSLStep()) return;
                if (!doFrontXSLStep()) return;
                if (!doCheckXMLFileStep()) return;
                if (!doTokenizeStep()) return;
                if (!doPostTokenizeXSLStep()) return;
                if (!doEncodeMileStonesStep()) return;
                if (!doToXMLTXMStep()) return;
                if (!doInjectMetadataStep()) return;

                // listFiles() gives null when the directory cannot be listed
                isSuccessFul = filesToProcess != null && filesToProcess.size() > 0

                if ("true".equals(cleanDirectories)) {
                        new File(module.getBinaryDirectory(), "tokenized").deleteDir()
                        new File(module.getBinaryDirectory(), "src").deleteDir()
                        new File(module.getBinaryDirectory(), "split").deleteDir()
                }
        }
138

    
139
        /**
         * Copies the content of the input directory in $bindir/src.
         *
         * Files with the ".xml" extension (case insensitive) go through
         * CleanFile.removeSurrogateFromXmlFile, the others are copied with
         * FileCopy.copyFiles. $bindir/src is emptied first and becomes the
         * input directory of the next steps.
         *
         * @return false if the input directory cannot be listed, true otherwise
         */
        public boolean doFixSurrogates() {
                File cleanedDirectory = new File(module.getBinaryDirectory(), "src")

                def sources = inputDirectory.listFiles()
                if (sources == null) {
                        reason = "No file to process in $inputDirectory"
                        return false
                }

                // start from an empty directory
                cleanedDirectory.deleteDir()
                cleanedDirectory.mkdir()

                for (File source in sources) {
                        File target = new File(cleanedDirectory, source.getName())
                        boolean isXml = source.getName().toLowerCase().endsWith(".xml")
                        if (!isXml) {
                                FileCopy.copyFiles(source, target)
                        } else {
                                CleanFile.removeSurrogateFromXmlFile(source, target)
                        }
                }

                // the next steps read their files from the "src" directory
                inputDirectory = cleanedDirectory
                return true
        }
169

    
170
        /**
         * Applies the split-merge XSL stylesheet, if any, to the files of the input directory.
         *
         * The stylesheets are searched in the "xsl/1-split-merge" directory of
         * the source directory. Only one stylesheet is applied: the first
         * ".xsl" file, in sorted order, that is neither hidden nor a directory.
         * The $bindir/split directory is given to the stylesheet with the
         * "output-directory" parameter and becomes the input directory of the
         * next steps once the stylesheet has been applied.
         *
         * When there is no stylesheet to apply the input directory is left unchanged.
         *
         * @return false if the stylesheet could not be applied (the cause is
         *         stored in "reason"), true otherwise
         */
        public boolean doSplitMergeXSLStep() {

                File splitedDirectory = new File(module.getBinaryDirectory(), "split")
                splitedDirectory.deleteDir()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"
                def xslFiles = xslDirectory.listFiles()
                if (xslFiles == null || xslFiles.size() == 0) {
                        return true; // no XSL directory or empty directory: nothing to do
                }

                // THERE IS ONLY ONE XSL FILE TO APPLY: the first suitable one
                xslFiles = xslFiles.sort()
                File selectedXsl = null
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
                        selectedXsl = xslFile
                        break;
                }
                if (selectedXsl == null) {
                        // the directory contains no stylesheet: do not redirect the
                        // next steps to a "split" directory that would be empty
                        return true;
                }

                splitedDirectory.mkdir()
                xslParams["output-directory"] = splitedDirectory.getAbsoluteFile().toURI().toString();

                if (!ApplyXsl2.processImportSources(selectedXsl, inputDirectory.listFiles(), xslParams)) {
                        reason = "Fail to apply split-merge XSL: $selectedXsl"
                        return false;
                }
                println ""

                inputDirectory = splitedDirectory; // the files to process are now in the "split" directory
                return true;
        }
207

    
208
        /**
         * Runs MileStoneProjection on the files of $bindir/tokenized, once per milestone element.
         *
         * The milestone element names are read from the "MileStones" textual
         * plan of the project, separated with commas; blank names are ignored.
         * Each result is written in a temporary file of the same directory
         * which then replaces the processed file.
         *
         * @return false as soon as a file cannot be processed or replaced (the
         *         cause is stored in "reason"), true otherwise, including when
         *         no milestone is configured
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getProject().getTextualPlan("MileStones")
                if (milestonesString == null || milestonesString.length() == 0) return true;

                // keep the non blank element names only
                def milestones = []
                for (String name : milestonesString.split(",")) {
                        name = name.trim()
                        if (name.length() > 0) milestones << name
                }
                if (milestones.size() == 0) return true;

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (String milestone : milestones) {
                        cpb.tick()

                        for (File inputFile : filesToProcess) {
                                File outputFile = File.createTempFile("tmp", inputFile.getName(),inputFile.getParentFile());
                                println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        // a temporary file left here would be taken for a text by the next steps
                                        outputFile.delete()
                                        reason = "Fail to encode $milestone in $inputFile"
                                        println "Fail to encode $milestone in $inputFile"
                                        return false
                                }

                                if (!inputFile.delete()) {
                                        outputFile.delete()
                                        reason = "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        return false
                                }

                                // no extra copy of the result is written in the directory:
                                // every file of "tokenized" is processed as a text afterwards
                                if (!outputFile.renameTo(inputFile)) {
                                        reason = "Fail to encode $milestone in ${inputFile}: could not rename ${outputFile}."
                                        println "Fail to encode $milestone in ${inputFile}: could not rename ${outputFile}."
                                        return false
                                }
                        }
                }
                println ""
                return true;
        }
245

    
246
        /**
         * Applies the front XSL stylesheets, if any, to the files of the input directory.
         *
         * The stylesheets are the ".xsl" files of the "xsl/2-front" directory
         * of the source directory; hidden files and directories are skipped.
         * They are applied in sorted order with ApplyXsl2.processImportSources,
         * from the input directory to the output directory. After the first
         * applied stylesheet the output directory becomes the input directory.
         *
         * The "output-directory" XSL parameter is set to the output directory
         * even if no stylesheet is applied.
         *
         * @return false as soon as a stylesheet cannot be applied (the cause
         *         is stored in "reason"), true otherwise
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with the $frontXSLdirectory directory."
                def xslFiles = frontXSLdirectory.listFiles()

                xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString();

                if (xslFiles == null || xslFiles.size() == 0) {
                        return true; // no XSL directory or empty directory: nothing to do
                }

                xslFiles = xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
                                // report the stylesheet that failed
                                reason = "Fail to apply front XSL: $xslFile"
                                return false;
                        }
                        inputDirectory = outputDirectory; // the files to process are now in the output directory
                        println ""
                }
                return true;
        }
280

    
281
        /**
         * Selects the files of the input directory to process in the next steps.
         *
         * A file is stored in filesToProcess when its name ends with ".xml"
         * (case insensitive), does not start with "metadata" and is not
         * "import.xml", when it is a readable file that is not hidden and when
         * ValidateXml.test accepts it. The files are visited in sorted order.
         *
         * @return false if no file has been selected (the cause is stored in
         *         "reason"), true otherwise
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Checking XML files for well-formedness."
                def files = inputDirectory.listFiles()

                if (files == null || files.size() == 0) {
                        reason = "No XML file (*.xml) to process in "+inputDirectory
                        println "No XML file (*.xml) to process. Aborting"
                        return false;
                }
                files = files.sort()

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File f : files) {
                        cpb.tick()

                        String name = f.getName()
                        if (!name.toLowerCase().endsWith(".xml")) continue;
                        if (f.isHidden() || !f.canRead() || f.isDirectory()) continue;
                        if (name.startsWith("metadata")) continue;

                        // files rejected by the test are left out
                        if (!ValidateXml.test(f)) continue;

                        if (!name.equals("import.xml")) {
                                filesToProcess << f
                        }
                }
                println ""

                if (filesToProcess.size() == 0) {
                        reason = "No file suitable to process in "+inputDirectory
                        return false;
                }
                return true;
        }
316

    
317

    
318
        /**
         * Writes the files to process in $bindir/tokenized, tokenized or not.
         *
         * When the doTokenizeStep field is false the files are copied as they
         * are. Otherwise each file goes through SimpleTokenizerXml with the
         * project language; the "OutSideTextTags",
         * "OutSideTextTagsAndKeepContent" and "Note" textual plans of the
         * project (element names separated with commas) are given to the
         * tokenizer as regular expressions when they are not empty. The result
         * of a file that fails is deleted, and filesToProcess is then set to
         * the content of $bindir/tokenized.
         *
         * @return always true
         */
        public boolean doTokenizeStep() {

                File tokenizedDirectory = new File(module.getBinaryDirectory(), "tokenized")
                tokenizedDirectory.mkdir()

                // get the element names to ignore
                def project = module.getProject()
                String outSideTextTagsRegex = namesToAlternation(project.getTextualPlan("OutSideTextTags"))
                String outSideTextTagsAndKeepContentRegex = namesToAlternation(project.getTextualPlan("OutSideTextTagsAndKeepContent"))
                String noteRegex = namesToAlternation(project.getTextualPlan("Note"))

                if (!doTokenizeStep) {
                        println "No tokenization do to."
                        for (File f : filesToProcess) {
                                File outfile = new File(tokenizedDirectory, f.getName());
                                FileCopy.copy(f, outfile);
                        }
                        return true;
                }

                println "-- Tokenizing "+filesToProcess.size()+" files"
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File f : filesToProcess) {
                        cpb.tick()
                        File outfile = new File(tokenizedDirectory, f.getName());
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)

                        if (outSideTextTagsRegex.length() > 0) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }

                        if (outSideTextTagsAndKeepContentRegex.length() > 0) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }

                        if (noteRegex.length() > 0) {
                                tokenizer.setNote(noteRegex)
                        }

                        // tokenize !
                        if (!tokenizer.process()) {
                                println("Failed to process "+f)
                                outfile.delete()
                        }
                }

                filesToProcess = tokenizedDirectory.listFiles()
                println ""

                return true;
        }

        /**
         * Builds a regular expression alternation from names separated with commas.
         *
         * The names are trimmed and the blank ones are dropped, so that the
         * result never contains an empty alternative: "a, b,,c" gives "a|b|c".
         *
         * @param names the names separated with commas, may be null
         * @return the names joined with "|", "" if there is no name
         */
        private static String namesToAlternation(String names) {
                if (names == null) return "";

                StringBuilder regex = new StringBuilder()
                for (String name : names.split(",")) {
                        name = name.trim()
                        if (name.length() == 0) continue;
                        if (regex.length() > 0) regex.append("|")
                        regex.append(name)
                }
                return regex.toString()
        }
397

    
398
        /**
         * Applies the post-tokenization XSL stylesheets, if any, to the files of $bindir/tokenized.
         *
         * filesToProcess is set to the content of $bindir/tokenized. If the
         * "xsl/3-posttok" directory of the source directory contains files,
         * they are handed with filesToProcess and the XSL parameters to
         * ApplyXsl2.processWithMultipleXSL.
         *
         * @return true if there is nothing to apply, the result of
         *         ApplyXsl2.processWithMultipleXSL otherwise
         */
        public boolean doPostTokenizeXSLStep() {

                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()

                File posttokDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = posttokDirectory.listFiles()
                boolean hasStylesheets = posttokDirectory.exists() && stylesheets != null && stylesheets.size() > 0

                if (hasStylesheets) {
                        println "-- Posttokenize XSL Step with $posttokDirectory"
                        return ApplyXsl2.processWithMultipleXSL(filesToProcess, posttokDirectory, xslParams)
                }
                return true
        }
412

    
413
        /**
         * Injects the metadata, if any, in the files to process.
         *
         * Each file is given to metadata.injectMetadatasInXml with a temporary
         * file of the same directory as result. On success the temporary file
         * replaces the file, on failure it is deleted and the file is left as
         * it is.
         *
         * @return false if a file does not exist any more after its
         *         replacement, true otherwise, including when there is no metadata
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) {
                        return true
                }

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar progress = new ConsoleProgressBar(filesToProcess.size())
                for (File text in filesToProcess) {
                        progress.tick()
                        File injected = File.createTempFile("temp", ".xml", text.getParentFile())

                        if (!metadata.injectMetadatasInXml(text, injected, "text", null)) {
                                injected.delete() // fail
                                continue
                        }

                        boolean replaced = text.delete() && injected.renameTo(text)
                        if (!replaced) {
                                println "Warning can't rename file "+injected+" to "+text
                        }
                        if (!text.exists()) {
                                println "Error: could not replace $text by $injected"
                                return false
                        }
                }
                progress.done()
                return true
        }
434

    
435
        /**
         * Builds with Xml2Ana one file in the output directory for each file of $bindir/tokenized.
         *
         * The result has the name of the processed file. The header
         * information given to the builder is empty. The result of a file that
         * fails is deleted. filesToProcess is finally set to the content of
         * the output directory.
         *
         * @return always true
         */
        public boolean doToXMLTXMStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                ConsoleProgressBar progress = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")

                for (File tokenizedFile in filesToProcess) {
                        progress.tick()
                        File resultFile = new File(outputDirectory, tokenizedFile.getName())

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        // the ids of all the respStmt must be listed
                        def respId = []
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>()
                        // maps a respId to the type attributes of the ana property of the TXM w,
                        // to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>()
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>()
                        // respStmt information
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>()

                        // run the process
                        def builder = new Xml2Ana(tokenizedFile)
                        builder.setWordTag(module.getProject().getTokenizerWordElement())
                        builder.setConvertAllAtrtibutes(true)
                        builder.setCorrespondances(correspRef, correspType)
                        builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
                        //builder.setAddTEIHeader();
                        if (!builder.process(resultFile)) {
                                println("Failed to process "+tokenizedFile)
                                resultFile.delete()
                        }
                }

                filesToProcess = outputDirectory.listFiles()
                progress.done()
                return true
        }
475

    
476
        /**
         * Does nothing: the result files are not checked here.
         */
        @Override
        public void checkFiles() {
                // disabled check:
                // isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
        }
480

    
481
        /**
         * Raises the cancelNow flag.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
485
}