Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZImporter.groovy @ 1137

History | View | Annotate | Download (14.6 kB)

1
package org.txm.scripts.importer.xtz
2

    
3
import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml
4

    
5
import java.io.File;
6

    
7
import net.sf.saxon.style.XSLParam;
8

    
9
import org.txm.core.preferences.TBXPreferences;
10
import org.txm.importer.ApplyXsl2
11
import org.txm.scripts.importer.CleanFile
12
import org.txm.scripts.importer.MileStoneProjection
13
import org.txm.importer.ValidateXml
14
import org.txm.*
15

    
16
import javax.xml.parsers.SAXParser
17
import javax.xml.parsers.SAXParserFactory
18
import javax.xml.stream.*
19

    
20
import org.w3c.dom.Element
21
import org.xml.sax.XMLReader
22
import org.txm.utils.xml.DomUtils
23
import org.txm.metadatas.Metadatas
24
import org.txm.objects.*
25
import org.txm.importer.scripts.xmltxm.Xml2Ana
26
import org.txm.stat.utils.ConsoleProgressBar
27
import org.txm.utils.io.*
28
import org.txm.importer.xtz.*
29

    
30
class XTZImporter extends Importer {
31

    
32
        boolean cancelNow = false;
33
        def filesToProcess = []
34

    
35
        File frontXSLResultDirectory;
36
        File tokenizedDirectory;
37
        File sentencedDirectory;
38

    
39
        String xslPath;
40
        def xslParams;
41
        String lang;
42
        String wordTag;
43
        boolean skipDoTokenizeStep = false;
44

    
45
        Metadatas metadata = null; // text metadata
46

    
47
        /**
         * Creates the importer bound to the given import module.
         *
         * @param module the import module, handed over unchanged to the super constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
50

    
51
        /**
         * @return the text metadata loaded by process(), or null when no metadata was loaded
         */
        public Metadatas getMetadata() {
                return this.metadata
        }
54

    
55
        /**
         * Runs the whole XTZ import pipeline.
         *
         * The method checks that the input directory contains at least one candidate
         * source file, reads the import parameters from the project, loads the metadata
         * file if there is one, then chains the steps: fix surrogates, split-merge XSL,
         * front XSL, XML check, tokenization, post-tokenization XSL, milestone encoding,
         * XML-TXM conversion and metadata injection. The pipeline stops at the first step
         * returning false.
         *
         * On completion isSuccessFul is true when at least one file remains in
         * filesToProcess. When the pipeline aborts before running the steps, the cause is
         * stored in reason.
         */
        public void process() {

                Project project = module.getProject()
                File binDir = module.getBinaryDirectory()

                def files = inputDirectory.listFiles()
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return // the input directory could not be listed
                }

                // helper files (parameters, metadata, DTD, stylesheets, backups, hidden files) are not sources
                def ignoredSuffixes = [".properties", ".csv", ".dtd", ".xsl", "~"]
                def srcFiles = files.findAll { File f ->
                        String name = f.getName()
                        !f.isHidden() && !f.isDirectory() &&
                                        !name.equals("import.xml") && !name.startsWith(".") &&
                                        !ignoredSuffixes.any { String suffix -> name.endsWith(suffix) }
                }

                if (srcFiles.size() == 0) {
                        reason = "No suitable file to process in "+inputDirectory
                        return // only helper files in the directory
                }

                // prepare front XSL if any
                xslPath = project.getFrontXSL()
                xslParams = project.getXsltParameters()

                lang = project.getLang()

                wordTag = project.getTokenizerWordElement()
                this.skipDoTokenizeStep = !project.getDoTokenizerStep()

                // prepare metadata if any: work on a copy stored in the binary directory
                File allMetadataFile = Metadatas.findMetadataFile(inputDirectory)
                if (allMetadataFile != null && allMetadataFile.exists()) {
                        File copy = new File(binDir, allMetadataFile.getName())
                        if (!FileCopy.copy(allMetadataFile, copy)) {
                                println "Error: could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile();
                                // record the cause so that the abort is not silent for the caller
                                reason = "Could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile()
                                return
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
                                        Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
                                        Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
                }

                String cleanDirectories = project.getCleanAfterBuild()

                if (!doFixSurrogates()) return
                if (!doSplitMergeXSLStep()) return
                if (!doFrontXSLStep()) return
                if (!doCheckXMLFileStep()) return
                if (!doTokenizeStep()) return
                if (!doPostTokenizeXSLStep()) return
                if (!doEncodeMileStonesStep()) return
                if (!doToXMLTXMStep()) return
                if (!doInjectMetadataStep()) return

                // File.listFiles() may have left null in filesToProcess: treat it as "no file"
                isSuccessFul = filesToProcess != null && filesToProcess.size() > 0

                // the intermediate directories are only removed when the whole pipeline ran
                if ("true".equals(cleanDirectories)) {
                        new File(binDir, "tokenized").deleteDir()
                        new File(binDir, "src").deleteDir()
                }
        }
137

    
138
        /**
         * Copies the content of the input directory into $bindir/src.
         *
         * Files whose name ends with ".xml" (case-insensitive) go through
         * CleanFile.removeSurrogateFromXmlFile; every other entry is copied with
         * FileCopy.copyFiles. Afterwards inputDirectory points to $bindir/src.
         *
         * @return false (with reason set) when the input directory cannot be listed, true otherwise
         */
        public boolean doFixSurrogates() {
                File fixedDirectory = new File(module.getBinaryDirectory(), "src")
                fixedDirectory.mkdir()

                def sources = inputDirectory.listFiles()
                if (sources == null) {
                        reason = "No file to process in $inputDirectory"
                        return false
                }

                sources.each { File source ->
                        File target = new File(fixedDirectory, source.getName())
                        boolean isXml = source.getName().toLowerCase().endsWith(".xml")
                        if (isXml) {
                                CleanFile.removeSurrogateFromXmlFile(source, target)
                        } else {
                                FileCopy.copyFiles(source, target)
                        }
                }

                // the next steps read their sources from the "src" directory
                inputDirectory = fixedDirectory
                return true
        }
166

    
167
        /**
         * Applies the split-merge stylesheet, if any, to the files of the input directory.
         *
         * The stylesheet is looked up in the "xsl/1-split-merge" directory of the module
         * source directory. Only the first ".xsl" file (in sorted order) is applied. The
         * stylesheet receives the URI of $bindir/src in the "output-directory" parameter,
         * and inputDirectory points to $bindir/src once the step succeeded.
         *
         * @return false (with reason set) when the stylesheet could not be applied,
         *         true otherwise, including when there is no stylesheet to apply
         */
        public boolean doSplitMergeXSLStep() {
                File srcDirectory = new File(module.getBinaryDirectory(), "src")
                srcDirectory.mkdir()
                xslParams["output-directory"] = srcDirectory.getAbsoluteFile().toURI().toString()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"
                def xslFiles = xslDirectory.listFiles()
                if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams, false)) {
                                // report the stylesheet that actually failed and stop the import,
                                // as doFrontXSLStep does
                                reason = "Fail to apply split-merge XSL: $xslFile"
                                return false
                        }
                        println ""
                        break // THERE IS ONLY ONE XSL FILE TO APPLY
                }

                inputDirectory = srcDirectory // the files to process are now in the "src" directory
                return true
        }
201

    
202
        /**
         * Encodes the milestone elements declared in the "MileStones" textual plan of the
         * project into the files of $bindir/tokenized.
         *
         * Each file is rewritten once per milestone name: MileStoneProjection writes into
         * a temporary file created next to the original, which then replaces the original.
         * filesToProcess is reset to the content of $bindir/tokenized.
         *
         * @return false as soon as one projection or one file replacement fails, true
         *         otherwise (including when no milestone is declared)
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getProject().getTextualPlan("MileStones")
                if (milestonesString == null || milestonesString.length() == 0) return true

                def milestones = milestonesString.split(",")
                if (milestones.size() == 0) return true

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (def milestone : milestones) {
                        milestone = milestone.trim() // just in case
                        cpb.tick()
                        if (milestone.length() == 0) continue // blank list item: no element to project

                        for (File inputFile : filesToProcess) {
                                File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile())
                                println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        println "Fail to encode $milestone in $inputFile"
                                        outputFile.delete() // do not leave the temporary file among the tokenized files
                                        return false
                                }
                                if (!inputFile.delete()) {
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        outputFile.delete() // the original is still there: drop the temporary file
                                        return false
                                }
                                if (!outputFile.renameTo(inputFile)) {
                                        // the original is already deleted: keep the temporary file, it holds the only copy
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        return false
                                }
                        }
                }
                println ""
                return true
        }
238

    
239
        /**
         * Applies the front stylesheets, if any, to the source files.
         *
         * The stylesheets are the ".xsl" files of the "xsl/2-front" directory of the module
         * source directory, applied in sorted order. The first one reads from inputDirectory
         * and writes to outputDirectory; after each successful application inputDirectory
         * is set to outputDirectory, so the following stylesheets read from there.
         *
         * @return false (with reason set) as soon as one stylesheet fails, true otherwise,
         *         including when there is no stylesheet to apply
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with $frontXSLdirectory"
                def xslFiles = frontXSLdirectory.listFiles()
                if (!frontXSLdirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
                                // name the stylesheet that failed; xslPath is not the file applied here
                                reason = "Fail to apply front XSL: $xslFile"
                                return false
                        }
                        inputDirectory = outputDirectory // the files to process are now in the output directory
                        println ""
                }
                return true
        }
270

    
271
        /**
         * Selects the XML files of the input directory that can be processed.
         *
         * A file is kept in filesToProcess when its name ends with ".xml"
         * (case-insensitive), does not start with "metadata", is not "import.xml", it is a
         * readable, non hidden regular file and ValidateXml.test accepts it. Rejected files
         * are skipped without stopping the import.
         *
         * @return false (with reason set) when the directory is empty or when no file was
         *         selected, true otherwise
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Check XML files for well-formedness."
                def files = inputDirectory.listFiles()

                if (files == null || files.size() == 0) {
                        println "No XML file (*.xml) to process. Aborting"
                        // same as the other abort path: tell the caller why the step failed
                        reason = "No XML file (*.xml) to process in "+inputDirectory
                        return false
                }
                files = files.sort()

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File f : files) {
                        cpb.tick()
                        String name = f.getName()

                        boolean candidate = name.toLowerCase().endsWith(".xml") &&
                                        !f.isHidden() && f.canRead() &&
                                        !name.startsWith("metadata") && !f.isDirectory()
                        if (!candidate) continue

                        // the import parameters file is never a source: no need to validate it
                        if (name.equals("import.xml")) continue

                        if (ValidateXml.test(f)) {
                                filesToProcess << f
                        }
                }
                println ""

                if (filesToProcess.size() == 0) {
                        reason = "No file suitable to process in "+inputDirectory
                        return false
                }
                return true
        }
306

    
307

    
308
        /**
         * Tokenizes the files of filesToProcess into $bindir/tokenized.
         *
         * When skipDoTokenizeStep is set the files are only copied to $bindir/tokenized.
         * Otherwise each file is processed by SimpleTokenizerXml, configured with the word
         * element name and with the element lists read from the "OutSideTextTags",
         * "OutSideTextTagsAndKeepContent" and "Note" textual plans of the project. A file
         * that fails is reported and its output is deleted; the other files are still
         * processed. After tokenization filesToProcess is reset to the content of
         * $bindir/tokenized.
         *
         * @return always true
         */
        public boolean doTokenizeStep() {

                println "-- Tokenizing "+filesToProcess.size()+" files"
                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                tokenizedDir.mkdir()

                if (skipDoTokenizeStep) {
                        println "No tokenization do to."
                        for (File f : filesToProcess) {
                                File outfile = new File(tokenizedDir, f.getName())
                                if (!FileCopy.copy(f, outfile)) {
                                        println("Failed to process "+f)
                                }
                        }
                        return true
                }

                // get the element names to ignore, as "a|b|c" alternations
                def project = module.getProject()
                String outSideTextTagsRegex = buildElementNamesRegex(project.getTextualPlan("OutSideTextTags"))
                String outSideTextTagsAndKeepContentRegex = buildElementNamesRegex(project.getTextualPlan("OutSideTextTagsAndKeepContent"))
                String noteRegex = buildElementNamesRegex(project.getTextualPlan("Note"))

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File f : filesToProcess) {
                        cpb.tick()
                        File outfile = new File(tokenizedDir, f.getName())
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)
                        if (wordTag.length() > 0) {
                                tokenizer.setWord_tags(wordTag)
                        }
                        tokenizer.setWord_element_to_create(wordTag)

                        if (outSideTextTagsRegex.length() > 0) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }
                        if (outSideTextTagsAndKeepContentRegex.length() > 0) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }
                        if (noteRegex.length() > 0) {
                                tokenizer.setNote(noteRegex)
                        }

                        // tokenize !
                        if (!tokenizer.process()) {
                                println("Failed to process "+f)
                                outfile.delete()
                        }
                }

                filesToProcess = tokenizedDir.listFiles()
                println ""

                return true
        }

        /**
         * Turns a comma separated list of element names into a "a|b|c" alternation.
         * Names are trimmed and blank items are dropped, so that the alternation never
         * contains an empty branch.
         *
         * @param names the comma separated names, may be null or empty
         * @return the alternation, or an empty string when no name is declared
         */
        private static String buildElementNamesRegex(String names) {
                if (names == null) return ""
                return names.split(",").collect { String s -> s.trim() }.findAll { String s -> s.length() > 0 }.join("|")
        }
390

    
391
        /**
         * Applies the stylesheets of the "xsl/3-posttok" directory of the module source
         * directory to the files of $bindir/tokenized.
         *
         * filesToProcess is reset to the content of $bindir/tokenized first.
         *
         * @return the result of ApplyXsl2.processWithMultipleXSL, or true when there is
         *         no stylesheet to apply
         */
        public boolean doPostTokenizeXSLStep() {

                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()

                File postTokXSLDir = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = postTokXSLDir.listFiles()
                boolean hasStylesheets = postTokXSLDir.exists() && stylesheets != null && stylesheets.size() > 0

                if (hasStylesheets) {
                        println "-- Posttokenize XSL Step with $postTokXSLDir"
                        return ApplyXsl2.processWithMultipleXSL(filesToProcess, postTokXSLDir, xslParams)
                }
                return true
        }
405

    
406
        /**
         * Injects the text metadata, when some was loaded, into each file of filesToProcess.
         *
         * The result is written into a temporary file created next to the original and
         * then replaces the original. A file whose injection fails, or whose original
         * cannot be deleted, is left unchanged and its temporary file is removed.
         *
         * @return false (with reason set) when an original was deleted but could not be
         *         replaced by the injected version, true otherwise
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) return true // no metadata: nothing to inject

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File infile : filesToProcess) {
                        cpb.tick()
                        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())

                        if (!metadata.injectMetadatasInXml(infile, outfile, "text", null)) {
                                outfile.delete() // fail
                                continue
                        }

                        if (!infile.delete()) {
                                println "Warning can't rename file "+outfile+" to "+infile
                                // the original is kept: remove the temporary file so that it does
                                // not stay next to the texts as an extra file
                                outfile.delete()
                                continue
                        }

                        if (!outfile.renameTo(infile)) {
                                println "Warning can't rename file "+outfile+" to "+infile
                                println "Error: could not replace $infile by $outfile"
                                reason = "Could not replace $infile by $outfile"
                                return false
                        }
                }
                println ""
                return true
        }
427

    
428
        /**
         * Converts each file of $bindir/tokenized with Xml2Ana and writes the result,
         * under the same file name, in outputDirectory.
         *
         * A file that fails is reported and its output file is deleted; the other files
         * are still processed. At the end filesToProcess holds the content of
         * outputDirectory (an empty array when the directory cannot be listed).
         *
         * @return always true
         */
        public boolean doToXMLTXMStep() {
                // File.listFiles() returns null when the directory cannot be listed
                File[] tokenizedFiles = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                filesToProcess = (tokenizedFiles != null) ? tokenizedFiles : new File[0]

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")
                for (File f : filesToProcess) {
                        cpb.tick()
                        String txmfile = f.getName()

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        // the ids of all the respStmt must be listed
                        def respId = []
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>()
                        // maps a respId to the type attributes of the ana property of the TXM w,
                        // used to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>()
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>()
                        // respStmt information
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>()

                        // run the conversion
                        def builder = new Xml2Ana(f)
                        builder.setWordTag(module.getProject().getTokenizerWordElement())
                        builder.setConvertAllAtrtibutes true
                        builder.setCorrespondances(correspRef, correspType)
                        builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
                        //builder.setAddTEIHeader();

                        File txmFile = new File(outputDirectory, txmfile)
                        if (!builder.process(txmFile)) {
                                println("Failed to process "+f)
                                txmFile.delete()
                        }
                }

                File[] builtFiles = outputDirectory.listFiles()
                filesToProcess = (builtFiles != null) ? builtFiles : new File[0]
                println ""
                return true
        }
468

    
469
        /**
         * Currently performs no check: the former test on the content of the output
         * directory is kept below, commented out.
         */
        @Override
        public void checkFiles() {
                // isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
        }
473

    
474
        /**
         * Raises the cancelNow flag.
         * NOTE(review): none of the steps of this class reads the flag, so a running
         * import is not interrupted by this call alone.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
478
}