Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xtz / XTZImporter.groovy @ 148

History | View | Annotate | Download (13.8 kB)

1
package org.txm.importer.xtz
2

    
3
import filters.Tokeniser.SimpleTokenizerXml
4

    
5
import java.io.File;
6

    
7
import net.sf.saxon.style.XSLParam;
8

    
9
import org.txm.importer.ApplyXsl2;
10
import org.txm.importer.MileStoneProjection;
11
import org.txm.importer.ValidateXml;
12
import org.txm.*;
13

    
14
import javax.xml.stream.*;
15

    
16
import org.w3c.dom.Element
17
import org.txm.metadatas.Metadatas
18
import org.txm.objects.BaseParameters
19
import org.txm.scripts.teitxm.Xml2Ana
20
import org.txm.stat.utils.ConsoleProgressBar
21
import org.txm.utils.*;
22

    
23
class XTZImporter extends Importer {
24

    
25
        /** Cancellation flag; switched on by {@link #cancel()}. */
        boolean cancelNow = false
        /** Files handed over from one import step to the next one. */
        def filesToProcess = []

        // Working directories (declared here; not assigned by the methods of this class)
        File frontXSLResultDirectory
        File tokenizedDirectory
        File sentencedDirectory

        /** Value of the "xsl" attribute of the corpus XSLT parameter element. */
        String xslPath
        /** XSL parameters given to every stylesheet applied by the import steps. */
        def xslParams
        /** Value of the "lang" attribute of the corpus element; given to the tokenizer. */
        String lang
        /** Name of the element encoding words, read from the import parameters. */
        String wordTag
        /** When true, source files are copied as is instead of being tokenized. */
        boolean skipDoTokenizeStep = false

        /** Text metadata read from "metadata.csv"; stays null when that file is absent. */
        Metadatas metadata = null
39

    
40
        /**
         * Creates the importer of the given import module.
         *
         * @param module the import module, forwarded unchanged to the parent constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
43
        
44
        /**
         * @return the text metadata loaded by {@link #process()}, or null when no
         *         "metadata.csv" file was found in the input directory
         */
        public Metadatas getMetadata() {
                return this.metadata
        }
47

    
48
        /**
         * Runs the whole import: selects the source files, reads the import
         * parameters, loads the optional "metadata.csv" file, then chains the
         * import steps. The chain stops at the first step that returns false or
         * as soon as {@link #cancel()} has been called.
         *
         * On early exit the "reason" field is set when the cause is known.
         * "isSuccessFul" is only updated when every step has been run.
         */
        public void process() {

                BaseParameters params = module.getParameters();
                File binDir = module.getBinaryDirectory();

                String corpusname = params.getCorpusName();
                Element corpusElem = params.corpora.get(corpusname);

                def files = inputDirectory.listFiles();
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return; // no file to process in the directory
                }

                def srcFiles = files.findAll { File f -> isSourceCandidate(f) }
                if (srcFiles.size() == 0) {
                        reason = "No suitable file to process in "+inputDirectory
                        return; // no file to process in the directory
                }

                // prepare front XSL if any
                xslPath = params.getXsltElement(corpusElem).getAttribute("xsl")
                xslParams = params.getXsltParams(corpusElem);

                lang = corpusElem.getAttribute("lang");

                wordTag = params.getWordElement().getTextContent()
                this.skipDoTokenizeStep = params.getSkipTokenization()

                // prepare metadata if any: work on a copy stored in the binary directory
                File allmetadatafile = new File(inputDirectory, "metadata.csv");
                if (allmetadatafile.exists()) {
                        File copy = new File(binDir, "metadata.csv")
                        if (!FileCopy.copy(allmetadatafile, copy)) {
                                // report the failure through "reason" like the other early exits do
                                reason = "Error: could not create a copy of the metadata file "+allmetadatafile.getAbsoluteFile()
                                println reason
                                return;
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getParam(Toolbox.METADATA_ENCODING),
                                        Toolbox.getParam(Toolbox.METADATA_COLSEPARATOR),
                                        Toolbox.getParam(Toolbox.METADATA_TXTSEPARATOR), 1)
                }

                String cleanDirectories = params.getKeyValueParameters().get(ImportKeys.CLEAN);

                // the steps, in execution order
                def steps = [
                        this.&doSplitMergeXSLStep,
                        this.&doFrontXSLStep,
                        this.&doCheckXMLFileStep,
                        this.&doTokenizeStep,
                        this.&doPostTokenizeXSLStep,
                        this.&doEncodeMileStonesStep,
                        this.&doToXMLTXMStep,
                        this.&doInjectMetadataStep
                ]
                for (def step : steps) {
                        // honor cancel(): the flag was previously set but never read
                        if (cancelNow) {
                                reason = "Import canceled"
                                return;
                        }
                        if (!step()) return;
                }

                isSuccessFul = filesToProcess.size() > 0

                // optionally remove the intermediate directories
                if ("true".equals(cleanDirectories)) {
                        new File(binDir, "tokenized").deleteDir()
                        new File(binDir, "src").deleteDir()
                }
        }

        /**
         * Tells if a file of the input directory is a candidate source file:
         * hidden files, directories, "import.xml", backup files ("~" suffix),
         * dot files and ".properties", ".csv", ".dtd", ".xsl" files are not.
         */
        private static boolean isSourceCandidate(File f) {
                if (f.isHidden() || f.isDirectory()) return false;
                String name = f.getName()
                if (name.equals("import.xml") || name.endsWith("~") || name.startsWith(".")) return false;
                return ![".properties", ".csv", ".dtd", ".xsl"].any { String ext -> name.endsWith(ext) }
        }
130

    
131
        /**
         * Reads from the input directory and writes the result in $bindir/src.
         *
         * Only one XSL is applied: the first ".xsl" file (after sorting) of the
         * "xsl/1-split-merge" directory of the source directory. When a
         * stylesheet has been applied successfully, the "src" directory becomes
         * the input directory of the following steps.
         *
         * @return false if the stylesheet could not be applied ("reason" is
         *         set), true otherwise, including when there is nothing to apply
         */
        public boolean doSplitMergeXSLStep() {
                File srcDirectory = new File(module.getBinaryDirectory(), "src")
                srcDirectory.mkdir()
                xslParams["output-directory"] = srcDirectory.getAbsoluteFile().toURI().toString();

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"
                def xslFiles = xslDirectory.listFiles()
                if (xslFiles == null) return true; // no XSL directory: nothing to do

                // THERE IS ONLY ONE XSL FILE TO APPLY: the first suitable one
                File xslFile = xslFiles.sort().find { File f ->
                        !f.isDirectory() && !f.isHidden() && f.getName().endsWith(".xsl")
                }
                // keep the input directory untouched when no stylesheet is applied
                if (xslFile == null) return true;

                if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, srcDirectory, xslParams, true)) {
                        // stop here, as doFrontXSLStep() does, so that the cause is not lost
                        reason = "Fail to apply split-merge XSL: $xslFile"
                        return false;
                }
                println ""

                inputDirectory = srcDirectory; // the files to process are now in the "src" directory
                return true;
        }
165
        
166
        /**
         * Applies a MileStoneProjection for each milestone element name declared
         * in the import parameters (comma separated list) to each file of
         * $bindir/tokenized. Each file is replaced by the result.
         *
         * @return false as soon as one file could not be encoded or replaced
         *         ("reason" is set), true otherwise
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getParameters().getMilestonesElement().getTextContent().trim()
                if (milestonesString.length() == 0) return true;

                // trim the names and drop the empty ones produced by stray commas ("lb,,pb")
                def milestones = milestonesString.split(",").collect { it.trim() }.findAll { it.length() > 0 }
                if (milestones.size() == 0) return true;

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (String milestone : milestones) {
                        cpb.tick()

                        for (File inputFile : filesToProcess) {
                                // the result is written next to the input file, then swapped with it
                                File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile());
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "body", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        outputFile.delete() // do not leave a temporary file among the tokenized files
                                        reason = "Fail to encode $milestone in $inputFile"
                                        println reason
                                        return false
                                }
                                if (!inputFile.delete()) {
                                        outputFile.delete() // the input file is intact, the result is useless
                                        reason = "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        println reason
                                        return false
                                }
                                if (!outputFile.renameTo(inputFile)) {
                                        // the input file is gone: keep the temporary file, it holds the content
                                        reason = "Fail to encode $milestone in ${inputFile}: could not rename $outputFile."
                                        println reason
                                        return false
                                }
                        }
                }
                println ""
                return true;
        }
201

    
202
        /**
         * Reads from $inputDirectory and writes the result in the output
         * directory.
         *
         * Every ".xsl" file of the "xsl/2-front" directory of the source
         * directory is applied, in sorted order. After the first stylesheet the
         * output directory becomes the input directory, so the next stylesheets
         * work on the result of the previous ones.
         *
         * @return false as soon as a stylesheet fails ("reason" is set), true
         *         otherwise, including when there is no stylesheet to apply
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with $frontXSLdirectory"
                def xslFiles = frontXSLdirectory.listFiles()
                if (xslFiles == null || xslFiles.size() == 0) return true; // nothing to do

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
                                // name the stylesheet that failed, not the "xsl" import parameter
                                reason = "Fail to apply front XSL: $xslFile"
                                return false;
                        }
                        inputDirectory = outputDirectory; // the files to process are now in the "txm" directory
                        println ""
                }
                return true;
        }
233

    
234
        /**
         * Rebuilds "filesToProcess" with the files of the input directory that
         * are readable, named "*.xml" (case insensitive), not hidden, not a
         * directory, not starting with "metadata", accepted by ValidateXml.test()
         * and not named "import.xml".
         *
         * @return false when the input directory is empty or when no file was
         *         kept (in the latter case "reason" is set), true otherwise
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Check XML files for well-formedness."
                def candidates = inputDirectory.listFiles()

                if (candidates == null || candidates.size() == 0) {
                        println "No XML file (*.xml) to process. Aborting"
                        return false;
                }
                candidates = candidates.sort()

                ConsoleProgressBar progress = new ConsoleProgressBar(candidates.size())
                for (File candidate : candidates) {
                        progress.tick()

                        String name = candidate.getName()
                        if (!name.toLowerCase().endsWith(".xml")) continue;

                        boolean rejected = candidate.isHidden() || !candidate.canRead() ||
                                        name.endsWith(".properties") || name.startsWith("metadata") ||
                                        candidate.isDirectory()
                        if (rejected) continue;

                        // files refused by the validator are skipped, not fatal
                        if (!ValidateXml.test(candidate)) continue;

                        if (!name.equals("import.xml")) {
                                filesToProcess << candidate
                        }
                }
                println ""

                if (filesToProcess.size() == 0) {
                        reason = "No file suitable to process in "+inputDirectory
                        return false;
                }
                return true;
        }
269

    
270

    
271
        /**
         * Writes one file per file of "filesToProcess" in $bindir/tokenized.
         *
         * When "skipDoTokenizeStep" is set the files are only copied. Otherwise
         * each file is processed by a SimpleTokenizerXml configured with the
         * language, the word tag and, when textual plans are declared in the
         * import parameters, the "outside text", "outside text but keep
         * content" and "note" element names. A file that fails to be tokenized
         * is reported and its output is deleted; the step goes on.
         *
         * @return always true
         */
        public boolean doTokenizeStep() {

                println "-- Tokenizing "+filesToProcess.size()+" files"
                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                tokenizedDir.mkdir()

                String outSideTextTagsRegex = "";
                String outSideTextTagsAndKeepContentRegex = "";
                String noteRegex = "";

                // get the element names to ignore
                def parameters = module.getParameters()
                if (parameters.getTextualPlans() != null) {
                        outSideTextTagsRegex = namesToAlternation(parameters.getOutSideTextTagsElement()?.getTextContent())
                        outSideTextTagsAndKeepContentRegex = namesToAlternation(parameters.getOutSideTextTagsAndKeepContentElement()?.getTextContent())
                        noteRegex = namesToAlternation(parameters.getNoteElement()?.getTextContent())
                }

                if (skipDoTokenizeStep) {
                        println "No tokenization do to."
                        for (File f : filesToProcess) {
                                File outfile = new File(tokenizedDir, f.getName());
                                if (!FileCopy.copy(f, outfile)) {
                                        println("Failed to copy "+f+" to "+outfile)
                                }
                        }
                        return true;
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File f : filesToProcess) {
                        cpb.tick()
                        File outfile = new File(tokenizedDir, f.getName());
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)
                        if (wordTag != null && wordTag.length() > 0) {
                                tokenizer.setWord_tags(wordTag);
                        }
                        if (outSideTextTagsRegex.length() > 0) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }
                        if (outSideTextTagsAndKeepContentRegex.length() > 0) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }
                        if (noteRegex.length() > 0) {
                                tokenizer.setNote(noteRegex)
                        }

                        // tokenize ! \o/
                        if (!tokenizer.process()) {
                                println("Failed to process "+f)
                                outfile.delete()
                        }
                }

                filesToProcess = tokenizedDir.listFiles()
                println ""

                return true;
        }

        /**
         * Turns a comma separated list of element names into a regex
         * alternation ("a, b" gives "a|b"). Names are trimmed and empty names
         * are dropped, so that a stray comma cannot produce an empty alternative.
         *
         * @param commaSeparatedNames the list, may be null
         * @return the alternation, "" when there is no name
         */
        private static String namesToAlternation(String commaSeparatedNames) {
                if (commaSeparatedNames == null) return "";
                return commaSeparatedNames.split(",").collect { it.trim() }.findAll { it.length() > 0 }.join("|")
        }
355

    
356
        /**
         * Reads from $bindir/tokenized and writes the result in
         * $bindir/tokenized.
         *
         * "filesToProcess" is first reset to the content of $bindir/tokenized.
         * If the "xsl/3-posttok" directory of the source directory contains
         * files, the work is delegated to ApplyXsl2.processWithMultipleXSL().
         *
         * @return the result of the delegate, or true when there is no
         *         stylesheet directory or when it is empty
         */
        public boolean doPostTokenizeXSLStep() {

                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                filesToProcess = tokenizedDir.listFiles()

                File posttokDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = posttokDirectory.listFiles()
                boolean somethingToApply = posttokDirectory.exists() && stylesheets != null && stylesheets.size() > 0

                if (somethingToApply) {
                        println "-- Posttokenize XSL Step with $posttokDirectory"
                        return ApplyXsl2.processWithMultipleXSL(filesToProcess, posttokDirectory, xslParams)
                }
                return true
        }
370

    
371
        /**
         * Injects the metadata loaded by process(), if any, in the "text"
         * element of each file of "filesToProcess". The result is written in a
         * temporary file of the same directory which then replaces the file.
         *
         * A file for which the injection fails is left unchanged and the step
         * goes on.
         *
         * @return false if a file no longer exists after the replacement
         *         attempt ("reason" is set), true otherwise
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) return true; // no metadata.csv file: nothing to inject

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File infile : filesToProcess) {
                        cpb.tick()
                        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
                        if (!metadata.injectMetadatasInXml(infile, outfile, "text", null)) {
                                outfile.delete(); // fail
                                continue;
                        }

                        if (!infile.delete()) {
                                println "Warning can't rename file "+outfile+" to "+infile
                                // the original is intact: do not leave the temporary file among the texts
                                outfile.delete()
                        } else if (!outfile.renameTo(infile)) {
                                println "Warning can't rename file "+outfile+" to "+infile
                        }

                        if (!infile.exists()) {
                                // the temporary file is kept: it is the only copy of the text
                                reason = "Error: could not replace $infile by $outfile"
                                println reason
                                return false;
                        }
                }
                println ""
                return true;
        }
392

    
393
        /**
         * Converts each file of $bindir/tokenized with Xml2Ana and writes the
         * result, under the same file name, in the output directory. A file
         * that fails is reported and its output is deleted; the step goes on.
         * "filesToProcess" is finally set to the content of the output
         * directory.
         *
         * @return false if $bindir/tokenized cannot be listed ("reason" is
         *         set), true otherwise
         */
        public boolean doToXMLTXMStep() {
                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                filesToProcess = tokenizedDir.listFiles()
                if (filesToProcess == null) {
                        // listFiles() returns null when the directory is missing or unreadable
                        reason = "No tokenized file to process in "+tokenizedDir
                        return false;
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")

                // same value for every file: read it once
                String wordElementName = module.getParameters().getWordElement().getTextContent()

                for (File f : filesToProcess) {
                        cpb.tick()
                        String txmfile = f.getName();

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        // the ids of all the respStmt must be listed
                        def respId = [];
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>();
                        // maps a respId to the type attributes of the "ana" property of the txm w element,
                        // used to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>();
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>();
                        // respStmt information
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>();

                        // run the conversion
                        def builder = new Xml2Ana(f);
                        builder.setWordTag(wordElementName)
                        builder.setConvertAllAtrtibutes true;
                        builder.setCorrespondances(correspRef, correspType);
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)

                        File result = new File(outputDirectory, txmfile)
                        if (!builder.process(result)) {
                                println("Failed to process "+f);
                                result.delete();
                        }
                }
                filesToProcess = outputDirectory.listFiles();
                println ""
                return true;
        }
433

    
434
        /**
         * Does nothing: this importer performs no additional check here.
         * "isSuccessFul" is computed at the end of process().
         */
        @Override
        public void checkFiles() {
                // intentionally empty
        }
438

    
439
        /**
         * Records a cancellation request by switching the "cancelNow" flag on.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
443
}