Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xtz / XTZImporter.groovy @ 966

History | View | Annotate | Download (14.9 kB)

1 321 mdecorde
package org.txm.importer.xtz
2 321 mdecorde
3 321 mdecorde
import filters.Tokeniser.SimpleTokenizerXml
4 321 mdecorde
5 321 mdecorde
import java.io.File;
6 321 mdecorde
7 321 mdecorde
import net.sf.saxon.style.XSLParam;
8 321 mdecorde
9 788 mdecorde
import org.txm.core.preferences.TBXPreferences;
10 625 mdecorde
import org.txm.importer.ApplyXsl2
11 625 mdecorde
import org.txm.importer.CleanFile
12 625 mdecorde
import org.txm.importer.MileStoneProjection
13 625 mdecorde
import org.txm.importer.ValidateXml
14 625 mdecorde
import org.txm.*
15 321 mdecorde
16 479 mdecorde
import javax.xml.parsers.SAXParser
17 479 mdecorde
import javax.xml.parsers.SAXParserFactory
18 625 mdecorde
import javax.xml.stream.*
19 321 mdecorde
20 321 mdecorde
import org.w3c.dom.Element
21 479 mdecorde
import org.xml.sax.XMLReader
22 625 mdecorde
import org.txm.utils.xml.DomUtils
23 321 mdecorde
import org.txm.metadatas.Metadatas
24 321 mdecorde
import org.txm.objects.BaseParameters
25 927 mdecorde
import org.txm.importer.xmltxm.Xml2Ana
26 321 mdecorde
import org.txm.stat.utils.ConsoleProgressBar
27 625 mdecorde
import org.txm.utils.io.*
28 321 mdecorde
29 321 mdecorde
class XTZImporter extends Importer {
30 321 mdecorde
31 321 mdecorde
        /** Cancellation flag; set to true by cancel(). It is not read by the code of this class. */
        boolean cancelNow = false

        /** Files handed over from one import step to the next one. */
        def filesToProcess = []

        // Working directories; declared here but neither assigned nor read in this class.
        File frontXSLResultDirectory
        File tokenizedDirectory
        File sentencedDirectory

        /** Value of the "xsl" attribute of the corpus XSLT element, read in process(). */
        String xslPath
        /** XSLT parameters of the corpus, read in process() and passed to every XSL step. */
        def xslParams
        /** Value of the "lang" attribute of the corpus element; given to the tokenizer. */
        String lang
        /** Text content of the word element parameter; name of the word tag. */
        String wordTag
        /** When true, doTokenizeStep() copies the files instead of tokenizing them. */
        boolean skipDoTokenizeStep = false

        /** Text metadata; stays null when no metadata file is found by process(). */
        Metadatas metadata = null
45 321 mdecorde
46 321 mdecorde
        /**
         * Creates an importer bound to an import module.
         *
         * @param module the import module, forwarded unchanged to the Importer constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
49 479 mdecorde
50 321 mdecorde
        /**
         * Gives access to the text metadata.
         *
         * @return the metadata built by process(), or null when none has been loaded
         */
        public Metadatas getMetadata() {
                return metadata
        }
53 321 mdecorde
54 321 mdecorde
        /**
         * Runs the whole import: checks that the input directory contains at least
         * one candidate source file, reads the import parameters, loads the metadata
         * file if there is one, then chains the import steps. The chain stops at the
         * first step that returns false.
         *
         * On an early exit the "reason" field is set when a message is available.
         * On completion "isSuccessFul" is true only if at least one file is left in
         * filesToProcess. When the CLEAN key-value parameter equals "true", the
         * "tokenized" and "src" directories of the binary directory are deleted.
         */
        public void process() {

                BaseParameters params = module.getParameters()
                File binDir = module.getBinaryDirectory()

                String corpusname = params.getCorpusName()
                Element corpusElem = params.corpora.get(corpusname)

                def files = inputDirectory.listFiles()
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return // no file to process in the directory
                }

                // keep only the files that can be import sources
                def ignoredSuffixes = [".properties", ".csv", ".dtd", ".xsl", "~"]
                def srcFiles = []
                for (File f : files) {
                        String name = f.getName()
                        boolean ignored = f.isHidden() || f.isDirectory() ||
                                        name.equals("import.xml") || name.startsWith(".") ||
                                        ignoredSuffixes.any { suffix -> name.endsWith(suffix) }
                        if (!ignored) srcFiles << f
                }

                if (srcFiles.size() == 0) {
                        reason = "No suitable file to process in "+inputDirectory
                        return // no file to process in the directory
                }

                // prepare front XSL if any
                xslPath = params.getXsltElement(corpusElem).getAttribute("xsl")
                xslParams = params.getXsltParams(corpusElem)

                lang = corpusElem.getAttribute("lang")

                wordTag = params.getWordElement().getTextContent()
                this.skipDoTokenizeStep = params.getSkipTokenization()

                // prepare metadata if any; the file is copied in the binary directory and read from there
                File allmetadatafile = Metadatas.findMetadataFile(inputDirectory)
                if (allmetadatafile != null && allmetadatafile.exists()) {
                        File copy = new File(binDir, allmetadatafile.getName())
                        if (!FileCopy.copy(allmetadatafile, copy)) {
                                reason = "Could not create a copy of the metadata file "+allmetadatafile.getAbsoluteFile()
                                println "Error: could not create a copy of the metadata file "+allmetadatafile.getAbsoluteFile();
                                return
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
                                        Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
                                        Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
                }

                String cleanDirectories = params.getKeyValueParameters().get(ImportKeys.CLEAN)

                // each step returns false to abort the import
                if (!doFixSurrogates()) return
                if (!doSplitMergeXSLStep()) return
                if (!doFrontXSLStep()) return
                if (!doCheckXMLFileStep()) return
                if (!doTokenizeStep()) return
                if (!doPostTokenizeXSLStep()) return
                if (!doEncodeMileStonesStep()) return
                if (!doToXMLTXMStep()) return
                if (!doInjectMetadataStep()) return

                isSuccessFul = filesToProcess.size() > 0

                if ("true".equals(cleanDirectories)) {
                        new File(module.getBinaryDirectory(), "tokenized").deleteDir()
                        new File(module.getBinaryDirectory(), "src").deleteDir()
                }
        }
137 321 mdecorde
138 321 mdecorde
        /**
         * Copies every entry of the input directory into $bindir/src. Files whose
         * lower-cased name ends with ".xml" go through
         * CleanFile.removeSurrogateFromXmlFile, the others through FileCopy.copyFiles.
         * The input directory of the following steps becomes $bindir/src.
         *
         * @return false (with "reason" set) when the input directory cannot be listed, true otherwise
         */
        public boolean doFixSurrogates() {
                File fixedSourcesDirectory = new File(module.getBinaryDirectory(), "src")
                fixedSourcesDirectory.mkdir()

                def sources = inputDirectory.listFiles()
                if (sources == null) {
                        reason = "No file to process in $inputDirectory"
                        return false
                }

                for (File source : sources) {
                        File target = new File(fixedSourcesDirectory, source.getName())
                        boolean isXml = source.getName().toLowerCase().endsWith(".xml")
                        if (isXml) {
                                CleanFile.removeSurrogateFromXmlFile(source, target)
                        } else {
                                FileCopy.copyFiles(source, target)
                        }
                }

                // the files to process are now in the "src" directory
                inputDirectory = fixedSourcesDirectory
                return true
        }
166 479 mdecorde
167 479 mdecorde
        /**
         * Applies the split-merge stylesheet, if any, to the files of the input
         * directory. The "output-directory" XSL parameter is set to the URI of
         * $bindir/src.
         *
         * Only one XSL is applied: the first regular, non-hidden "*.xsl" file of
         * $srcdir/xsl/1-split-merge, in sorted order. When that directory contains
         * at least one file, the input directory of the following steps becomes
         * $bindir/src.
         *
         * @return false (with "reason" set) when the stylesheet could not be applied, true otherwise
         */
        public boolean doSplitMergeXSLStep() {
                File srcDirectory = new File(module.getBinaryDirectory(), "src")
                srcDirectory.mkdir()
                xslParams["output-directory"] = srcDirectory.getAbsoluteFile().toURI().toString()

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"
                def xslFiles = xslDirectory.listFiles()
                if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams, false)) {
                                // report the stylesheet that failed and abort the import
                                reason = "Fail to apply split-merge XSL: $xslFile"
                                return false
                        }
                        println ""
                        break // THERE IS ONLY ONE XSL FILE TO APPLY
                }

                // the files to process are now in the "src" directory
                inputDirectory = srcDirectory
                return true
        }
201 479 mdecorde
202 321 mdecorde
        /**
         * Projects each milestone element declared in the import parameters onto
         * the files of $bindir/tokenized, using MileStoneProjection with the "text"
         * start tag and the current word tag. Each file is rewritten in place
         * through a temporary file created in the same directory.
         *
         * The milestone names are read as a comma separated list; blank names are
         * ignored. Nothing is done when the list is empty.
         *
         * @return false (with "reason" set) as soon as a file cannot be encoded or replaced, true otherwise
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getParameters().getMilestonesElement().getTextContent().trim()
                if (milestonesString.length() == 0) return true

                def milestones = milestonesString.split(",")
                if (milestones.size() == 0) return true

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (def milestone : milestones) {
                        milestone = milestone.trim() // just in case
                        cpb.tick()
                        if (milestone.length() == 0) continue // e.g. ",lb": no element name to project

                        for (File inputFile : filesToProcess) {
                                File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile())
                                println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        reason = "Fail to encode $milestone in $inputFile"
                                        println "Fail to encode $milestone in $inputFile"
                                        outputFile.delete() // do not leave the temporary file among the tokenized files
                                        return false
                                }
                                if (!inputFile.delete()) {
                                        reason = "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        outputFile.delete() // the original is still there, drop the temporary result
                                        return false
                                }
                                if (!outputFile.renameTo(inputFile)) {
                                        // the original is already deleted: keep the temporary file, it holds the only copy
                                        reason = "Fail to encode $milestone in ${inputFile}: could not rename $outputFile."
                                        println "Fail to encode $milestone in ${inputFile}: could not rename $outputFile."
                                        return false
                                }
                        }
                }
                println ""
                return true
        }
238 321 mdecorde
239 321 mdecorde
        /**
         * Applies, in sorted order, every regular, non-hidden "*.xsl" file of
         * $srcdir/xsl/2-front. Each stylesheet reads the current input directory
         * and writes into the output directory; after the first successful
         * stylesheet the input directory becomes the output directory.
         *
         * Nothing is done when the XSL directory is missing or empty.
         *
         * @return false (with "reason" set) as soon as a stylesheet fails, true otherwise
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with $frontXSLdirectory"
                def xslFiles = frontXSLdirectory.listFiles()
                if (!frontXSLdirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                xslFiles.sort()
                for (File xslFile : xslFiles) {
                        if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue

                        if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
                                // report the stylesheet that actually failed
                                reason = "Fail to apply front XSL: $xslFile"
                                return false
                        }
                        inputDirectory = outputDirectory // the files to process are now in the output directory
                        println ""
                }
                return true
        }
270 321 mdecorde
271 321 mdecorde
        /**
         * Rebuilds filesToProcess with the XML files of the input directory that
         * pass ValidateXml.test.
         *
         * A file is tested when its lower-cased name ends with ".xml", it is not
         * hidden, is readable, is not a directory and its name does not start with
         * "metadata". "import.xml" is tested but never kept.
         *
         * @return false when the directory is empty or no file is kept ("reason" is set in the latter case), true otherwise
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Check XML files for well-formedness."
                def files = inputDirectory.listFiles()

                if (files == null || files.size() == 0) {
                        println "No XML file (*.xml) to process. Aborting"
                        return false
                }
                files = files.sort()

                ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
                for (File candidate : files) {
                        cpb.tick()
                        String name = candidate.getName()
                        if (!name.toLowerCase().endsWith(".xml")) continue

                        boolean eligible = !candidate.isHidden() && candidate.canRead() &&
                                        !name.endsWith(".properties") && !name.startsWith("metadata") &&
                                        !candidate.isDirectory()
                        if (!eligible) continue

                        // the validation runs first, so "import.xml" is validated even though it is never kept
                        if (ValidateXml.test(candidate) && !name.equals("import.xml")) {
                                filesToProcess << candidate
                        }
                }
                println ""

                if (filesToProcess.size() == 0) {
                        reason = "No file suitable to process in "+inputDirectory
                        return false
                }
                return true
        }
306 321 mdecorde
307 321 mdecorde
308 321 mdecorde
        /**
         * Writes one file per entry of filesToProcess into $bindir/tokenized.
         *
         * When skipDoTokenizeStep is set the files are only copied. Otherwise each
         * file is processed by a SimpleTokenizerXml configured with the language,
         * the word tag and, when the textual plans parameter is present, the
         * "outside text", "outside text but keep content" and "note" element names.
         * Those names are read as comma separated lists and joined with "|".
         * The output of a file that fails to tokenize is deleted.
         *
         * After a real tokenization filesToProcess lists the content of $bindir/tokenized.
         *
         * @return always true
         */
        public boolean doTokenizeStep() {

                println "-- Tokenizing "+filesToProcess.size()+" files"
                File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
                tokenizedDir.mkdir()

                // "a, b ,c" -> "a|b|c"
                def toAlternation = { Element listElement ->
                        listElement.getTextContent().split(",").collect { it.trim() }.join("|")
                }

                String outSideTextTagsRegex = ""
                String outSideTextTagsAndKeepContentRegex = ""
                String noteRegex = ""
                // get the element names to ignore
                if (module.getParameters().getTextualPlans() != null) {
                        outSideTextTagsRegex = toAlternation(module.getParameters().getOutSideTextTagsElement())
                        outSideTextTagsAndKeepContentRegex = toAlternation(module.getParameters().getOutSideTextTagsAndKeepContentElement())
                        noteRegex = toAlternation(module.getParameters().getNoteElement())
                }

                if (skipDoTokenizeStep) {
                        println "No tokenization do to."
                        for (File source : filesToProcess) {
                                FileCopy.copy(source, new File(tokenizedDir, source.getName()))
                        }
                        return true
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File source : filesToProcess) {
                        cpb.tick()
                        File tokenized = new File(tokenizedDir, source.getName())
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(source, tokenized, lang)
                        if (wordTag.length() > 0) {
                                tokenizer.setWord_tags(wordTag)
                        }
                        // set whatever the word tag is, even when it is empty
                        tokenizer.setWord_element_to_create(wordTag)

                        if (outSideTextTagsRegex.trim().length() > 0) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }
                        if (outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }
                        if (noteRegex.trim().length() > 0) {
                                tokenizer.setNote(noteRegex)
                        }

                        if (!tokenizer.process()) {
                                println("Failed to process "+source)
                                tokenized.delete()
                        }
                }

                filesToProcess = tokenizedDir.listFiles()
                println ""

                return true
        }
393 321 mdecorde
394 321 mdecorde
        /**
         * Applies the stylesheets of $srcdir/xsl/3-posttok to the files of
         * $bindir/tokenized through ApplyXsl2.processWithMultipleXSL.
         * filesToProcess is reset to the content of $bindir/tokenized first.
         *
         * @return true when the XSL directory is missing or empty, otherwise the result of ApplyXsl2.processWithMultipleXSL
         */
        public boolean doPostTokenizeXSLStep() {

                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()

                File postTokXslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = postTokXslDirectory.listFiles()
                boolean hasStylesheets = postTokXslDirectory.exists() && stylesheets != null && stylesheets.size() > 0
                if (!hasStylesheets) {
                        return true
                }

                println "-- Posttokenize XSL Step with $postTokXslDirectory"
                return ApplyXsl2.processWithMultipleXSL(filesToProcess, postTokXslDirectory, xslParams)
        }
408 321 mdecorde
409 321 mdecorde
        /**
         * Injects the metadata into the "text" element of every file of
         * filesToProcess. Each file is rewritten in place through a temporary file
         * created in the same directory. Nothing is done when no metadata has been
         * loaded.
         *
         * A file whose injection fails is left untouched and its temporary file is deleted.
         *
         * @return false when a file no longer exists after the replacement attempt, true otherwise
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) {
                        return true
                }

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File infile : filesToProcess) {
                        cpb.tick()
                        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())
                        if (!metadata.injectMetadatasInXml(infile, outfile, "text", null)) {
                                outfile.delete() // fail
                                continue
                        }

                        boolean replaced = infile.delete() && outfile.renameTo(infile)
                        if (!replaced) {
                                println "Warning can't rename file "+outfile+" to "+infile
                        }
                        if (!infile.exists()) {
                                println "Error: could not replace $infile by $outfile"
                                return false
                        }
                }
                println ""
                return true
        }
430 321 mdecorde
431 321 mdecorde
        /**
         * Builds one XML-TXM file in the output directory for each file of
         * $bindir/tokenized, using Xml2Ana. The output of a file that fails to be
         * processed is deleted. filesToProcess finally lists the content of the
         * output directory.
         *
         * @return always true
         */
        public boolean doToXMLTXMStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")

                for (File tokenizedFile : filesToProcess) {
                        cpb.tick()
                        File txmFile = new File(outputDirectory, tokenizedFile.getName())

                        // all the header structures below are handed over empty to the builder
                        def correspType = new HashMap<String, String>()
                        def correspRef = new HashMap<String, String>()
                        // ids of all the respStmt
                        def respId = []
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String, HashMap<String, String>>()
                        // maps a respId to the type attributes of the ana property of the TXM w,
                        // to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String, String[]>()
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String, HashMap<String, String>>()
                        // respStmt informations: resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String, String[]>()

                        // run the conversion
                        def builder = new Xml2Ana(tokenizedFile)
                        builder.setWordTag(module.getParameters().getWordElement().getTextContent())
                        builder.setConvertAllAtrtibutes(true)
                        builder.setCorrespondances(correspRef, correspType)
                        builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
                        if (!builder.process(txmFile)) {
                                println("Failed to process "+tokenizedFile)
                                txmFile.delete()
                        }
                }

                filesToProcess = outputDirectory.listFiles()
                println ""
                return true
        }
471 321 mdecorde
472 321 mdecorde
        /**
         * Does nothing: this importer performs no additional check here, the
         * success flag is computed at the end of process().
         */
        @Override
        public void checkFiles() {
                // intentionally empty
        }
476 321 mdecorde
477 321 mdecorde
        /**
         * Records a cancellation request by raising the cancelNow flag.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
481 321 mdecorde
}