Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZImporter.groovy @ 2126

History | View | Annotate | Download (14.9 kB)

1 986 mdecorde
package org.txm.scripts.importer.xtz
2 321 mdecorde
3 986 mdecorde
import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml
4 321 mdecorde
5 321 mdecorde
import java.io.File;
6 321 mdecorde
7 321 mdecorde
import net.sf.saxon.style.XSLParam;
8 321 mdecorde
9 788 mdecorde
import org.txm.core.preferences.TBXPreferences;
10 1000 mdecorde
import org.txm.importer.ApplyXsl2
11 986 mdecorde
import org.txm.scripts.importer.CleanFile
12 986 mdecorde
import org.txm.scripts.importer.MileStoneProjection
13 1000 mdecorde
import org.txm.importer.ValidateXml
14 625 mdecorde
import org.txm.*
15 321 mdecorde
16 479 mdecorde
import javax.xml.parsers.SAXParser
17 479 mdecorde
import javax.xml.parsers.SAXParserFactory
18 625 mdecorde
import javax.xml.stream.*
19 321 mdecorde
20 321 mdecorde
import org.w3c.dom.Element
21 479 mdecorde
import org.xml.sax.XMLReader
22 625 mdecorde
import org.txm.utils.xml.DomUtils
23 321 mdecorde
import org.txm.metadatas.Metadatas
24 1135 mdecorde
import org.txm.objects.*
25 1000 mdecorde
import org.txm.importer.scripts.xmltxm.Xml2Ana
26 1613 mdecorde
import org.txm.utils.ConsoleProgressBar
27 625 mdecorde
import org.txm.utils.io.*
28 1000 mdecorde
import org.txm.importer.xtz.*
29 321 mdecorde
30 321 mdecorde
class XTZImporter extends Importer {
31 321 mdecorde
32 2080 mdecorde
        /** Cancellation flag, switched to true by cancel(). */
        public boolean cancelNow = false
        /** Files handled by the current step; most steps reassign it. */
        public def filesToProcess = []

        // Working directories; none of them is assigned in the code visible in this class.
        public File frontXSLResultDirectory
        public File tokenizedDirectory
        public File sentencedDirectory

        /** Value of project.getFrontXSL(), read in process(). */
        public String xslPath
        /** Value of project.getXsltParameters(), read in process(); passed to the XSL steps. */
        public def xslParams
        /** Value of project.getLang(), read in process(); given to the tokenizer. */
        public String lang
        /** Value of project.getTokenizerWordElement(), read in process(). */
        public String wordTag
        /** Value of project.getDoTokenizerStep(); when false the sources are only copied to "tokenized". */
        public boolean doTokenizeStep = false

        /** Text metadata, loaded by process() when a metadata file is found; null otherwise. */
        public Metadatas metadata = null
46 321 mdecorde
47 321 mdecorde
        /**
         * Builds an importer for the given import module.
         *
         * @param module the import module, handed over to the Importer super constructor
         */
        public XTZImporter(ImportModule module) {
                super(module)
        }
50 479 mdecorde
51 321 mdecorde
        /**
         * @return the text metadata loaded by process(), or null when no metadata was loaded
         */
        public Metadatas getMetadata() {
                return this.metadata
        }
54 321 mdecorde
55 321 mdecorde
        /**
         * Runs the whole import pipeline on the files of inputDirectory.
         *
         * The method:
         * - checks that the input directory holds at least one candidate source file,
         * - reads the import parameters from the project,
         * - copies and loads the metadata file when there is one,
         * - calls the steps in order (fix surrogates, split-merge XSL, front XSL,
         *   XML check, tokenization, post-tokenization XSL, milestones, XML-TXM, metadata injection).
         *
         * The method returns as soon as a step fails. "isSuccessFul" is only set when all
         * the steps succeeded, and every abort path of this method sets "reason".
         */
        public void process() {

                Project project = module.getProject()
                File binDir = module.getBinaryDirectory()

                def files = inputDirectory.listFiles()
                if (files == null) {
                        reason = "No file to process in "+inputDirectory
                        return // no file to process in the directory
                }

                // a candidate source is a visible regular file which is neither an import
                // parameter file, a metadata table, a DTD, a stylesheet nor an editor backup
                def ignoredSuffixes = [".properties", ".csv", ".dtd", ".xsl", "~"]
                def srcFiles = files.findAll { File f ->
                        String name = f.getName()
                        boolean ignored = f.isHidden() || f.isDirectory() ||
                                        name.equals("import.xml") || name.startsWith(".") ||
                                        ignoredSuffixes.any { name.endsWith(it) }
                        return !ignored
                }

                if (srcFiles.isEmpty()) {
                        reason = "No suitable file to process in "+inputDirectory
                        return // no file to process in the directory
                }

                // import parameters
                xslPath = project.getFrontXSL()
                xslParams = project.getXsltParameters()
                lang = project.getLang()
                wordTag = project.getTokenizerWordElement()
                this.doTokenizeStep = project.getDoTokenizerStep()

                // prepare metadata if any: work on a copy stored in the binary directory
                File allMetadataFile = Metadatas.findMetadataFile(inputDirectory)
                if (allMetadataFile.exists()) {
                        File copy = new File(binDir, allMetadataFile.getName())
                        if (!FileCopy.copy(allMetadataFile, copy)) {
                                // set the failure reason like the other abort paths do
                                reason = "Could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile()
                                println "Error: could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile();
                                return
                        }
                        metadata = new Metadatas(copy,
                                        Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
                                        Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
                                        Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
                }

                String cleanDirectories = project.getCleanAfterBuild()

                // each step returns false when the import must stop
                if (!doFixSurrogates()) return
                if (!doSplitMergeXSLStep()) return
                if (!doFrontXSLStep()) return
                if (!doCheckXMLFileStep()) return
                if (!doTokenizeStep()) return
                if (!doPostTokenizeXSLStep()) return
                if (!doEncodeMileStonesStep()) return
                if (!doToXMLTXMStep()) return
                if (!doInjectMetadataStep()) return

                isSuccessFul = filesToProcess.size() > 0

                // the intermediate directories are only removed after a complete run
                if ("true".equals(cleanDirectories)) {
                        for (String name : ["tokenized", "src", "split"]) {
                                new File(binDir, name).deleteDir()
                        }
                }
        }
138 321 mdecorde
139 321 mdecorde
        /**
         * Copies the content of inputDirectory to $bindir/src.
         *
         * Files whose name ends with ".xml" (case insensitive) go through
         * CleanFile.removeSurrogateFromXmlFile, the other ones are copied with FileCopy.copyFiles.
         * The "src" directory is deleted and re-created first, and inputDirectory
         * points to it when the method returns true.
         *
         * @return false (with "reason" set) when inputDirectory cannot be listed, true otherwise
         */
        public boolean doFixSurrogates() {
                def sources = inputDirectory.listFiles()
                if (sources == null) {
                        reason = "No file to process in $inputDirectory"
                        return false
                }

                File cleanedDirectory = new File(module.getBinaryDirectory(), "src")
                cleanedDirectory.deleteDir() // clean before copying
                cleanedDirectory.mkdir()

                for (File source : sources) {
                        File target = new File(cleanedDirectory, source.getName())
                        boolean isXml = source.getName().toLowerCase().endsWith(".xml")
                        if (!isXml) {
                                FileCopy.copyFiles(source, target)
                        } else {
                                CleanFile.removeSurrogateFromXmlFile(source, target)
                        }
                }

                // the files to process are now in the "src" directory
                inputDirectory = cleanedDirectory
                return true
        }
169 479 mdecorde
170 479 mdecorde
        /**
         * Applies the split-merge stylesheet, if any, to the files of inputDirectory.
         *
         * The stylesheet is the first "*.xsl" file (in sorted order) of the
         * "xsl/1-split-merge" folder of the source directory; only one XSL is applied.
         * The "$bindir/split" directory is given to the stylesheet through the
         * "output-directory" parameter and becomes the new inputDirectory once the
         * stylesheet has been applied.
         *
         * When there is no stylesheet to apply, inputDirectory is left untouched.
         *
         * @return false (with "reason" set) when the stylesheet could not be applied, true otherwise
         */
        public boolean doSplitMergeXSLStep() {

                File splitedDirectory = new File(module.getBinaryDirectory(), "split")
                splitedDirectory.deleteDir() // remove the result of a previous run

                File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
                println "-- Split-Merge XSL Step with $xslDirectory"
                def xslFiles = xslDirectory.listFiles()
                if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
                        return true // nothing to do
                }

                // THERE IS ONLY ONE XSL FILE TO APPLY: the first regular, visible *.xsl file
                File xslFile = xslFiles.sort().find { File f ->
                        !f.isDirectory() && !f.isHidden() && f.getName().endsWith(".xsl")
                }
                if (xslFile == null) {
                        // no stylesheet in the folder: keep reading the sources from inputDirectory
                        // instead of switching to an empty "split" directory
                        return true
                }

                splitedDirectory.mkdir()
                xslParams["output-directory"] = splitedDirectory.getAbsoluteFile().toURI().toString()

                if (!ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams)) {
                        // stop the import like the front XSL step does, and name the stylesheet that failed
                        reason = "Fail to apply split-merge XSL: $xslFile"
                        return false
                }
                println ""

                inputDirectory = splitedDirectory // the files to process are now in the "split" directory
                return true
        }
207 479 mdecorde
208 321 mdecorde
        /**
         * Projects the milestone elements declared in the "MileStones" textual plan
         * of the project onto the files of $bindir/tokenized.
         *
         * The plan value is a comma separated list of element names. For each name and
         * each tokenized file, MileStoneProjection writes its result in a temporary file
         * created next to the source file; the temporary file then replaces the source file.
         *
         * No extra file is left in the "tokenized" directory when a file is replaced:
         * that directory is listed again by the following steps.
         *
         * @return false (with "reason" set) when a projection or a file replacement failed, true otherwise
         */
        public boolean doEncodeMileStonesStep() {
                filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
                def milestonesString = module.getProject().getTextualPlan("MileStones")
                if (milestonesString == null || milestonesString.length() == 0) return true

                def milestones = milestonesString.split(",")
                if (milestones.size() == 0) return true

                println "-- Encoding milestone $milestones into XML files..."
                ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
                for (def milestone : milestones) {
                        milestone = milestone.trim() // just in case
                        cpb.tick()
                        if (milestone.length() == 0) continue // blank entry such as in "lb,,pb"

                        for (File inputFile : filesToProcess) {
                                File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile())
                                println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
                                MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
                                if (!msp.process(outputFile)) {
                                        reason = "Fail to encode $milestone in $inputFile"
                                        println "Fail to encode $milestone in $inputFile"
                                        outputFile.delete() // do not leave the temporary file among the tokenized files
                                        return false
                                }

                                if (!inputFile.delete()) {
                                        reason = "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        println "Fail to encode $milestone in ${inputFile}: could not replace the file."
                                        outputFile.delete() // the source file is still there, drop the temporary one
                                        return false
                                }

                                if (!outputFile.renameTo(inputFile)) {
                                        // the source file is gone: keep the temporary file, it holds the only copy
                                        reason = "Fail to encode $milestone in ${inputFile}: could not rename $outputFile"
                                        println "Fail to encode $milestone in ${inputFile}: could not rename $outputFile"
                                        return false
                                }
                        }
                }
                println ""
                return true
        }
245 321 mdecorde
246 321 mdecorde
        /**
         * Applies, in sorted order, every "*.xsl" file of the "xsl/2-front" folder of the
         * source directory: each stylesheet reads from inputDirectory and writes in outputDirectory.
         *
         * After a stylesheet has been applied, inputDirectory is set to outputDirectory.
         * The "output-directory" XSL parameter is set to outputDirectory
         * even when there is no stylesheet to apply.
         *
         * @return false (with "reason" set) as soon as a stylesheet fails, true otherwise
         */
        public boolean doFrontXSLStep() {

                File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
                println "-- Front XSL Step with the $frontXSLdirectory directory."
                def stylesheets = frontXSLdirectory.listFiles()

                xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString()

                boolean nothingToApply = !frontXSLdirectory.exists() || stylesheets == null || stylesheets.size() == 0
                if (nothingToApply) {
                        return true
                }

                stylesheets.sort()
                for (File stylesheet : stylesheets) {
                        boolean usable = !stylesheet.isDirectory() && !stylesheet.isHidden() && stylesheet.getName().endsWith(".xsl")
                        if (!usable) continue

                        if (!ApplyXsl2.processImportSources(stylesheet, inputDirectory, outputDirectory, xslParams)) {
                                reason = "Fail to apply front XSL: $xslPath"
                                return false
                        }
                        inputDirectory = outputDirectory // the files to process are now in the "txm" directory
                        println ""
                }
                return true
        }
280 321 mdecorde
281 321 mdecorde
        /**
         * Selects the files of inputDirectory to import and stores them in filesToProcess.
         *
         * A file is kept when its name ends with ".xml" (case insensitive), it is a readable,
         * visible, regular file, its name does not start with "metadata", ValidateXml.test
         * accepts it and it is not "import.xml".
         *
         * @return false when the directory is empty or when no file was kept
         *         ("reason" is only set in the second case), true otherwise
         */
        public boolean doCheckXMLFileStep() {
                filesToProcess = []
                println "-- Checking XML files for well-formedness."

                def candidates = inputDirectory.listFiles()
                if (candidates == null || candidates.size() == 0) {
                        println "No XML file (*.xml) to process. Aborting"
                        return false
                }
                candidates = candidates.sort()

                ConsoleProgressBar cpb = new ConsoleProgressBar(candidates.size())
                for (File candidate : candidates) {
                        cpb.tick()

                        if (!candidate.getName().toLowerCase().endsWith(".xml")) continue

                        boolean rejected = candidate.isHidden() || !candidate.canRead() ||
                                        candidate.getName().endsWith(".properties") ||
                                        candidate.getName().startsWith("metadata") || candidate.isDirectory()
                        if (rejected) continue

                        // import.xml is tested like the others and only excluded afterwards
                        if (!ValidateXml.test(candidate)) continue
                        if (candidate.getName().equals("import.xml")) continue

                        filesToProcess << candidate
                }
                println ""

                if (filesToProcess.size() == 0) {
                        reason = "No file suitable to process in "+inputDirectory
                        return false
                }
                return true
        }
316 321 mdecorde
317 321 mdecorde
318 321 mdecorde
        /**
         * Fills $bindir/tokenized from the files of filesToProcess.
         *
         * When the doTokenizeStep field is false the files are only copied. Otherwise each
         * file is tokenized with SimpleTokenizerXml, configured with the "OutSideTextTags",
         * "OutSideTextTagsAndKeepContent" and "Note" textual plans of the project: each plan
         * is a comma separated list of names turned into a "a|b|c" alternation.
         * The output of a file that fails to tokenize is deleted and the loop goes on.
         *
         * After a tokenization, filesToProcess is the content of the "tokenized" directory.
         *
         * @return always true
         */
        public boolean doTokenizeStep() {

                new File(module.getBinaryDirectory(),"tokenized").mkdir()

                // "a, b ,c" -> "a|b|c" ; an empty value gives an empty regex
                def toAlternation = { String csv -> csv.split(",").collect { it.trim() }.join("|") }

                // get the element names to ignore
                String outsideNames = module.getProject().getTextualPlan("OutSideTextTags")
                String outSideTextTagsRegex = toAlternation(outsideNames)

                String outsideKeepNames = module.getProject().getTextualPlan("OutSideTextTagsAndKeepContent")
                String outSideTextTagsAndKeepContentRegex = toAlternation(outsideKeepNames)

                String noteNames = module.getProject().getTextualPlan("Note")
                String noteRegex = toAlternation(noteNames)

                if (!doTokenizeStep) {
                        println "No tokenization do to."
                        for (File source : filesToProcess) {
                                FileCopy.copy(source, new File(module.getBinaryDirectory(),"tokenized/"+source.getName()))
                        }
                        return true
                }

                println "-- Tokenizing "+filesToProcess.size()+" files"
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File source : filesToProcess) {
                        cpb.tick()
                        File tokenized = new File(module.getBinaryDirectory(),"tokenized/"+source.getName())
                        SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(source, tokenized, lang)

                        // a setter is only called for a non blank regex
                        if (outSideTextTagsRegex?.trim()) {
                                tokenizer.setOutSideTextTags(outSideTextTagsRegex)
                        }
                        if (outSideTextTagsAndKeepContentRegex?.trim()) {
                                tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
                        }
                        if (noteRegex?.trim()) {
                                tokenizer.setNote(noteRegex)
                        }

                        // tokenize !
                        boolean tokenizedOk = tokenizer.process()
                        if (!tokenizedOk) {
                                println("Failed to process "+source)
                                tokenized.delete()
                        }
                }

                filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
                println ""

                return true
        }
397 321 mdecorde
398 321 mdecorde
        /**
         * Hands the files of $bindir/tokenized and the "xsl/3-posttok" folder of the source
         * directory over to ApplyXsl2.processWithMultipleXSL.
         *
         * filesToProcess is set to the content of the "tokenized" directory in every case.
         *
         * @return true when the folder is missing or empty, the result of ApplyXsl2 otherwise
         */
        public boolean doPostTokenizeXSLStep() {

                filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()

                File postTokDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
                def stylesheets = postTokDirectory.listFiles()
                boolean hasStylesheets = postTokDirectory.exists() && stylesheets != null && stylesheets.size() > 0
                if (hasStylesheets) {
                        println "-- Posttokenize XSL Step with $postTokDirectory"
                        return ApplyXsl2.processWithMultipleXSL(filesToProcess, postTokDirectory, xslParams)
                }
                return true
        }
412 321 mdecorde
413 321 mdecorde
        /**
         * Injects the metadata, when some were loaded, in each file of filesToProcess.
         *
         * The result is written in a temporary file created next to the source file, which
         * then replaces the source file. The temporary file is removed whenever the source
         * file is kept (injection failure or source file that cannot be deleted), so that no
         * stray "temp*.xml" file stays among the texts.
         *
         * @return false (with "reason" set) when a source file was deleted but could not be
         *         replaced, true otherwise (including when there is no metadata)
         */
        public boolean doInjectMetadataStep() {
                if (metadata == null) return true // nothing to inject

                println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                for (File infile : filesToProcess) {
                        cpb.tick()
                        File outfile = File.createTempFile("temp", ".xml", infile.getParentFile())

                        if (!metadata.injectMetadatasInXml(infile, outfile, "text", null)) {
                                outfile.delete() // fail: the source file is kept as it is
                                continue
                        }

                        if (!infile.delete()) {
                                // the source file is still there: warn and drop the temporary file
                                println "Warning can't rename file "+outfile+" to "+infile
                                outfile.delete()
                                continue
                        }

                        if (!outfile.renameTo(infile)) {
                                println "Warning can't rename file "+outfile+" to "+infile
                        }
                        if (!infile.exists()) {
                                // the temporary file is kept: it holds the only copy of the text
                                reason = "Could not replace $infile by $outfile"
                                println "Error: could not replace $infile by $outfile"
                                return false
                        }
                }
                cpb.done()
                return true
        }
434 321 mdecorde
435 321 mdecorde
        /**
         * Builds one XML-TXM file in outputDirectory per file of $bindir/tokenized, using Xml2Ana.
         *
         * The output of a file that fails to convert is deleted and the loop goes on.
         * When the method returns true, filesToProcess is the content of outputDirectory
         * (an empty list when the directory cannot be listed).
         *
         * @return false (with "reason" set) when the "tokenized" directory cannot be listed, true otherwise
         */
        public boolean doToXMLTXMStep() {
                File tokenizedDir = new File(module.getBinaryDirectory(),"tokenized")
                filesToProcess = tokenizedDir.listFiles()
                if (filesToProcess == null) {
                        // listFiles() returns null for a missing or unreadable directory
                        reason = "No tokenized file to process in "+tokenizedDir
                        return false
                }

                ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
                println("-- Building XML-TXM ("+filesToProcess.size()+" files)")
                for (File f : filesToProcess) {
                        cpb.tick()
                        File file = f
                        String txmfile = f.getName()

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()
                        // the ids of all the respStmt must be listed
                        def respId = []
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>()
                        // maps a respId to the type attributes of the ana property of the TXM w element,
                        // to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>()
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>()
                        // respStmt informations
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>()

                        // run the conversion
                        def builder = new Xml2Ana(file)
                        builder.setWordTag(module.getProject().getTokenizerWordElement())
                        builder.setConvertAllAtrtibutes true
                        builder.setCorrespondances(correspRef, correspType)
                        builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                        //builder.setAddTEIHeader();
                        if (!builder.process(new File(outputDirectory, txmfile))) {
                                println("Failed to process "+f)
                                new File(outputDirectory,txmfile).delete()
                        }
                }

                filesToProcess = outputDirectory.listFiles()
                if (filesToProcess == null) {
                        // keep the later filesToProcess.size() calls safe
                        filesToProcess = []
                }
                cpb.done()
                return true
        }
475 321 mdecorde
476 321 mdecorde
        /**
         * Performs no check: the former test on the listing of outputDirectory is disabled.
         */
        @Override
        public void checkFiles() {
                // disabled: isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null
        }
480 321 mdecorde
481 321 mdecorde
        /**
         * Raises the cancelNow flag.
         */
        @Override
        public void cancel() {
                this.cancelNow = true
        }
485 321 mdecorde
}