root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xtz / XTZImporter.groovy @ 2126
History | View | Annotate | Download (14.9 kB)
1 | 986 | mdecorde | package org.txm.scripts.importer.xtz
|
---|---|---|---|
2 | 321 | mdecorde | |
3 | 986 | mdecorde | import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml |
4 | 321 | mdecorde | |
5 | 321 | mdecorde | import java.io.File; |
6 | 321 | mdecorde | |
7 | 321 | mdecorde | import net.sf.saxon.style.XSLParam; |
8 | 321 | mdecorde | |
9 | 788 | mdecorde | import org.txm.core.preferences.TBXPreferences; |
10 | 1000 | mdecorde | import org.txm.importer.ApplyXsl2 |
11 | 986 | mdecorde | import org.txm.scripts.importer.CleanFile |
12 | 986 | mdecorde | import org.txm.scripts.importer.MileStoneProjection |
13 | 1000 | mdecorde | import org.txm.importer.ValidateXml |
14 | 625 | mdecorde | import org.txm.* |
15 | 321 | mdecorde | |
16 | 479 | mdecorde | import javax.xml.parsers.SAXParser |
17 | 479 | mdecorde | import javax.xml.parsers.SAXParserFactory |
18 | 625 | mdecorde | import javax.xml.stream.* |
19 | 321 | mdecorde | |
20 | 321 | mdecorde | import org.w3c.dom.Element |
21 | 479 | mdecorde | import org.xml.sax.XMLReader |
22 | 625 | mdecorde | import org.txm.utils.xml.DomUtils |
23 | 321 | mdecorde | import org.txm.metadatas.Metadatas |
24 | 1135 | mdecorde | import org.txm.objects.* |
25 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.Xml2Ana |
26 | 1613 | mdecorde | import org.txm.utils.ConsoleProgressBar |
27 | 625 | mdecorde | import org.txm.utils.io.* |
28 | 1000 | mdecorde | import org.txm.importer.xtz.* |
29 | 321 | mdecorde | |
/**
 * Importer of the XTZ import module.
 *
 * {@link #process()} chains the following steps, each one reading the files left by the
 * previous one: surrogate fixing, split-merge XSL, front XSL, XML well-formedness check,
 * tokenization, post-tokenization XSL, milestone encoding, XML-TXM building and metadata
 * injection. Every step returns false to abort the chain; when available the cause is
 * stored in the inherited <code>reason</code> field.
 */
class XTZImporter extends Importer {

	/**
	 * Set to true by {@link #cancel()}.
	 * NOTE(review): no step of this class reads the flag yet - confirm whether the
	 * base class or the caller is expected to poll it.
	 */
	public boolean cancelNow = false;

	/** Files handed over from one step to the next; most steps re-assign it. */
	public def filesToProcess = []

	// not assigned by this class
	public File frontXSLResultDirectory;
	public File tokenizedDirectory;
	public File sentencedDirectory;

	/** Front XSL declared by the project (read in process()). */
	public String xslPath;
	/** XSL parameters declared by the project; the steps add "output-directory" to it. */
	public def xslParams;
	/** Language declared by the project, given to the tokenizer. */
	public String lang;
	/** Name of the word element, given to the milestone projection. */
	public String wordTag;
	/** When false, doTokenizeStep() only copies the files instead of tokenizing them. */
	public boolean doTokenizeStep = false;

	public Metadatas metadata = null; // text metadata

	public XTZImporter(ImportModule module) {
		super(module);
	}

	/**
	 * @return the text metadata loaded by process(), or null if no metadata file was found
	 */
	public Metadatas getMetadata() {
		return metadata;
	}

	/**
	 * Runs the whole import chain.
	 *
	 * Returns early (leaving isSuccessFul untouched) as soon as the source directory has no
	 * suitable file, the metadata file cannot be copied, or a step returns false.
	 * When the project asks for it, the "tokenized", "src" and "split" working
	 * directories are deleted at the end.
	 */
	public void process() {

		Project project = module.getProject();
		File binDir = module.getBinaryDirectory();

		def files = inputDirectory.listFiles();
		if (files == null) {
			reason = "No file to process in "+inputDirectory
			return; // no file to process in the directory
		}

		// at least one file that is not a side file (properties, csv, dtd, xsl...) is needed
		def srcFiles = [];
		for (File f : files) {
			if (f.isHidden() || f.isDirectory()) continue;

			String name = f.getName()
			if (name.endsWith(".properties")) continue;
			if (name.equals("import.xml")) continue;
			if (name.endsWith(".csv")) continue;
			if (name.endsWith(".dtd")) continue;
			if (name.endsWith(".xsl")) continue;
			if (name.endsWith("~")) continue;
			if (name.startsWith(".")) continue;

			srcFiles << f
		}

		if (srcFiles.size() == 0) {
			reason = "No suitable file to process in "+inputDirectory
			return; // no file to process in the directory
		}

		// prepare front XSL if any
		xslPath = project.getFrontXSL()
		xslParams = project.getXsltParameters()

		lang = project.getLang();

		wordTag = project.getTokenizerWordElement()
		this.doTokenizeStep = project.getDoTokenizerStep()

		// prepare metadata if any: the metadata file is copied in the binary directory and read from there
		File allMetadataFile = Metadatas.findMetadataFile(inputDirectory);
		if (allMetadataFile != null && allMetadataFile.exists()) {
			File copy = new File(binDir, allMetadataFile.getName())
			if (!FileCopy.copy(allMetadataFile, copy)) {
				reason = "Could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile()
				println "Error: could not create a copy of the metadata file "+allMetadataFile.getAbsoluteFile();
				return;
			}
			metadata = new Metadatas(copy,
					Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
					Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
					Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
		}

		String cleanDirectories = project.getCleanAfterBuild();

		if (!doFixSurrogates()) return;
		if (!doSplitMergeXSLStep()) return;
		if (!doFrontXSLStep()) return;
		if (!doCheckXMLFileStep()) return;
		if (!doTokenizeStep()) return;
		if (!doPostTokenizeXSLStep()) return;
		if (!doEncodeMileStonesStep()) return;
		if (!doToXMLTXMStep()) return;
		if (!doInjectMetadataStep()) return;

		// listFiles() may have returned null in the last steps
		isSuccessFul = filesToProcess != null && filesToProcess.size() > 0

		if ("true".equals(cleanDirectories)) {
			new File(module.getBinaryDirectory(), "tokenized").deleteDir()
			new File(module.getBinaryDirectory(), "src").deleteDir()
			new File(module.getBinaryDirectory(), "split").deleteDir()
		}
	}

	/**
	 * Reads the source directory and removes the surrogate chars that are not well managed.
	 * The result is saved in $bindir/src: "*.xml" files are cleaned, the other entries
	 * are copied as they are. inputDirectory then points to $bindir/src.
	 *
	 * @return false if the input directory cannot be listed
	 */
	public boolean doFixSurrogates() {
		File srcDirectory = new File(module.getBinaryDirectory(), "src")

		def files = inputDirectory.listFiles()
		if (files == null) {
			reason = "No file to process in $inputDirectory"
			return false;
		}
		srcDirectory.deleteDir() // clean before copying
		srcDirectory.mkdirs() // also creates the binary directory if it is missing

		for (File file : files) {
			def outputFile = new File(srcDirectory, file.getName());
			if (file.getName().toLowerCase().endsWith(".xml")) {
				CleanFile.removeSurrogateFromXmlFile(file, outputFile)
			} else {
				FileCopy.copyFiles(file, outputFile)
			}
		}

		inputDirectory = srcDirectory; // the files to process are now in the "src" directory
		return true;
	}

	/**
	 * Applies the split-merge XSL found in $srcdir/xsl/1-split-merge to the files of
	 * inputDirectory; the "output-directory" XSL parameter is set to $bindir/split.
	 *
	 * Only one XSL is applied: the first "*.xsl" file of the sorted directory listing.
	 * When an XSL was applied, inputDirectory then points to $bindir/split. When there is
	 * no XSL to apply, nothing is changed.
	 *
	 * @return false if the XSL could not be applied
	 */
	public boolean doSplitMergeXSLStep() {

		File splitedDirectory = new File(module.getBinaryDirectory(), "split")
		splitedDirectory.deleteDir()

		File xslDirectory = new File(module.getSourceDirectory(), "xsl/1-split-merge")
		println "-- Split-Merge XSL Step with $xslDirectory"
		def xslFiles = xslDirectory.listFiles()
		if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
			return true; // nothing to do
		}

		xslFiles.sort()
		File xslFile = null
		for (File candidate : xslFiles) {
			if (candidate.isDirectory() || candidate.isHidden() || !candidate.getName().endsWith(".xsl")) continue;
			xslFile = candidate
			break; // THERE IS ONLY ONE XSL FILE TO APPLY
		}
		if (xslFile == null) {
			// no stylesheet in the directory: keep reading from the current input directory
			// instead of switching to an empty "split" directory
			return true;
		}

		splitedDirectory.mkdir()
		xslParams["output-directory"] = splitedDirectory.getAbsoluteFile().toURI().toString();

		if (!ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams)) {
			reason = "Fail to apply split-merge XSL: $xslFile"
			return false; // same policy as the front XSL step
		}
		println ""

		inputDirectory = splitedDirectory; // the files to process are now in the "split" directory
		return true;
	}

	/**
	 * Encodes the milestones declared in the "MileStones" textual plan of the project
	 * (comma separated element names) in the files of $bindir/tokenized. Each file is
	 * rewritten in place through a temporary file created next to it.
	 *
	 * @return false if a milestone could not be encoded or if a file could not be replaced
	 */
	public boolean doEncodeMileStonesStep() {
		filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
		def milestonesString = module.getProject().getTextualPlan("MileStones")
		if (milestonesString == null || milestonesString.length() == 0) return true;

		def milestones = milestonesString.split(",")
		if (milestones.size() == 0) return true;

		println "-- Encoding milestone $milestones into XML files..."
		ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
		for (def milestone : milestones) {
			milestone = milestone.trim() // just in case
			cpb.tick()
			if (milestone.length() == 0) continue; // blank entry such as in "lb, ,pb"

			for (File inputFile : filesToProcess) {
				File outputFile = File.createTempFile("tmp", inputFile.getName(), inputFile.getParentFile());
				println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
				MileStoneProjection msp = new MileStoneProjection(inputFile, "text", wordTag, milestone)
				if (!msp.process(outputFile)) {
					println "Fail to encode $milestone in $inputFile"
					reason = "Fail to encode $milestone in $inputFile"
					outputFile.delete() // don't leave the temporary file among the tokenized files
					return false
				}

				if (!inputFile.delete()) {
					println "Fail to encode $milestone in ${inputFile}: could not replace the file."
					reason = "Fail to encode $milestone in ${inputFile}: could not replace the file."
					outputFile.delete() // the original file is still there, drop the temporary one
					return false
				}

				// no extra copy of the result is written in the directory: every file of
				// "tokenized" is later processed as a text by doToXMLTXMStep()
				if (!outputFile.renameTo(inputFile)) {
					// the temporary file is kept: it is now the only copy of the text
					println "Fail to encode $milestone in ${inputFile}: could not replace the file."
					reason = "Fail to encode $milestone in ${inputFile}: could not replace the file."
					return false
				}
			}
		}
		println ""
		return true;
	}

	/**
	 * Applies, in sorted order, all the "*.xsl" files of $srcdir/xsl/2-front.
	 * The first XSL reads from inputDirectory and writes in outputDirectory; after
	 * each successful XSL, inputDirectory points to outputDirectory.
	 *
	 * @return false as soon as one XSL could not be applied
	 */
	public boolean doFrontXSLStep() {

		File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
		println "-- Front XSL Step with the $frontXSLdirectory directory."
		def xslFiles = frontXSLdirectory.listFiles()

		xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString();

		if (!frontXSLdirectory.exists() || xslFiles == null || xslFiles.size() == 0) {
			return true; // nothing to do
		}

		xslFiles.sort()
		for (File xslFile : xslFiles) {
			if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;

			if (!ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
				reason = "Fail to apply front XSL: $xslFile"
				return false;
			}
			inputDirectory = outputDirectory; // the files to process are now in the output directory
			println ""
		}
		return true;
	}

	/**
	 * Fills filesToProcess with the readable "*.xml" files of inputDirectory accepted by
	 * ValidateXml.test(). Hidden files, directories, files whose name starts with
	 * "metadata" and "import.xml" are never selected.
	 *
	 * @return false if no file was selected
	 */
	public boolean doCheckXMLFileStep() {
		filesToProcess = []
		println "-- Checking XML files for well-formedness."
		def files = inputDirectory.listFiles()

		if (files == null || files.size() == 0) {
			println "No XML file (*.xml) to process. Aborting"
			return false;
		}
		files = files.sort()

		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
		for (File f : files) {
			cpb.tick()
			String name = f.getName()
			if (!name.toLowerCase().endsWith(".xml")) continue;
			if (f.isHidden() || !f.canRead() || f.isDirectory()) continue;
			if (name.startsWith("metadata")) continue;

			// "import.xml" is tested like the other files but is never a text to import
			if (ValidateXml.test(f) && !name.equals("import.xml")) {
				filesToProcess << f
			}
		}
		println ""

		if (filesToProcess.size() == 0) {
			reason = "No file suitable to process in "+inputDirectory
			return false;
		}
		return true;
	}

	/**
	 * Builds an alternation ("a|b|c") from a comma separated list of element names.
	 * The names are trimmed and blank entries are dropped, so the result never
	 * contains an empty alternative.
	 *
	 * @param names the comma separated names, may be null
	 * @return the alternation, "" if there is no name
	 */
	private static String toAlternationRegex(String names) {
		if (names == null) return ""

		def parts = []
		for (String s : names.split(",")) {
			s = s.trim()
			if (s.length() > 0) parts << s
		}
		return parts.join("|")
	}

	/**
	 * Writes in $bindir/tokenized one file per file of filesToProcess: a plain copy
	 * when the doTokenizeStep field is false, the output of SimpleTokenizerXml otherwise.
	 * The "OutSideTextTags", "OutSideTextTagsAndKeepContent" and "Note" textual plans
	 * of the project are given to the tokenizer when they are not empty.
	 *
	 * A file that fails to be tokenized is reported and its output is deleted; the step
	 * goes on with the next file.
	 *
	 * @return always true
	 */
	public boolean doTokenizeStep() {

		File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
		tokenizedDir.mkdir()

		// get the element names to ignore
		String outSideTextTagsRegex = toAlternationRegex(module.getProject().getTextualPlan("OutSideTextTags"))
		String outSideTextTagsAndKeepContentRegex = toAlternationRegex(module.getProject().getTextualPlan("OutSideTextTagsAndKeepContent"))
		String noteRegex = toAlternationRegex(module.getProject().getTextualPlan("Note"))

		if (!doTokenizeStep) {
			println "No tokenization do to."
			for (File f : filesToProcess) {
				File outfile = new File(module.getBinaryDirectory(), "tokenized/"+f.getName());
				FileCopy.copy(f, outfile);
			}
			return true;
		}

		println "-- Tokenizing "+filesToProcess.size()+" files"
		ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
		for (File f : filesToProcess) {
			cpb.tick()
			File outfile = new File(module.getBinaryDirectory(), "tokenized/"+f.getName());
			SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(f, outfile, lang)

			if (outSideTextTagsRegex.length() > 0) {
				tokenizer.setOutSideTextTags(outSideTextTagsRegex)
			}

			if (outSideTextTagsAndKeepContentRegex.length() > 0) {
				tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
			}

			if (noteRegex.length() > 0) {
				tokenizer.setNote(noteRegex)
			}

			// tokenize !
			if (!tokenizer.process()) {
				println("Failed to process "+f)
				outfile.delete()
			}
		}

		filesToProcess = tokenizedDir.listFiles()
		println ""

		return true;
	}

	/**
	 * Applies the XSL files of $srcdir/xsl/3-posttok to the files of $bindir/tokenized.
	 *
	 * @return true if there is no XSL to apply, the result of
	 *         ApplyXsl2.processWithMultipleXSL() otherwise
	 */
	public boolean doPostTokenizeXSLStep() {

		filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
		File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
		def xslFiles = xslDirectory.listFiles()
		if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) return true;

		println "-- Posttokenize XSL Step with $xslDirectory"
		return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams)
	}

	/**
	 * Injects the metadata, if any, in the "text" element of the files of filesToProcess.
	 * Each file is rewritten in place through a temporary file created next to it.
	 * A file for which the injection fails is left untouched.
	 *
	 * @return false if a file was lost while being replaced
	 */
	public boolean doInjectMetadataStep() {
		if (metadata == null || filesToProcess == null) return true; // nothing to inject

		println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
		ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
		for (File infile : filesToProcess) {
			cpb.tick()
			File outfile = File.createTempFile("temp", ".xml", infile.getParentFile());
			if (!metadata.injectMetadatasInXml(infile, outfile, "text", null)) {
				outfile.delete(); // fail
				continue;
			}

			boolean replaced = infile.delete() && outfile.renameTo(infile)
			if (!replaced) println "Warning can't rename file "+outfile+" to "+infile
			if (!infile.exists()) {
				println "Error: could not replace $infile by $outfile"
				return false;
			}
			if (!replaced) {
				// the original could not be deleted: it is kept as it is and the temporary
				// file must not stay among the result files
				outfile.delete()
			}
		}
		cpb.done()
		return true;
	}

	/**
	 * Builds one XML-TXM file in outputDirectory per file of $bindir/tokenized.
	 * A file that fails to be converted is reported and its output is deleted; the step
	 * goes on with the next file. filesToProcess then lists outputDirectory.
	 *
	 * @return false if $bindir/tokenized cannot be listed
	 */
	public boolean doToXMLTXMStep() {
		File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
		filesToProcess = tokenizedDir.listFiles()
		if (filesToProcess == null) {
			reason = "No file to process in "+tokenizedDir
			return false;
		}

		ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
		println("-- Building XML-TXM ("+filesToProcess.size()+" files)")
		for (File f : filesToProcess) {
			cpb.tick()
			String txmfile = f.getName();

			def correspType = new HashMap<String,String>()
			def correspRef = new HashMap<String,String>()
			// the ids of all the respStmt must be listed
			def respId = [];
			// maps a respId to the execution report of the tool
			def applications = new HashMap<String,HashMap<String,String>>();
			// maps a respId to the type attributes of the ana property of the TXM w element,
			// used to build the references to the taxonomies
			def taxonomiesUtilisees = new HashMap<String,String[]>();
			// maps an item id to its description and its URI
			def itemsURI = new HashMap<String,HashMap<String,String>>();
			// respStmt informations
			// resps (respId <see above>, [description, person, date])
			def resps = new HashMap<String,String[]>();

			// run the conversion
			def builder = new Xml2Ana(f);
			builder.setWordTag(module.getProject().getTokenizerWordElement())
			builder.setConvertAllAtrtibutes true;
			builder.setCorrespondances(correspRef, correspType);
			builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)

			File result = new File(outputDirectory, txmfile)
			if (!builder.process(result)) {
				println("Failed to process "+f);
				result.delete();
			}
		}
		filesToProcess = outputDirectory.listFiles();
		cpb.done();
		return true;
	}

	/** Does nothing: the success of the import is computed at the end of process(). */
	@Override
	public void checkFiles() {
		//isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
	}

	/** Raises the cancelNow flag. */
	@Override
	public void cancel() {
		cancelNow = true;
	}
}