Révision 2472
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 2472) | ||
---|---|---|
28 | 28 |
import org.txm.importer.xtz.* |
29 | 29 |
|
30 | 30 |
class XTZImporter extends Importer { |
31 |
|
|
31 |
|
|
32 | 32 |
public boolean cancelNow = false; |
33 | 33 |
public def filesToProcess = [] |
34 |
|
|
34 |
|
|
35 | 35 |
public File frontXSLResultDirectory; |
36 | 36 |
public File tokenizedDirectory; |
37 | 37 |
public File sentencedDirectory; |
38 |
|
|
38 |
|
|
39 | 39 |
public String xslPath; |
40 | 40 |
public def xslParams; |
41 | 41 |
public String lang; |
42 | 42 |
public String wordTag; |
43 | 43 |
public boolean doTokenizeStep = false; |
44 |
|
|
44 |
|
|
45 | 45 |
public Metadatas metadata = null; // text metadata |
46 |
|
|
46 |
|
|
47 | 47 |
public XTZImporter(ImportModule module) { |
48 | 48 |
super(module); |
49 | 49 |
} |
50 |
|
|
50 |
|
|
51 | 51 |
public Metadatas getMetadata() { |
52 | 52 |
return metadata; |
53 | 53 |
} |
54 |
|
|
54 |
|
|
55 | 55 |
public void process() { |
56 |
|
|
56 |
|
|
57 | 57 |
Project project = module.getProject(); |
58 | 58 |
File binDir = module.getBinaryDirectory(); |
59 |
|
|
59 |
|
|
60 | 60 |
String corpusname = project.getName(); |
61 |
|
|
61 |
|
|
62 | 62 |
def srcFiles = []; |
63 | 63 |
def files = inputDirectory.listFiles(); |
64 | 64 |
if (files == null) { |
65 | 65 |
reason = "No file to process in "+inputDirectory |
66 | 66 |
return; // no file to process in the directory |
67 | 67 |
} |
68 |
|
|
68 |
|
|
69 | 69 |
for (File f : files) { |
70 | 70 |
if (f.isHidden() || f.isDirectory()) |
71 |
continue;
|
|
71 |
continue; |
|
72 | 72 |
else if (f.getName().endsWith(".properties")) |
73 |
continue;
|
|
73 |
continue; |
|
74 | 74 |
else if (f.getName().equals("import.xml")) |
75 |
continue;
|
|
75 |
continue; |
|
76 | 76 |
else if (f.getName().endsWith(".csv")) |
77 |
continue;
|
|
77 |
continue; |
|
78 | 78 |
else if (f.getName().endsWith(".dtd")) |
79 |
continue;
|
|
79 |
continue; |
|
80 | 80 |
else if (f.getName().endsWith(".xsl")) |
81 |
continue;
|
|
81 |
continue; |
|
82 | 82 |
else if (f.getName().endsWith("~")) |
83 |
continue;
|
|
83 |
continue; |
|
84 | 84 |
else if (f.getName().startsWith(".")) |
85 |
continue;
|
|
86 |
|
|
85 |
continue; |
|
86 |
|
|
87 | 87 |
srcFiles << f |
88 | 88 |
} |
89 |
|
|
89 |
|
|
90 | 90 |
if (srcFiles.size() == 0) { |
91 | 91 |
reason = "No suitable file to process in "+inputDirectory |
92 | 92 |
return; // no file to process in the directory |
93 | 93 |
} |
94 |
|
|
94 |
|
|
95 | 95 |
// prepare front XSL if any |
96 | 96 |
xslPath = project.getFrontXSL() |
97 | 97 |
xslParams = project.getXsltParameters() |
98 |
|
|
98 |
|
|
99 | 99 |
lang = project.getLang(); |
100 |
|
|
100 |
|
|
101 | 101 |
wordTag = project.getTokenizerWordElement() |
102 | 102 |
this.doTokenizeStep = project.getDoTokenizerStep() |
103 |
|
|
103 |
|
|
104 | 104 |
//prepare metadata if any |
105 | 105 |
File allMetadataFile = Metadatas.findMetadataFile(inputDirectory); |
106 | 106 |
if (allMetadataFile.exists()) { |
... | ... | |
110 | 110 |
return; |
111 | 111 |
} |
112 | 112 |
metadata = new Metadatas(copy, |
113 |
Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
|
|
114 |
Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
|
|
115 |
Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
|
|
113 |
Toolbox.getPreference(TBXPreferences.METADATA_ENCODING), |
|
114 |
Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR), |
|
115 |
Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1) |
|
116 | 116 |
} |
117 |
|
|
117 |
|
|
118 | 118 |
// remove previous Text if any |
119 | 119 |
project.deleteChildren(Text.class); |
120 | 120 |
|
121 | 121 |
|
122 | 122 |
// main workflow of XTZ importer step |
123 | 123 |
|
124 |
|
|
125 | 124 |
if (!doFixSurrogates()) return; |
126 | 125 |
if (!doSplitMergeXSLStep()) return; |
127 | 126 |
if (!doFrontXSLStep()) return; |
... | ... | |
133 | 132 |
if (!doInjectMetadataStep()) return; |
134 | 133 |
|
135 | 134 |
isSuccessFul = filesToProcess.size() > 0 |
136 |
|
|
135 |
|
|
137 | 136 |
String cleanDirectories = project.getCleanAfterBuild(); |
138 | 137 |
if ("true".equals(cleanDirectories)) { |
139 | 138 |
new File(module.getBinaryDirectory(), "tokenized").deleteDir() |
... | ... | |
141 | 140 |
new File(module.getBinaryDirectory(), "split").deleteDir() |
142 | 141 |
} |
143 | 142 |
} |
144 |
|
|
143 |
|
|
145 | 144 |
/** |
146 | 145 |
* read from source directory and remove the surrogate chars not well managed |
147 | 146 |
* save the result in $bindir/src |
... | ... | |
168 | 167 |
FileCopy.copyFiles(file, outputFile) |
169 | 168 |
} |
170 | 169 |
} |
171 |
|
|
170 |
|
|
172 | 171 |
inputDirectory = srcDirectory; // the files to process are now in the "src" directory |
173 | 172 |
return true; |
174 | 173 |
} |
175 |
|
|
174 |
|
|
176 | 175 |
/** |
177 | 176 |
* read from source directory and write the result in $bindir/txm |
178 | 177 |
* |
... | ... | |
196 | 195 |
for (File xslFile : xslFiles) { |
197 | 196 |
if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue; |
198 | 197 |
//if (!xslFile.getName().matches("[1-9]-.+")) continue; |
199 |
|
|
198 |
|
|
200 | 199 |
if (ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams)) { |
201 | 200 |
println "" |
202 | 201 |
} else { |
... | ... | |
210 | 209 |
} |
211 | 210 |
return true; |
212 | 211 |
} |
213 |
|
|
212 |
|
|
214 | 213 |
public boolean doEncodeMileStonesStep() { |
215 | 214 |
filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles() |
216 | 215 |
def milestonesString = module.getProject().getTextualPlan("MileStones") |
217 | 216 |
if (milestonesString.length() == 0) return true; |
218 |
|
|
217 |
|
|
219 | 218 |
def milestones = milestonesString.split(",") |
220 | 219 |
//def milestones = ["lb", "pb", "cb"] |
221 | 220 |
//println "milestones=$milestones type="+milestones.getClass()+" size="+milestones.size() |
222 | 221 |
if (milestones.size() == 0) return true; |
223 |
|
|
222 |
|
|
224 | 223 |
println "-- Encoding milestone $milestones into XML files..." |
225 | 224 |
ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size()) |
226 | 225 |
for (def milestone : milestones) { |
227 | 226 |
milestone = milestone.trim()// just in case |
228 | 227 |
cpb.tick() |
229 |
|
|
228 |
|
|
230 | 229 |
for (File inputFile : filesToProcess) { |
231 | 230 |
File outputFile = File.createTempFile("tmp", inputFile.getName(),inputFile.getParentFile()); |
232 | 231 |
println "$inputFile, 'text', $wordTag, $milestone, $outputFile" |
... | ... | |
248 | 247 |
println "" |
249 | 248 |
return true; |
250 | 249 |
} |
251 |
|
|
250 |
|
|
252 | 251 |
/** |
253 | 252 |
* read from $inputDirectory and write the result in $bindir/txm |
254 | 253 |
* |
255 | 254 |
*/ |
256 | 255 |
public boolean doFrontXSLStep() { |
257 |
|
|
256 |
|
|
258 | 257 |
//filesToProcess = inputDirectory.listFiles(); |
259 |
|
|
258 |
|
|
260 | 259 |
File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front") |
261 | 260 |
println "-- Front XSL Step with the $frontXSLdirectory directory." |
262 | 261 |
def xslFiles = frontXSLdirectory.listFiles() |
... | ... | |
264 | 263 |
xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString(); |
265 | 264 |
|
266 | 265 |
if (frontXSLdirectory.exists() && xslFiles != null && xslFiles.size() > 0) { |
267 |
|
|
266 |
|
|
268 | 267 |
xslFiles.sort() |
269 | 268 |
for (File xslFile : xslFiles) { |
270 | 269 |
if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue; |
271 | 270 |
//if (!xslFile.getName().matches("[1-9]-.+")) continue; |
272 |
|
|
271 |
|
|
273 | 272 |
if (ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) { |
274 | 273 |
inputDirectory = outputDirectory; // the files to process are now in the "txm" directory |
275 | 274 |
println "" |
... | ... | |
283 | 282 |
} |
284 | 283 |
return true; |
285 | 284 |
} |
286 |
|
|
285 |
|
|
287 | 286 |
public boolean doCheckXMLFileStep() { |
288 | 287 |
filesToProcess = [] |
289 |
println "-- Checking XML files for well-formedness." |
|
290 |
def files = inputDirectory.listFiles() |
|
291 |
|
|
288 |
println "-- Checking XML-TEI files for well-formedness." |
|
289 |
def files = inputDirectory.listFiles(new FileFilter() { |
|
290 |
public boolean accept(File f) { |
|
291 |
return (f.getName().toLowerCase().endsWith(".xml") && !f.isDirectory() && !f.isHidden() |
|
292 |
&& f.canRead() |
|
293 |
&& !f.getName().startsWith("metadata") |
|
294 |
&& !f.getName().startsWith("import")); |
|
295 |
} |
|
296 |
}); |
|
297 |
|
|
292 | 298 |
if (files == null || files.size() == 0) { |
293 | 299 |
println "No XML file (*.xml) to process. Aborting" |
294 | 300 |
return false; |
295 | 301 |
} |
296 | 302 |
files = files.sort() |
297 |
|
|
303 |
|
|
298 | 304 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size()) |
299 | 305 |
for (File f : files) { |
300 | 306 |
cpb.tick() |
301 |
if (f.getName().toLowerCase().endsWith(".xml")) |
|
302 |
if (!f.isHidden() && f.canRead() && !f.getName().endsWith(".properties") |
|
303 |
&& !f.getName().startsWith("metadata") && !f.isDirectory()) |
|
304 |
{ |
|
305 |
|
|
306 |
if (!ValidateXml.test(f)) { |
|
307 |
continue; |
|
308 |
} else { |
|
309 |
if (!f.getName().equals("import.xml")) |
|
310 |
filesToProcess << f |
|
311 |
} |
|
307 |
|
|
308 |
if (!ValidateXml.teiTest(f)) { |
|
309 |
println "Won't process XML file: "+f |
|
310 |
continue; |
|
311 |
} else { |
|
312 |
filesToProcess << f |
|
312 | 313 |
} |
313 | 314 |
} |
314 | 315 |
println "" |
315 | 316 |
if (filesToProcess.size() == 0) { |
316 |
reason = "No file suitable to process in "+inputDirectory
|
|
317 |
reason = "No suitable file to process in "+inputDirectory
|
|
317 | 318 |
return false; |
318 | 319 |
} else { |
319 | 320 |
return true; |
320 | 321 |
} |
321 | 322 |
} |
322 |
|
|
323 |
|
|
323 |
|
|
324 | 324 |
public boolean doTokenizeStep() { |
325 |
|
|
325 |
|
|
326 | 326 |
new File(module.getBinaryDirectory(),"tokenized").deleteDir() |
327 | 327 |
new File(module.getBinaryDirectory(),"tokenized").mkdir() |
328 |
|
|
328 |
|
|
329 | 329 |
String outSideTextTagsRegex = ""; |
330 | 330 |
String outSideTextTagsAndKeepContentRegex = ""; |
331 | 331 |
String noteRegex = ""; |
332 | 332 |
// get the element names to ignore |
333 |
|
|
333 |
|
|
334 | 334 |
String e1 = module.getProject().getTextualPlan("OutSideTextTags") |
335 | 335 |
def split = e1.split(",") |
336 | 336 |
for (String s : split) { |
... | ... | |
339 | 339 |
if (outSideTextTagsRegex.trim().length() > 0) { |
340 | 340 |
outSideTextTagsRegex = outSideTextTagsRegex.substring(1) // remove the first "|" |
341 | 341 |
} |
342 |
|
|
342 |
|
|
343 | 343 |
String e2 = module.getProject().getTextualPlan("OutSideTextTagsAndKeepContent") |
344 | 344 |
def split2 = e2.split(",") |
345 | 345 |
for (String s : split2) { |
... | ... | |
348 | 348 |
if (outSideTextTagsAndKeepContentRegex.trim().length() > 0) { |
349 | 349 |
outSideTextTagsAndKeepContentRegex = outSideTextTagsAndKeepContentRegex.substring(1) // remove the first "|" |
350 | 350 |
} |
351 |
|
|
351 |
|
|
352 | 352 |
String e3 = module.getProject().getTextualPlan("Note") |
353 | 353 |
def split3 = e3.split(",") |
354 | 354 |
for (String s : split3) { |
... | ... | |
357 | 357 |
if (noteRegex.trim().length() > 0) { |
358 | 358 |
noteRegex = noteRegex.substring(1) // remove the first "|" |
359 | 359 |
} |
360 |
|
|
360 |
|
|
361 | 361 |
//if (wordTag != "w") { |
362 | 362 |
if (!doTokenizeStep) { |
363 | 363 |
println "No tokenization do to." |
... | ... | |
375 | 375 |
File infile = f; |
376 | 376 |
File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName()); |
377 | 377 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang) |
378 |
|
|
378 |
|
|
379 | 379 |
if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) { |
380 | 380 |
tokenizer.setOutSideTextTags(outSideTextTagsRegex) |
381 | 381 |
} |
382 |
|
|
382 |
|
|
383 | 383 |
if (outSideTextTagsAndKeepContentRegex != null && outSideTextTagsAndKeepContentRegex.trim().length() > 0) { |
384 | 384 |
tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex) |
385 | 385 |
} |
386 |
|
|
386 |
|
|
387 | 387 |
if (noteRegex != null && noteRegex.trim().length() > 0) { |
388 | 388 |
tokenizer.setNote(noteRegex) |
389 | 389 |
} |
390 |
|
|
390 |
|
|
391 | 391 |
// tokenize ! |
392 | 392 |
if (!tokenizer.process()) { |
393 | 393 |
println("Failed to process "+f) |
... | ... | |
395 | 395 |
} |
396 | 396 |
} |
397 | 397 |
} |
398 |
|
|
398 |
|
|
399 | 399 |
filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles() |
400 | 400 |
println "" |
401 |
|
|
401 |
|
|
402 | 402 |
return true; |
403 | 403 |
} |
404 |
|
|
404 |
|
|
405 | 405 |
/** |
406 | 406 |
* read from $bindir/tokenized and write the result in $bindir/tokenized |
407 | 407 |
* |
408 | 408 |
*/ |
409 | 409 |
public boolean doPostTokenizeXSLStep() { |
410 |
|
|
410 |
|
|
411 | 411 |
filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles() |
412 | 412 |
File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok") |
413 | 413 |
def xslFiles = xslDirectory.listFiles() |
414 | 414 |
if (!xslDirectory.exists() || xslFiles == null || xslFiles.size() == 0) return true; |
415 |
|
|
415 |
|
|
416 | 416 |
println "-- Posttokenize XSL Step with $xslDirectory" |
417 | 417 |
return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams) |
418 | 418 |
} |
419 |
|
|
419 |
|
|
420 | 420 |
public boolean doInjectMetadataStep() { |
421 | 421 |
if (metadata != null) { |
422 | 422 |
println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory) |
... | ... | |
438 | 438 |
} |
439 | 439 |
return true; |
440 | 440 |
} |
441 |
|
|
441 |
|
|
442 | 442 |
public boolean doToXMLTXMStep() { |
443 | 443 |
filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles() |
444 | 444 |
ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size()) |
... | ... | |
448 | 448 |
cpb.tick() |
449 | 449 |
File file = f; |
450 | 450 |
String txmfile = f.getName(); |
451 |
|
|
451 |
|
|
452 | 452 |
def correspType = new HashMap<String,String>() |
453 | 453 |
def correspRef = new HashMap<String,String>() |
454 | 454 |
//il faut lister les id de tous les respStmt |
... | ... | |
479 | 479 |
cpb.done(); |
480 | 480 |
return true; |
481 | 481 |
} |
482 |
|
|
482 |
|
|
483 | 483 |
@Override |
484 | 484 |
public void checkFiles() { |
485 | 485 |
//isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null; |
486 | 486 |
} |
487 |
|
|
487 |
|
|
488 | 488 |
@Override |
489 | 489 |
public void cancel() { |
490 | 490 |
cancelNow = true; |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImport.groovy (revision 2472) | ||
---|---|---|
40 | 40 |
protected ArrayList<String> getTXMFilesOrder() { |
41 | 41 |
//System.out.println("XTZ FILES ORDER"); |
42 | 42 |
if (importer == null) { |
43 |
println "no importer step, using default text order" |
|
43 |
//println "no importer step, using default text order"
|
|
44 | 44 |
return super.getTXMFilesOrder(); |
45 | 45 |
} |
46 | 46 |
Metadatas metadata = importer.getMetadata(); |
... | ... | |
55 | 55 |
} |
56 | 56 |
|
57 | 57 |
if (metadata == null) { |
58 |
println "no metadata, using default text order" |
|
58 |
//println "no metadata, using default text order"
|
|
59 | 59 |
return super.getTXMFilesOrder(); |
60 | 60 |
} |
61 | 61 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/CropTranscription.groovy (revision 2472) | ||
---|---|---|
1 |
// Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
2 |
// |
|
3 |
// This file is part of the TXM platform. |
|
4 |
// |
|
5 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
6 |
// it under the terms of the GNU General Public License as published by |
|
7 |
// the Free Software Foundation, either version 3 of the License, or |
|
8 |
// (at your option) any later version. |
|
9 |
// |
|
10 |
// The TXM platform is distributed in the hope that it will be useful, |
|
11 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
// GNU General Public License for more details. |
|
14 |
// |
|
15 |
// You should have received a copy of the GNU General Public License |
|
16 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
17 |
// |
|
18 |
// $LastChangedDate: 2011-11-01 16:12:36 +0100 (mar., 01 nov. 2011) $ |
|
19 |
// $LastChangedRevision: 2049 $ |
|
20 |
// $LastChangedBy: sheiden $ |
|
21 |
// |
|
22 |
|
|
23 |
package org.txm.scripts.importer.transcriber |
|
24 |
|
|
25 |
import java.text.DecimalFormat; |
|
26 |
// parameters |
|
27 |
|
|
28 |
String userdir = System.getProperty("user.home") |
|
29 |
File infile = new File(userdir, "xml/minitranscriber/int01.trs") |
|
30 |
File outfile = new File(userdir, "xml/minitranscriber/int01-cropped.trs") |
|
31 |
|
|
32 |
float start = 10.0f; |
|
33 |
float end = 500.0f; |
|
34 |
formater = DecimalFormat.getInstance(Locale.ENGLISH) |
|
35 |
|
|
36 |
/**
 * Placeholder for cropping a Transcriber (.trs) file to a time window.
 *
 * The cropping itself is not implemented yet: the input file is parsed and
 * re-serialized to outfile without removing anything, preceded by an XML
 * declaration and the "trans-14.dtd" DOCTYPE line. The three "TO BE DONE"
 * messages are printed on every call as a reminder.
 *
 * @param infile the .trs file to read
 * @param outfile the file to write, encoded in UTF-8
 * @param start lower bound of the window to keep (currently unused)
 * @param end upper bound of the window to keep (currently unused)
 */
public def cropTRS(File infile, File outfile, float start, end) {
	// Open input file; DOCTYPE declarations in the input must be accepted.
	// NOTE(review): this also lets the parser process DTDs of untrusted files - confirm inputs are trusted.
	def parser = new XmlParser();
	parser.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
	def trs = parser.parse(infile.toURI().toString())

	println "TO BE DONE: remove elements before 'start' and after 'end'"
	println "REMOVE TEXT NODES BEFORE removed Sync"
	println "REMOVE TEXT NODES AFTER removed Sync"

	// TODO: walk trs.Episode.Section -> Turn -> Sync and drop what lies outside the window.

	outfile.withWriter("UTF-8"){ writer ->
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		new groovy.util.XmlNodePrinter(new PrintWriter(writer)).print(trs)
	}
}
|
67 |
|
|
68 |
/// MAIN /// |
|
69 |
// Crop into a temporary file next to the input, then shift the timings of
// that intermediate result into outfile.
File tmpfile = new File(infile.getAbsolutePath()+".tmp")
try {
	cropTRS(infile, tmpfile, start, end)
	// The script compiled from ShiftTimings.groovy is the class ShiftTimings.
	// NOTE(review): shiftTRS runs on a fresh script instance whose body has not
	// been executed - confirm it does not rely on script-level variables.
	// NOTE(review): shifting by +start looks suspicious; re-basing the cropped
	// window at 0 would need -start - confirm the intended sign.
	new ShiftTimings().shiftTRS(tmpfile, outfile, start)
} finally {
	// always remove the intermediate file, even when a step fails
	tmpfile.delete()
}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/ShiftTimings.groovy (revision 2472) | ||
---|---|---|
1 |
// Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
2 |
// |
|
3 |
// This file is part of the TXM platform. |
|
4 |
// |
|
5 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
6 |
// it under the terms of the GNU General Public License as published by |
|
7 |
// the Free Software Foundation, either version 3 of the License, or |
|
8 |
// (at your option) any later version. |
|
9 |
// |
|
10 |
// The TXM platform is distributed in the hope that it will be useful, |
|
11 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13 |
// GNU General Public License for more details. |
|
14 |
// |
|
15 |
// You should have received a copy of the GNU General Public License |
|
16 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
17 |
// |
|
18 |
// $LastChangedDate: 2011-11-01 16:12:36 +0100 (mar., 01 nov. 2011) $ |
|
19 |
// $LastChangedRevision: 2049 $ |
|
20 |
// $LastChangedBy: sheiden $ |
|
21 |
// |
|
22 |
|
|
23 |
package org.txm.scripts.importer.transcriber |
|
24 |
import java.text.DecimalFormat; |
|
25 |
// parameters |
|
26 |
|
|
27 |
String userdir = System.getProperty("user.home")
File infile = new File(userdir, "xml/minitranscriber/int01.trs")
File outfile = new File(userdir, "xml/minitranscriber/int01-shifted.trs")
float shift = -0.4;
// Assigned without a declaration so that the formatter is reachable from the
// methods of this script.
formater = DecimalFormat.getInstance(Locale.ENGLISH)
// Without this, times of 1000 s and more are written with a thousands
// separator ("1,234.5"), which is not a parsable time value.
formater.setGroupingUsed(false)
|
32 |
|
|
33 |
/**
 * Shifts the time attributes of one element by a fixed offset.
 *
 * Each attribute named in timeProperties is parsed as a float, offset by
 * shift, clamped at 0 when the result is negative, and written back.
 * Attributes whose value is exactly 0 are left untouched.
 *
 * Attributes are processed in order: when one of them fails, the ones before
 * it have already been rewritten.
 *
 * @param element a node exposing a mutable attributes() map
 * @param timeProperties names of the attributes to shift
 * @param shift offset added to each time; may be negative
 * @return true when every listed attribute was shifted or skipped because it
 *         was 0; false when an attribute is missing or is not a number
 */
def shiftTime(element, timeProperties, float shift)
{
	// Built locally so the method does not depend on a script-level variable
	// being set. Grouping is disabled because the default English format
	// writes 1234.5 as "1,234.5".
	def timeFormat = DecimalFormat.getInstance(Locale.ENGLISH)
	timeFormat.setGroupingUsed(false)

	for (def p : timeProperties) {

		def timeS = element.attributes()[p]
		if (timeS == null) {
			return false // attribute missing: report it instead of failing on parseFloat(null)
		}

		def time
		try {
			time = Float.parseFloat(timeS.toString())
		} catch (NumberFormatException e) {
			return false // not a number: let the caller report the element
		}

		if (time == 0.0f) {
			continue;
		}

		def newTime = time + shift
		if (newTime < 0) newTime = 0.0f;

		element.attributes()[p] = timeFormat.format(newTime);
	}

	return true
}
|
53 |
|
|
54 |
/**
 * Shifts the times of a Transcriber (.trs) file and writes the result.
 *
 * The startTime/endTime attributes of every Episode/Section and of every Turn
 * in it, and the time attribute of every Sync in those Turns, are passed to
 * shiftTime. The resulting tree is written to outfile, preceded by an XML
 * declaration and the "trans-14.dtd" DOCTYPE line.
 *
 * @param infile the .trs file to read
 * @param outfile the file to write, encoded in UTF-8
 * @param shift offset applied to every time; may be negative
 * @return true when outfile was written; false as soon as one element could
 *         not be shifted, in which case nothing is written
 */
public def shiftTRS(File infile, File outfile, float shift) {
	// Open input file; DOCTYPE declarations in the input must be accepted.
	// NOTE(review): this also lets the parser process DTDs of untrusted files - confirm inputs are trusted.
	def parser = new XmlParser();
	parser.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
	def trs = parser.parse(infile.toURI().toString())

	// Plain for loops are used at every level: a "return" inside an each{}
	// closure would only leave the closure and the failure would be ignored.
	for (def section : trs.Episode.Section) {

		if (!shiftTime(section, ["startTime", "endTime"], shift)) {
			println "Shift Section error: "+section.attributes()
			return false;
		}

		for (def turn : section.Turn) {

			if (!shiftTime(turn, ["startTime", "endTime"], shift)) {
				println "Shift Turn error: "+turn.attributes()
				return false;
			}

			for (def sync : turn.Sync) {
				if (!shiftTime(sync, ["time"], shift)) {
					println "Shift Sync error: "+sync.attributes()
					return false;
				}
			}
		}
	}

	outfile.withWriter("UTF-8"){ writer ->
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		new groovy.util.XmlNodePrinter(new PrintWriter(writer)).print(trs)
	}
	return true
}
|
94 |
|
|
95 |
/// MAIN /// |
|
96 |
shiftTRS(infile, outfile, shift); |
Formats disponibles : Unified diff