Révision 2472

tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 2472)
28 28
import org.txm.importer.xtz.*
29 29

  
30 30
class XTZImporter extends Importer {
31

  
31
	
32 32
	public boolean cancelNow = false;
33 33
	public def filesToProcess = []
34

  
34
	
35 35
	public File frontXSLResultDirectory;
36 36
	public File tokenizedDirectory;
37 37
	public File sentencedDirectory;
38

  
38
	
39 39
	public String xslPath;
40 40
	public def xslParams;
41 41
	public String lang;
42 42
	public String wordTag;
43 43
	public boolean doTokenizeStep = false;
44

  
44
	
45 45
	public Metadatas metadata = null; // text metadata
46

  
46
	
47 47
	/**
	 * Creates the importer step for the given import module.
	 *
	 * @param module the import module, handed unchanged to the Importer super constructor
	 */
	public XTZImporter(ImportModule module) {
		super(module)
	}
50

  
50
	
51 51
	/**
	 * Gives access to the text metadata held by this importer.
	 *
	 * @return the current value of the metadata field; null until process() has assigned it
	 */
	public Metadatas getMetadata() {
		return this.metadata
	}
54

  
54
	
55 55
	public void process() {
56

  
56
		
57 57
		Project project = module.getProject();
58 58
		File binDir = module.getBinaryDirectory();
59

  
59
		
60 60
		String corpusname = project.getName();
61

  
61
		
62 62
		def srcFiles = [];
63 63
		def files = inputDirectory.listFiles();
64 64
		if (files == null) {
65 65
			reason = "No file to process in "+inputDirectory
66 66
			return; // no file to process in the directory
67 67
		}
68

  
68
		
69 69
		for (File f : files) {
70 70
			if (f.isHidden() || f.isDirectory())
71
				continue;
71
			continue;
72 72
			else if (f.getName().endsWith(".properties"))
73
				continue;
73
			continue;
74 74
			else if (f.getName().equals("import.xml"))
75
				continue;
75
			continue;
76 76
			else if (f.getName().endsWith(".csv"))
77
				continue;
77
			continue;
78 78
			else if (f.getName().endsWith(".dtd"))
79
				continue;
79
			continue;
80 80
			else if (f.getName().endsWith(".xsl"))
81
				continue;
81
			continue;
82 82
			else if (f.getName().endsWith("~"))
83
				continue;
83
			continue;
84 84
			else if (f.getName().startsWith("."))
85
				continue;
86

  
85
			continue;
86
			
87 87
			srcFiles << f
88 88
		}
89

  
89
		
90 90
		if (srcFiles.size() == 0) {
91 91
			reason = "No suitable file to process in "+inputDirectory
92 92
			return; // no file to process in the directory
93 93
		}
94

  
94
		
95 95
		// prepare front XSL if any
96 96
		xslPath = project.getFrontXSL()
97 97
		xslParams = project.getXsltParameters()
98

  
98
		
99 99
		lang = project.getLang();
100

  
100
		
101 101
		wordTag = project.getTokenizerWordElement()
102 102
		this.doTokenizeStep = project.getDoTokenizerStep()
103

  
103
		
104 104
		//prepare metadata if any
105 105
		File allMetadataFile = Metadatas.findMetadataFile(inputDirectory);
106 106
		if (allMetadataFile.exists()) {
......
110 110
				return;
111 111
			}
112 112
			metadata = new Metadatas(copy,
113
					Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
114
					Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
115
					Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
113
			Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
114
			Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
115
			Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
116 116
		}
117

  
117
		
118 118
		// remove previous Text if any
119 119
		project.deleteChildren(Text.class);
120 120
		
121 121
		
122 122
		// main workflow of XTZ importer step
123 123
		
124

  
125 124
		if (!doFixSurrogates()) return;
126 125
		if (!doSplitMergeXSLStep()) return;
127 126
		if (!doFrontXSLStep()) return;
......
133 132
		if (!doInjectMetadataStep()) return;
134 133
		
135 134
		isSuccessFul = filesToProcess.size() > 0
136

  
135
		
137 136
		String cleanDirectories = project.getCleanAfterBuild();
138 137
		if ("true".equals(cleanDirectories)) {
139 138
			new File(module.getBinaryDirectory(), "tokenized").deleteDir()
......
141 140
			new File(module.getBinaryDirectory(), "split").deleteDir()
142 141
		}
143 142
	}
144

  
143
	
145 144
	/**
146 145
	 * read from source directory and remove the surrogate chars not well managed
147 146
	 * save the result in $bindir/src
......
168 167
				FileCopy.copyFiles(file, outputFile)
169 168
			}
170 169
		}
171

  
170
		
172 171
		inputDirectory = srcDirectory; // the files to process are now in the "src" directory
173 172
		return true;
174 173
	}
175

  
174
	
176 175
	/**
177 176
	 * read from source directory and write the result in $bindir/txm
178 177
	 * 
......
196 195
			for (File xslFile : xslFiles) {
197 196
				if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
198 197
				//if (!xslFile.getName().matches("[1-9]-.+")) continue;
199

  
198
				
200 199
				if (ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams)) {
201 200
					println ""
202 201
				} else {
......
210 209
		}
211 210
		return true;
212 211
	}
213

  
212
	
214 213
	public boolean doEncodeMileStonesStep() {
215 214
		filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
216 215
		def milestonesString = module.getProject().getTextualPlan("MileStones")
217 216
		if (milestonesString.length() == 0) return true;
218

  
217
		
219 218
		def milestones = milestonesString.split(",")
220 219
		//def milestones = ["lb", "pb", "cb"]
221 220
		//println "milestones=$milestones type="+milestones.getClass()+" size="+milestones.size()
222 221
		if (milestones.size() == 0) return true;
223

  
222
		
224 223
		println "-- Encoding milestone $milestones into XML files..."
225 224
		ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
226 225
		for (def milestone : milestones) {
227 226
			milestone = milestone.trim()// just in case
228 227
			cpb.tick()
229

  
228
			
230 229
			for (File inputFile : filesToProcess) {
231 230
				File outputFile = File.createTempFile("tmp", inputFile.getName(),inputFile.getParentFile());
232 231
				println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
......
248 247
		println ""
249 248
		return true;
250 249
	}
251

  
250
	
252 251
	/**
253 252
	 * read from $inputDirectory and write the result in $bindir/txm 
254 253
	 *
255 254
	 */
256 255
	public boolean doFrontXSLStep() {
257

  
256
		
258 257
		//filesToProcess = inputDirectory.listFiles();
259

  
258
		
260 259
		File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
261 260
		println "-- Front XSL Step with the $frontXSLdirectory directory."
262 261
		def xslFiles = frontXSLdirectory.listFiles()
......
264 263
		xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString();
265 264
		
266 265
		if (frontXSLdirectory.exists() && xslFiles != null && xslFiles.size() > 0) {
267

  
266
			
268 267
			xslFiles.sort()
269 268
			for (File xslFile : xslFiles) {
270 269
				if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
271 270
				//if (!xslFile.getName().matches("[1-9]-.+")) continue;
272

  
271
				
273 272
				if (ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
274 273
					inputDirectory = outputDirectory; // the files to process are now in the "txm" directory
275 274
					println ""
......
283 282
		}
284 283
		return true;
285 284
	}
286

  
285
	
287 286
	public boolean doCheckXMLFileStep() {
288 287
		filesToProcess = []
289
		println "-- Checking XML files for well-formedness."
290
		def files = inputDirectory.listFiles()
291

  
288
		println "-- Checking XML-TEI files for well-formedness."
289
		def files = inputDirectory.listFiles(new FileFilter() {
290
			public boolean accept(File f) {
291
				return (f.getName().toLowerCase().endsWith(".xml") && !f.isDirectory() && !f.isHidden()
292
				&& f.canRead()
293
				&& !f.getName().startsWith("metadata")
294
				&& !f.getName().startsWith("import"));
295
			}
296
		});
297
		
292 298
		if (files == null || files.size() == 0) {
293 299
			println "No XML file (*.xml) to process. Aborting"
294 300
			return false;
295 301
		}
296 302
		files = files.sort()
297

  
303
		
298 304
		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
299 305
		for (File f : files) {
300 306
			cpb.tick()
301
			if (f.getName().toLowerCase().endsWith(".xml"))
302
				if (!f.isHidden() && f.canRead() && !f.getName().endsWith(".properties")
303
				&& !f.getName().startsWith("metadata") && !f.isDirectory())
304
			{
305

  
306
				if (!ValidateXml.test(f)) {
307
					continue;
308
				} else {
309
					if (!f.getName().equals("import.xml"))
310
						filesToProcess << f
311
				}
307
			
308
			if (!ValidateXml.teiTest(f)) {
309
				println "Won't process XML file: "+f
310
				continue;
311
			} else {
312
				filesToProcess << f
312 313
			}
313 314
		}
314 315
		println ""
315 316
		if (filesToProcess.size() == 0) {
316
			reason = "No file suitable to process in "+inputDirectory
317
			reason = "No suitable file to process in "+inputDirectory
317 318
			return false;
318 319
		} else {
319 320
			return true;
320 321
		}
321 322
	}
322

  
323

  
323
	
324 324
	public boolean doTokenizeStep() {
325

  
325
		
326 326
		new File(module.getBinaryDirectory(),"tokenized").deleteDir()
327 327
		new File(module.getBinaryDirectory(),"tokenized").mkdir()
328

  
328
		
329 329
		String outSideTextTagsRegex = "";
330 330
		String outSideTextTagsAndKeepContentRegex = "";
331 331
		String noteRegex = "";
332 332
		// get the element names to ignore
333

  
333
		
334 334
		String e1 = module.getProject().getTextualPlan("OutSideTextTags")
335 335
		def split  = e1.split(",")
336 336
		for (String s : split) {
......
339 339
		if (outSideTextTagsRegex.trim().length() > 0) {
340 340
			outSideTextTagsRegex = outSideTextTagsRegex.substring(1) // remove the first "|"
341 341
		}
342

  
342
		
343 343
		String e2 = module.getProject().getTextualPlan("OutSideTextTagsAndKeepContent")
344 344
		def split2  = e2.split(",")
345 345
		for (String s : split2) {
......
348 348
		if (outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
349 349
			outSideTextTagsAndKeepContentRegex = outSideTextTagsAndKeepContentRegex.substring(1) // remove the first "|"
350 350
		}
351

  
351
		
352 352
		String e3 = module.getProject().getTextualPlan("Note")
353 353
		def split3  = e3.split(",")
354 354
		for (String s : split3) {
......
357 357
		if (noteRegex.trim().length() > 0) {
358 358
			noteRegex = noteRegex.substring(1) // remove the first "|"
359 359
		}
360

  
360
		
361 361
		//if (wordTag != "w") {
362 362
		if (!doTokenizeStep) {
363 363
			println "No tokenization do to."
......
375 375
				File infile = f;
376 376
				File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
377 377
				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
378

  
378
				
379 379
				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
380 380
					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
381 381
				}
382

  
382
				
383 383
				if (outSideTextTagsAndKeepContentRegex != null && outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
384 384
					tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
385 385
				}
386

  
386
				
387 387
				if (noteRegex != null && noteRegex.trim().length() > 0) {
388 388
					tokenizer.setNote(noteRegex)
389 389
				}
390

  
390
				
391 391
				// tokenize !
392 392
				if (!tokenizer.process()) {
393 393
					println("Failed to process "+f)
......
395 395
				}
396 396
			}
397 397
		}
398

  
398
		
399 399
		filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
400 400
		println ""
401

  
401
		
402 402
		return true;
403 403
	}
404

  
404
	
405 405
	/**
406 406
	 * read from $bindir/tokenized and write the result in $bindir/tokenized
407 407
	 *
408 408
	 */
409 409
	public boolean doPostTokenizeXSLStep() {
		// the files to transform are the ones currently present in $bindir/tokenized
		File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
		filesToProcess = tokenizedDir.listFiles()

		// the stylesheets of this step are looked up in <source directory>/xsl/3-posttok
		File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
		def stylesheets = xslDirectory.listFiles()
		boolean hasStylesheets = xslDirectory.exists() && stylesheets != null && stylesheets.size() > 0
		if (!hasStylesheets) {
			// nothing to apply: the step is considered successful
			return true
		}

		println "-- Posttokenize XSL Step with $xslDirectory"
		return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams)
	}
419

  
419
	
420 420
	public boolean doInjectMetadataStep() {
421 421
		if (metadata != null) {
422 422
			println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
......
438 438
		}
439 439
		return true;
440 440
	}
441

  
441
	
442 442
	public boolean doToXMLTXMStep() {
443 443
		filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
444 444
		ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
......
448 448
			cpb.tick()
449 449
			File file = f;
450 450
			String txmfile = f.getName();
451

  
451
			
452 452
			def correspType = new HashMap<String,String>()
453 453
			def correspRef = new HashMap<String,String>()
454 454
			//il faut lister les id de tous les respStmt
......
479 479
		cpb.done();
480 480
		return true;
481 481
	}
482

  
482
	
483 483
	/**
	 * Performs no verification: the body is empty.
	 * The former check on the output directory is kept below, disabled.
	 */
	@Override
	public void checkFiles() {
		// isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
	}
487

  
487
	
488 488
	@Override
489 489
	public void cancel() {
490 490
		cancelNow = true;
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImport.groovy (revision 2472)
40 40
	protected ArrayList<String> getTXMFilesOrder() {
41 41
		//System.out.println("XTZ FILES ORDER");
42 42
		if (importer == null) {
43
			println "no importer step, using default text order"
43
			//println "no importer step, using default text order"
44 44
			return super.getTXMFilesOrder();
45 45
		}
46 46
		Metadatas metadata = importer.getMetadata();
......
55 55
		}
56 56
		
57 57
		if (metadata == null) {
58
			println "no metadata, using default text order"
58
			//println "no metadata, using default text order"
59 59
			return super.getTXMFilesOrder();
60 60
		}
61 61
		
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/CropTranscription.groovy (revision 2472)
1
// Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr
2
//
3
// This file is part of the TXM platform.
4
//
5
// The TXM platform is free software: you can redistribute it and/or modify
6
// it under the terms of the GNU General Public License as published by
7
// the Free Software Foundation, either version 3 of the License, or
8
// (at your option) any later version.
9
//
10
// The TXM platform is distributed in the hope that it will be useful,
11
// but WITHOUT ANY WARRANTY; without even the implied warranty of
12
// MERCHANTABILITY or FITNESS  FOR A PARTICULAR PURPOSE.  See the
13
// GNU General Public License for more details.
14
//
15
// You should have received a copy of the GNU General Public License
16
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
17
//
18
// $LastChangedDate: 2011-11-01 16:12:36 +0100 (mar., 01 nov. 2011) $
19
// $LastChangedRevision: 2049 $
20
// $LastChangedBy: sheiden $
21
//
22

  
23
package org.txm.scripts.importer.transcriber
24

  
25
import java.text.DecimalFormat;
26
// parameters
27

  
28
String userdir = System.getProperty("user.home")
29
File infile = new File(userdir, "xml/minitranscriber/int01.trs")
30
File outfile = new File(userdir, "xml/minitranscriber/int01-cropped.trs")
31

  
32
float start = 10.0f;
33
float end = 500.0f;
34
formater = DecimalFormat.getInstance(Locale.ENGLISH)
35

  
36
/**
 * Parses a Transcriber (.trs) file and writes it back to outfile.
 *
 * The actual cropping is NOT implemented yet: the document is re-serialized as parsed,
 * and three reminder messages are printed. The start and end parameters are currently unused.
 *
 * @param infile the .trs file to read
 * @param outfile the file to write, in UTF-8, prefixed with an XML declaration and the
 *        Trans DOCTYPE
 * @param start lower time bound of the part to keep (unused for now)
 * @param end upper time bound of the part to keep (unused for now)
 * @return the result of the withWriter call (the value of its closure)
 */
public def cropTRS(File infile, File outfile, float start, end) {
	// Open input file. The parser reads straight from the file URI, so no separate
	// stream is opened here (the previous one was never read nor closed).
	def slurper = new XmlParser();
	// explicitly turn off the parser's "disallow-doctype-decl" feature
	slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
	def trs = slurper.parse(infile.toURI().toString())

	println "TO BE DONE: remove elements before 'start' and after 'end'"
	println "REMOVE TEXT NODES BEFORE removed Sync"
	println "REMOVE TEXT NODES AFTER removed Sync"

	// TODO placeholder traversal: the <Sync>s of each Turn are visited but nothing is changed yet
	for (def section : trs.Episode.Section) {
		section.Turn.each{ turn ->
			turn.Sync.each(){ sync ->
			}
		}
	}

	println ""
	outfile.withWriter("UTF-8"){ writer ->
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		new groovy.util.XmlNodePrinter(new PrintWriter(writer)).print(trs) }
}
67

  
68
/// MAIN ///
69
File tmpfile = new File(infile.getAbsolutePath()+".tmp")
try {
	cropTRS(infile, tmpfile, start, end)
	// NOTE(review): the sibling script of this changeset is ShiftTimings.groovy; confirm that a
	// ShiftTiming class really exists on the classpath.
	// NOTE(review): shifting by +start moves the times later; rebasing the cropped part to zero
	// would presumably need -start. Confirm the intent.
	new ShiftTiming().shiftTRS(tmpfile, outfile, start)
} finally {
	// always remove the intermediate file, even when the crop or the shift fails
	tmpfile.delete()
}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/ShiftTimings.groovy (revision 2472)
1
// Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr
2
//
3
// This file is part of the TXM platform.
4
//
5
// The TXM platform is free software: you can redistribute it and/or modify
6
// it under the terms of the GNU General Public License as published by
7
// the Free Software Foundation, either version 3 of the License, or
8
// (at your option) any later version.
9
//
10
// The TXM platform is distributed in the hope that it will be useful,
11
// but WITHOUT ANY WARRANTY; without even the implied warranty of
12
// MERCHANTABILITY or FITNESS  FOR A PARTICULAR PURPOSE.  See the
13
// GNU General Public License for more details.
14
//
15
// You should have received a copy of the GNU General Public License
16
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
17
//
18
// $LastChangedDate: 2011-11-01 16:12:36 +0100 (mar., 01 nov. 2011) $
19
// $LastChangedRevision: 2049 $
20
// $LastChangedBy: sheiden $
21
//
22

  
23
package org.txm.scripts.importer.transcriber
24
import java.text.DecimalFormat;
25
// parameters
26

  
27
String userdir = System.getProperty("user.home")
File infile = new File(userdir, "xml/minitranscriber/int01.trs")
File outfile = new File(userdir, "xml/minitranscriber/int01-shifted.trs")
float shift = -0.4;
// Script-level (binding) number formatter, used by shiftTime() to write the shifted times.
formater = DecimalFormat.getInstance(Locale.ENGLISH)
// The default ENGLISH instance groups thousands ("1,234.5"); a time written that way can no
// longer be read back by Float.parseFloat, so grouping is switched off.
formater.setGroupingUsed(false)
32

  
33
/**
 * Adds shift to the given time attributes of an XML node, in place.
 *
 * For each attribute name, the value is parsed as a float. A value equal to 0 is left
 * untouched. Otherwise the shifted value, clamped to 0 when negative, is written back
 * using the script-level formater.
 *
 * @param element node whose attributes() map is read and updated
 * @param timeProperties names of the attributes to shift
 * @param shift amount added to each time (may be negative)
 * @return true when every listed attribute was processed; false as soon as one attribute
 *         is missing or not a number (attributes handled before it stay shifted)
 */
def shiftTime(element, timeProperties, float shift) // end < start
{
	//println "shift "+element.attributes()+" "+timeProperties
	for (def p : timeProperties) {

		def timeS = element.attributes()[p]
		if (timeS == null) {
			// missing attribute: report it instead of failing inside Float.parseFloat(null)
			return false
		}

		float time
		try {
			time = Float.parseFloat(timeS.toString())
		} catch (NumberFormatException e) {
			// not a number: let the caller print its error message and stop
			return false
		}

		if (time == 0.0f) {
			continue;
		}

		def newTime = time + shift
		if (newTime < 0) newTime = 0.0f;

		element.attributes()[p] = formater.format(newTime);
	}

	return true
}
53

  
54
/**
 * Shifts every time of a Transcriber (.trs) file and writes the result to outfile.
 *
 * The startTime/endTime attributes of each Section and Turn, and the time attribute of
 * each Sync, are passed to shiftTime(). Processing stops at the first element for which
 * shiftTime() returns false; in that case nothing is written.
 *
 * @param infile the .trs file to read
 * @param outfile the file to write, in UTF-8, prefixed with an XML declaration and the
 *        Trans DOCTYPE
 * @param shift amount added to each time (may be negative)
 * @return true when the file was written, false when an element could not be shifted
 */
public def shiftTRS(File infile, File outfile, float shift) {
	// Open input file. The parser reads straight from the file URI, so no separate
	// stream is opened here (the previous one was never read nor closed).
	def slurper = new XmlParser();
	// explicitly turn off the parser's "disallow-doctype-decl" feature
	slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
	def trs = slurper.parse(infile.toURI().toString())

	// Plain for-loops are used so that "return false" leaves shiftTRS itself;
	// inside an each{} closure it would only end the current iteration.
	for (def section : trs.Episode.Section) {

		if (!shiftTime(section, ["startTime", "endTime"], shift)) {
			println "Shift Section error: "+section.attributes()
			return false;
		}

		for (def turn : section.Turn) {

			if (!shiftTime(turn, ["startTime", "endTime"], shift)) {
				println "Shift Turn error: "+turn.attributes()
				return false;
			}

			// Then fix all <Sync>s of the Turn
			for (def sync : turn.Sync) {
				if (!shiftTime(sync, ["time"], shift)) {
					println "Shift Sync error: "+sync.attributes()
					return false;
				}
			}
		}
	}

	println ""
	outfile.withWriter("UTF-8"){ writer ->
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		new groovy.util.XmlNodePrinter(new PrintWriter(writer)).print(trs) }

	// explicit success value (the withWriter result used to be returned, i.e. null)
	return true
}
94

  
95
/// MAIN ///
96
shiftTRS(infile, outfile, shift);

Formats disponibles : Unified diff