/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 2472

     import org.txm.importer.xtz.*
     class XTZImporter extends Importer {
     	// Set to true by cancel() to request that the import stops.
     	public boolean cancelNow = false;
     	// Files handed from one import step to the next; each step reassigns it.
     	public def filesToProcess = []
     	// NOTE(review): not referenced in the visible part of this class — presumably the front XSL output directory; confirm.
     	public File frontXSLResultDirectory;
     	// NOTE(review): not referenced in the visible part of this class — presumably $bindir/tokenized; confirm.
     	public File tokenizedDirectory;
     	// NOTE(review): not referenced in the visible part of this class — confirm usage.
     	public File sentencedDirectory;
     	// Front XSL path, read from project.getFrontXSL() in process().
     	public String xslPath;
     	// XSLT parameters, read from project.getXsltParameters() in process() and passed to the ApplyXsl2 calls.
     	public def xslParams;
     	// Corpus language, read from project.getLang() in process() and given to the tokenizer.
     	public String lang;
     	// Word element name, read from project.getTokenizerWordElement() in process().
     	public String wordTag;
     	// Read from project.getDoTokenizerStep() in process(); tested by doTokenizeStep().
     	public boolean doTokenizeStep = false;
     	public Metadatas metadata = null; // text metadata
     	/**
     	 * Creates the importer step for the given import module.
     	 *
     	 * @param module the import module, forwarded unchanged to the parent constructor
     	 */
     	public XTZImporter(ImportModule module) {
     		super(module)
     	}
     	/**
     	 * @return the text metadata held by this importer, or null when none has been set
     	 */
     	public Metadatas getMetadata() {
     		return this.metadata
     	}
     	public void process() {
     		Project project = module.getProject();
     		File binDir = module.getBinaryDirectory();
     		String corpusname = project.getName();
     		def srcFiles = [];
     		def files = inputDirectory.listFiles();
     		if (files == null) {
     			reason = "No file to process in "+inputDirectory
     			return; // no file to process in the directory
+    		}
     		for (File f : files) {
     			if (f.isHidden() || f.isDirectory())
     				continue;
     			continue;
     			else if (f.getName().endsWith(".properties"))
     				continue;
     			continue;
     			else if (f.getName().equals("import.xml"))
     				continue;
     			continue;
     			else if (f.getName().endsWith(".csv"))
     				continue;
     			continue;
     			else if (f.getName().endsWith(".dtd"))
     				continue;
     			continue;
     			else if (f.getName().endsWith(".xsl"))
     				continue;
     			continue;
     			else if (f.getName().endsWith("~"))
     				continue;
     			continue;
     			else if (f.getName().startsWith("."))
     				continue;
     			continue;
     			srcFiles << f
+    		}
     		if (srcFiles.size() == 0) {
     			reason = "No suitable file to process in "+inputDirectory
     			return; // no file to process in the directory
+    		}
     		// prepare front XSL if any
     		xslPath = project.getFrontXSL()
     		xslParams = project.getXsltParameters()
     		lang = project.getLang();
     		wordTag = project.getTokenizerWordElement()
     		this.doTokenizeStep = project.getDoTokenizerStep()
     		//prepare metadata if any
     		File allMetadataFile = Metadatas.findMetadataFile(inputDirectory);
     		if (allMetadataFile.exists()) {
-...
     				return;
+    			}
     			metadata = new Metadatas(copy,
     					Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
     					Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
     					Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
     			Toolbox.getPreference(TBXPreferences.METADATA_ENCODING),
     			Toolbox.getPreference(TBXPreferences.METADATA_COLSEPARATOR),
     			Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
+    		}
     		// remove previous Text if any
     		project.deleteChildren(Text.class);
     		// main workflow of XTZ importer step
     		if (!doFixSurrogates()) return;
     		if (!doSplitMergeXSLStep()) return;
     		if (!doFrontXSLStep()) return;
-...
     		if (!doInjectMetadataStep()) return;
     		isSuccessFul = filesToProcess.size() > 0
     		String cleanDirectories = project.getCleanAfterBuild();
     		if ("true".equals(cleanDirectories)) {
     			new File(module.getBinaryDirectory(), "tokenized").deleteDir()
-...
     			new File(module.getBinaryDirectory(), "split").deleteDir()
+    		}
+    	}
     	/**
     	 * read from source directory and remove the surrogate chars not well managed
     	 * save the result in $bindir/src
-...
     				FileCopy.copyFiles(file, outputFile)
+    			}
+    		}
     		inputDirectory = srcDirectory; // the files to process are now in the "src" directory
     		return true;
+    	}
     	/**
     	 * read from source directory and write the result in $bindir/txm
+    	 *
-...
     			for (File xslFile : xslFiles) {
     				if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
     				//if (!xslFile.getName().matches("[1-9]-.+")) continue;
     				if (ApplyXsl2.processImportSources(xslFile, inputDirectory.listFiles(), xslParams)) {
     					println ""
     				} else {
-...
+    		}
     		return true;
+    	}
     	public boolean doEncodeMileStonesStep() {
     		filesToProcess = new File(module.getBinaryDirectory(), "tokenized").listFiles()
     		def milestonesString = module.getProject().getTextualPlan("MileStones")
     		if (milestonesString.length() == 0) return true;
     		def milestones = milestonesString.split(",")
     		//def milestones = ["lb", "pb", "cb"]
     		//println "milestones=$milestones type="+milestones.getClass()+" size="+milestones.size()
     		if (milestones.size() == 0) return true;
     		println "-- Encoding milestone $milestones into XML files..."
     		ConsoleProgressBar cpb = new ConsoleProgressBar(milestones.size())
     		for (def milestone : milestones) {
     			milestone = milestone.trim()// just in case
     			cpb.tick()
     			for (File inputFile : filesToProcess) {
     				File outputFile = File.createTempFile("tmp", inputFile.getName(),inputFile.getParentFile());
     				println "$inputFile, 'text', $wordTag, $milestone, $outputFile"
-...
     		println ""
     		return true;
+    	}
     	/**
     	 * read from $inputDirectory and write the result in $bindir/txm
+    	 *
     	 */
     	public boolean doFrontXSLStep() {
     		//filesToProcess = inputDirectory.listFiles();
     		File frontXSLdirectory = new File(module.getSourceDirectory(), "xsl/2-front")
     		println "-- Front XSL Step with the $frontXSLdirectory directory."
     		def xslFiles = frontXSLdirectory.listFiles()
-...
     		xslParams["output-directory"] = outputDirectory.getAbsoluteFile().toURI().toString();
     		if (frontXSLdirectory.exists() && xslFiles != null && xslFiles.size() > 0) {
     			xslFiles.sort()
     			for (File xslFile : xslFiles) {
     				if (xslFile.isDirectory() || xslFile.isHidden() || !xslFile.getName().endsWith(".xsl")) continue;
     				//if (!xslFile.getName().matches("[1-9]-.+")) continue;
     				if (ApplyXsl2.processImportSources(xslFile, inputDirectory, outputDirectory, xslParams)) {
     					inputDirectory = outputDirectory; // the files to process are now in the "txm" directory
     					println ""
-...
+    		}
     		return true;
+    	}
     	public boolean doCheckXMLFileStep() {
     		filesToProcess = []
     		println "-- Checking XML files for well-formedness."
     		def files = inputDirectory.listFiles()
     		println "-- Checking XML-TEI files for well-formedness."
     		def files = inputDirectory.listFiles(new FileFilter() {
     			public boolean accept(File f) {
     				return (f.getName().toLowerCase().endsWith(".xml") && !f.isDirectory() && !f.isHidden()
     				&& f.canRead()
     				&& !f.getName().startsWith("metadata")
     				&& !f.getName().startsWith("import"));
+    			}
     		});
     		if (files == null || files.size() == 0) {
     			println "No XML file (*.xml) to process. Aborting"
     			return false;
+    		}
     		files = files.sort()
     		ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
     		for (File f : files) {
     			cpb.tick()
     			if (f.getName().toLowerCase().endsWith(".xml"))
     				if (!f.isHidden() && f.canRead() && !f.getName().endsWith(".properties")
     				&& !f.getName().startsWith("metadata") && !f.isDirectory())
+    			{
     				if (!ValidateXml.test(f)) {
     					continue;
     				} else {
     					if (!f.getName().equals("import.xml"))
     						filesToProcess << f
+    				}
     			if (!ValidateXml.teiTest(f)) {
     				println "Won't process XML file: "+f
     				continue;
     			} else {
     				filesToProcess << f
+    			}
+    		}
     		println ""
     		if (filesToProcess.size() == 0) {
     			reason = "No file suitable to process in "+inputDirectory
     			reason = "No suitable file to process in "+inputDirectory
     			return false;
     		} else {
     			return true;
+    		}
+    	}
     	public boolean doTokenizeStep() {
     		new File(module.getBinaryDirectory(),"tokenized").deleteDir()
     		new File(module.getBinaryDirectory(),"tokenized").mkdir()
     		String outSideTextTagsRegex = "";
     		String outSideTextTagsAndKeepContentRegex = "";
     		String noteRegex = "";
     		// get the element names to ignore
     		String e1 = module.getProject().getTextualPlan("OutSideTextTags")
     		def split  = e1.split(",")
     		for (String s : split) {
-...
     		if (outSideTextTagsRegex.trim().length() > 0) {
     			outSideTextTagsRegex = outSideTextTagsRegex.substring(1) // remove the first "|"
+    		}
     		String e2 = module.getProject().getTextualPlan("OutSideTextTagsAndKeepContent")
     		def split2  = e2.split(",")
     		for (String s : split2) {
-...
     		if (outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
     			outSideTextTagsAndKeepContentRegex = outSideTextTagsAndKeepContentRegex.substring(1) // remove the first "|"
+    		}
     		String e3 = module.getProject().getTextualPlan("Note")
     		def split3  = e3.split(",")
     		for (String s : split3) {
-...
     		if (noteRegex.trim().length() > 0) {
     			noteRegex = noteRegex.substring(1) // remove the first "|"
+    		}
     		//if (wordTag != "w") {
     		if (!doTokenizeStep) {
     			println "No tokenization do to."
-...
     				File infile = f;
     				File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
     				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
     				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
     					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
+    				}
     				if (outSideTextTagsAndKeepContentRegex != null && outSideTextTagsAndKeepContentRegex.trim().length() > 0) {
     					tokenizer.setOutSideTextTagsAndKeepContent(outSideTextTagsAndKeepContentRegex)
+    				}
     				if (noteRegex != null && noteRegex.trim().length() > 0) {
     					tokenizer.setNote(noteRegex)
+    				}
     				// tokenize !
     				if (!tokenizer.process()) {
     					println("Failed to process "+f)
-...
+    				}
+    			}
+    		}
     		filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
     		println ""
     		return true;
+    	}
     	/**
     	 * Post-tokenization XSL step: reads from $bindir/tokenized and writes the
     	 * result back to $bindir/tokenized.
     	 *
     	 * The files of $bindir/tokenized become the current filesToProcess. When the
     	 * "xsl/3-posttok" directory of the source directory is missing or empty the
     	 * step is a no-op that succeeds.
     	 *
     	 * @return true when there is no stylesheet to apply, otherwise the result of
     	 *         ApplyXsl2.processWithMultipleXSL
     	 */
     	public boolean doPostTokenizeXSLStep() {
     		File tokenizedDir = new File(module.getBinaryDirectory(), "tokenized")
     		filesToProcess = tokenizedDir.listFiles()

     		File xslDirectory = new File(module.getSourceDirectory(), "xsl/3-posttok")
     		def stylesheets = xslDirectory.listFiles()

     		// stylesheets are applied only when the directory exists and is not empty
     		if (xslDirectory.exists() && stylesheets != null && stylesheets.size() > 0) {
     			println "-- Posttokenize XSL Step with $xslDirectory"
     			return ApplyXsl2.processWithMultipleXSL(filesToProcess, xslDirectory, xslParams)
     		}
     		return true
     	}
     	public boolean doInjectMetadataStep() {
     		if (metadata != null) {
     			println("-- INJECTING METADATA - "+metadata.getHeadersList()+" in texts of directory "+outputDirectory)
-...
+    		}
     		return true;
+    	}
     	public boolean doToXMLTXMStep() {
     		filesToProcess = new File(module.getBinaryDirectory(),"tokenized").listFiles()
     		ConsoleProgressBar cpb = new ConsoleProgressBar(filesToProcess.size())
-...
     			cpb.tick()
     			File file = f;
     			String txmfile = f.getName();
     			def correspType = new HashMap<String,String>()
     			def correspRef = new HashMap<String,String>()
     			// the ids of all the respStmt elements must be listed
-...
     		cpb.done();
     		return true;
+    	}
     	/**
     	 * Intentionally does nothing: the success flag is left untouched here.
     	 */
     	@Override
     	public void checkFiles() {
     		// former check, kept disabled:
     		//isSuccessFul = isSuccessFul && outputDirectory.listFiles() != null;
     	}
     	@Override
     	public void cancel() {
     		cancelNow = true;

     	protected ArrayList<String> getTXMFilesOrder() {
     		//System.out.println("XTZ FILES ORDER");
     		if (importer == null) {
     			println "no importer step, using default text order"
     			//println "no importer step, using default text order"
     			return super.getTXMFilesOrder();
+    		}
     		Metadatas metadata = importer.getMetadata();
-...
+    		}
     		if (metadata == null) {
     			println "no metadata, using default text order"
     			//println "no metadata, using default text order"
     			return super.getTXMFilesOrder();
+    		}

     // Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS  FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     // $LastChangedDate: 2011-11-01 16:12:36 +0100 (mar., 01 nov. 2011) $
     // $LastChangedRevision: 2049 $
     // $LastChangedBy: sheiden $
     //
     package org.txm.scripts.importer.transcriber
     import java.text.DecimalFormat;
     // parameters
     String userdir = System.getProperty("user.home")
     // input transcription and the cropped transcription to produce
     File infile = new File(userdir, "xml/minitranscriber/int01.trs")
     File outfile = new File(userdir, "xml/minitranscriber/int01-cropped.trs")
     // time window to keep
     float start = 10.0f;
     float end = 500.0f;
     // Script-binding number formatter (English locale, '.' as decimal separator).
     // Grouping is switched off so that values >= 1000 are never written with a
     // thousands separator such as "1,234.5".
     formater = DecimalFormat.getInstance(Locale.ENGLISH)
     formater.setGroupingUsed(false)
     /**
      * Copies a Transcriber (.trs) file, intended to keep only the [start, end] time window.
      *
      * The cropping itself is NOT implemented yet: the input is parsed and written
      * back with an XML declaration and the "trans-14.dtd" DOCTYPE, and reminder
      * messages are printed.
      *
      * @param infile  the .trs file to read
      * @param outfile the file to write, encoded in UTF-8
      * @param start   start of the window to keep (currently unused)
      * @param end     end of the window to keep (currently unused)
      * @return no meaningful value
      */
     public def cropTRS(File infile, File outfile, float start, end) {
     	// Open input file
     	// .trs files declare a DOCTYPE, so the parser must accept it.
     	// NOTE(review): this relaxes XML parser hardening — only feed trusted files.
     	def parser = new XmlParser()
     	parser.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
     	def trs = parser.parse(infile.toURI().toString())

     	println "TO BE DONE: remove elements before 'start' and after 'end'"
     	println "REMOVE TEXT NODES BEFORE removed Sync"
     	println "REMOVE TEXT NODES AFTER removed Sync"

     	// TODO: walk trs.Episode.Section -> Turn -> Sync and drop what lies outside the window

     	println ""
     	outfile.withWriter("UTF-8") { writer ->
     		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
     		def out = new PrintWriter(writer)
     		new groovy.util.XmlNodePrinter(out).print(trs)
     		// make sure nothing stays buffered before withWriter closes the file
     		out.flush()
     	}
     }
     /// MAIN ///
     // Crop into a temporary file next to the input, then shift its timings into the
     // final output. The temporary file is removed even when one of the steps fails.
     // NOTE(review): the shift passed is +start; if the goal is to make the cropped
     // file begin at 0 the offset should probably be -start — confirm.
     File tmpfile = new File(infile.getAbsolutePath()+".tmp")
     try {
     	cropTRS(infile, tmpfile, start, end)
     	new ShiftTiming().shiftTRS(tmpfile, outfile, start)
     } finally {
     	tmpfile.delete()
     }

     // Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS  FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     // $LastChangedDate: 2011-11-01 16:12:36 +0100 (mar., 01 nov. 2011) $
     // $LastChangedRevision: 2049 $
     // $LastChangedBy: sheiden $
     //
     package org.txm.scripts.importer.transcriber
     import java.text.DecimalFormat;
     // parameters
     String userdir = System.getProperty("user.home")
     // input transcription and the shifted transcription to produce
     File infile = new File(userdir, "xml/minitranscriber/int01.trs")
     File outfile = new File(userdir, "xml/minitranscriber/int01-shifted.trs")
     // offset added to every time value (negative = earlier)
     float shift = -0.4;
     // Script-binding number formatter used to write the shifted times (English
     // locale, '.' as decimal separator). Grouping is switched off: with the default
     // settings a time such as 1234.5 would be written "1,234.5".
     formater = DecimalFormat.getInstance(Locale.ENGLISH)
     formater.setGroupingUsed(false)
     /**
      * Adds an offset to the given time attributes of one XML element.
      *
      * An attribute whose value is 0 is left untouched; a shifted value that would
      * become negative is clamped to 0. Nothing is modified unless every listed
      * attribute is present and numeric.
      *
      * @param element        a node exposing its attributes through attributes()
      * @param timeProperties names of the attributes to shift
      * @param shift          offset added to each time value
      * @return true when all attributes were processed, false when one of them is
      *         missing or is not a number
      */
     def shiftTime(element, timeProperties, float shift)
     {
     	// Local formatter so that the method does not depend on a script-binding
     	// variable; grouping is disabled to avoid output such as "1,234.5".
     	def timeFormat = DecimalFormat.getInstance(Locale.ENGLISH)
     	timeFormat.setGroupingUsed(false)

     	// First pass: validate and compute, so a bad attribute leaves the element unchanged.
     	def shifted = [:]
     	for (def p : timeProperties) {
     		def timeS = element.attributes()[p]
     		if (timeS == null) {
     			return false
     		}
     		double time
     		try {
     			// double keeps millisecond precision on long recordings
     			time = Double.parseDouble(timeS.toString().trim())
     		} catch (NumberFormatException e) {
     			return false
     		}
     		if (time == 0.0d) {
     			continue // a zero time is never shifted
     		}
     		double newTime = time + shift
     		if (newTime < 0) newTime = 0.0d
     		shifted[p] = timeFormat.format(newTime)
     	}

     	// Second pass: apply.
     	shifted.each { name, value -> element.attributes()[name] = value }
     	return true
     }
     /**
      * Shifts every timing of a Transcriber (.trs) file by a fixed offset.
      *
      * The start/end times of each Section and Turn and the time of each Sync are
      * shifted with shiftTime(). The result is written in UTF-8 with an XML
      * declaration and the "trans-14.dtd" DOCTYPE.
      *
      * @param infile  the .trs file to read
      * @param outfile the file to write
      * @param shift   offset added to each time value
      * @return true on success; false (and nothing written) as soon as one element
      *         cannot be shifted
      */
     public def shiftTRS(File infile, File outfile, float shift) {
     	// Open input file
     	// .trs files declare a DOCTYPE, so the parser must accept it.
     	// NOTE(review): this relaxes XML parser hardening — only feed trusted files.
     	def parser = new XmlParser()
     	parser.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)
     	def trs = parser.parse(infile.toURI().toString())

     	// Plain for loops (not .each closures) so that "return false" really leaves
     	// this method instead of only ending the current closure call.
     	for (def section : trs.Episode.Section) {
     		if (!shiftTime(section, ["startTime", "endTime"], shift)) {
     			println "Shift Section error: "+section.attributes()
     			return false
     		}
     		for (def turn : section.Turn) {
     			if (!shiftTime(turn, ["startTime", "endTime"], shift)) {
     				println "Shift Turn error: "+turn.attributes()
     				return false
     			}
     			for (def sync : turn.Sync) {
     				if (!shiftTime(sync, ["time"], shift)) {
     					println "Shift Sync error: "+sync.attributes()
     					return false
     				}
     			}
     		}
     	}

     	println ""
     	outfile.withWriter("UTF-8") { writer ->
     		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
     		def out = new PrintWriter(writer)
     		new groovy.util.XmlNodePrinter(out).print(trs)
     		// make sure nothing stays buffered before withWriter closes the file
     		out.flush()
     	}
     	return true
     }
     /// MAIN ///
     // run the shift with the parameters declared at the top of the script
     shiftTRS(infile, outfile, shift)

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 2472