Révision 3003

tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 3003)
187 187
			File tfile = new File(tokenizedDir, pfile.getName())
188 188
			try {
189 189
				TranscriberTokenizer tokenizer = new TranscriberTokenizer(pfile, tfile, lang)
190
				tokenizer.setRetokenize("true" == project.getTokenizerParameter("doRetokenizeStep", "false"))
190 191
				if (!tokenizer.process()) {
191 192
					println("Failed to tokenize "+pfile)
192 193
				}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/TranscriberTokenizer.groovy (revision 3003)
108 108
//			}
109 109

  
110 110
			//test events
111
			if (s.startsWith("^^")) {
111
			if (s.startsWith("^^") && s.length() > 2) {
112 112
				event += "#orth";			
113 113
				s = s.substring(2);
114 114
			}
115
			if (s.startsWith("*")) {
115
			if (s.startsWith("*") && s.length() > 1) {
116 116
				event += "#corr";
117 117
				s = s.substring(1);
118 118
			}
......
121 121
				event = event.substring(1);//remove '|'
122 122
			
123 123
			// test audio
124
			if (s.contains("(") || s.contains(")")) // contains ( or )
124
			if ((s.contains("(") || s.contains(")")) && && s.length() > 2) // contains ( or )
125 125
			{
126 126
				if (s.contains("(") ^ s.contains(")")) // ert(ert XOR ert)ert
127 127
				{
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3003)
331 331
		new File(module.getBinaryDirectory(),"tokenized").deleteDir()
332 332
		new File(module.getBinaryDirectory(),"tokenized").mkdir()
333 333
		
334
		boolean retokenize = "true" == module.getProject().getTokenizerParameter("doRetokenizeStep", "false")
334 335
		String outSideTextTagsRegex = "";
335 336
		String outSideTextTagsAndKeepContentRegex = "";
336 337
		String noteRegex = "";
......
380 381
				File infile = f;
381 382
				File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
382 383
				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, lang)
383
				
384
				tokenizer.setRetokenize(retokenize)
384 385
				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
385 386
					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
386 387
				}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3003)
51 51

  
52 52
import javax.xml.stream.*
53 53

  
54
import org.txm.scripts.importer.graal.PersonalNamespaceContext
54
import org.txm.importer.PersonalNamespaceContext
55 55
import org.txm.tokenizer.TokenizerClasses
56 56

  
57 57
@CompileStatic
58 58
public class SimpleTokenizerXml {
59

  
59
	
60
	boolean retokenize = false
61
	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
62
	
60 63
	/** The word_tags. */
61 64
	String word_tags = TokenizerClasses.word_tags;
62 65
	String word_element_to_create = TokenizerClasses.word_element_to_create;
63 66
	Pattern reg_word_tags;
64 67
	/** The intraword_tags. */
65 68
	String intraword_tags = TokenizerClasses.intraword_tags;
66

  
69
	
67 70
	/** The punct_strong. */
68 71
	String punct_strong = TokenizerClasses.punct_strong;
69

  
72
	
70 73
	/** The punct_all. */
71 74
	String punct_all = TokenizerClasses.punct_all;
72

  
75
	
73 76
	/** The word_chars. */
74 77
	String word_chars = TokenizerClasses.word_chars;
75

  
78
	
76 79
	String fclitics = null; // default behavior don't manage clitics
77 80
	String pclitics = null; // default behavior don't manage clitics
78

  
81
	
79 82
	/** The outside_text_tags_ignore_content. */
80 83
	String note_content = null;
81 84
	String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
82 85
	String outside_text_tags = null // tag and content removed
83 86
	String startTag = null
84 87
	Pattern reg_note_content;
85
	Pattern reg_outside_text_tags_keep_content; 
88
	Pattern reg_outside_text_tags_keep_content;
86 89
	Pattern reg_outside_text_tags;
87 90
	Pattern reg_startTag;
88 91
	
89 92
	Pattern reg_punct_other = Pattern.compile("\\p{P}")
90

  
93
	
91 94
	/** The DEBUG. */
92 95
	public boolean DEBUG = false;
93

  
96
	
94 97
	/** The outfile. */
95 98
	File outfile;
96

  
99
	
97 100
	/** The infile. */
98 101
	File infile;
99

  
102
	
100 103
	String lang;
101

  
104
	
102 105
	/** The buffer. */
103 106
	StringBuffer buffer;
104

  
107
	
105 108
	/** The writer. */
106 109
	XMLStreamWriter writer;
107 110
	BufferedOutputStream output;
108

  
111
	
109 112
	/** The parser. */
110 113
	XMLStreamReader parser
111

  
114
	
112 115
	/** The localname. */
113 116
	String localname;
114

  
117
	
115 118
	/** The prefix. */
116 119
	String prefix;
117 120
	String filename;
118

  
121
	
119 122
	def regElision = null;
120 123
	def reg3pts = null;
121 124
	def regPunct;
......
126 129
	Pattern regLN;
127 130
	Pattern regCTRL;
128 131
	Pattern regSplitWhiteSpaces;
129

  
132
	
130 133
	public SimpleTokenizerXml(File infile, File outfile) {
131 134
		this(infile, outfile, "");
132 135
	}
133

  
136
	
134 137
	/**
135 138
	 * Instantiates a new simple tokenizer xml.
136 139
	 *
......
150 153
			} else if (lang.startsWith("it")) {
151 154
				pclitics = TokenizerClasses.PClitic_it
152 155
			}
153

  
156
		
154 157
		word_tags = TokenizerClasses.word_tags;
155 158
		reg_word_tags = Pattern.compile(word_tags);
156 159
		
......
158 161
		punct_strong = TokenizerClasses.punct_strong;
159 162
		punct_all = TokenizerClasses.punct_all;
160 163
		word_chars = TokenizerClasses.word_chars;
161

  
164
		
162 165
		this.outfile = outfile;
163 166
		this.infile = infile;
164 167
		this.filename = infile.getName();
165 168
		int index = filename.lastIndexOf(".");
166 169
		if (index > 0) filename = filename.substring(0, index);
167

  
170
		
168 171
		String strRegElision = TokenizerClasses.regElision;
169 172
		if (strRegElision != null && strRegElision.length() > 0)
170 173
			regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string  ^
......
176 179
		
177 180
		if (fclitics != null && fclitics.length() > 0)
178 181
			regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $
179
			
182
		
180 183
		if (pclitics != null && pclitics.length() > 0)
181 184
			regPClitics = /^($pclitics)(.*)/ // the test must start with the start of string  ^
182
			
185
		
183 186
		if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0)
184 187
			regWhitespaces = ~TokenizerClasses.whitespaces;
185

  
188
		
186 189
		if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0)
187 190
			regSplitWhiteSpaces = Pattern.compile(TokenizerClasses.whitespaces);
188
			
191
		
189 192
		regLN = Pattern.compile("/\n/");
190 193
		regCTRL = Pattern.compile("/\\p{C}/");
191 194
	}
192

  
195
	
193 196
	/**
194 197
	 * Fill infos.
195 198
	 *
......
202 205
			prefix = parser.getPrefix();
203 206
		}
204 207
	}
205

  
208
	
206 209
	/**
207 210
	 * Donothing: just write what is read
208 211
	 * 
......
225 228
				//					writer.writeStartElement(namespace, localname);
226 229
				//				else
227 230
				writer.writeStartElement(localname);
228

  
231
			
229 232
			//			if(parser.getNamespaceCount() > 0)
230 233
			//				writer.writeDefaultNamespace(parser.getNamespaceURI(0))
231 234
			//			for(int i = 1 ; i < parser.getNamespaceCount() ; i++)
232 235
			//				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
233

  
236
			
234 237
			String namespace_prefix;
235 238
			for (int i = 0 ; i<   parser.getNamespaceCount() ; i++) {
236 239
				namespace_prefix = parser.getNamespacePrefix(i);
......
240 243
					writer.writeDefaultNamespace(parser.getNamespaceURI(i));
241 244
				}
242 245
			}
243

  
246
			
244 247
			String attrprefix, attname;
245 248
			boolean hasId = false;
246 249
			//boolean hasType = false
......
251 254
				if ("id".equals(attname)) hasId = true;
252 255
				//if ("type".equals(attname)) hasType = true;
253 256
				if ("n".equals(attname)) hasN = true;
254

  
257
				
255 258
				if (attrprefix != null && attrprefix.length() > 0)
256 259
					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
257 260
				else
258 261
					writer.writeAttribute(attname, parser.getAttributeValue(i))
259 262
			}
260

  
263
			
261 264
			if (wordid != null && !hasId && localname == word_element_to_create)
262 265
				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
263

  
266
			
264 267
			if (!hasN && localname == word_element_to_create)
265 268
				writer.writeAttribute("n", ""+wordcount);
266

  
267
			writer.writeCharacters("\n");
269
			
270
			if (!reg_word_tags.matcher(localname).matches()) {
271
				writer.writeCharacters("\n");
272
			}
268 273
		}
269 274
		else if(event == XMLStreamConstants.END_ELEMENT)
270 275
		{
......
275 280
		{
276 281
			//println parser.getText();
277 282
			//writer.writeCharacters("𦟛");
278

  
279

  
283
			
284
			
280 285
			// checks if the token starts with an high surrogate
281 286
			//			if(isHighSurrogate(parser.getText().charAt(0)))	{
282 287
			//				println "warning: invalid UTF-8 XML range, token " + parser.getText() + " has been replaced.";
......
292 297
			//				writer.writeCharacters("&#" + scalar + ";");
293 298
			//			}
294 299
			//			else
295

  
296

  
300
			
301
			
297 302
			if (insideword) { // ensure there is not \t or \n in the word form value
298 303
				writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
299 304
			} else {
......
301 306
			}
302 307
		}
303 308
	}
304

  
305

  
309
	
310
	
306 311
	/**
307 312
	 * Converts the specified surrogates pair to scalar.
308 313
	 * @param highSurrogate
......
312 317
	public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)	{
313 318
		return ((highSurrogate - 0xD800) * 0x400) + (lowSurrogate - 0xDC00) + 0x10000;
314 319
	}
315

  
320
	
316 321
	/**
317 322
	 * Checks if the specified character is an high/leading surrogate.
318 323
	 * @param character
......
321 326
	public boolean isHighSurrogate(char character)	{
322 327
		return (character >= 0xD800 && character <= 0xDBFF);
323 328
	}
324

  
325

  
326

  
329
	
330
	
331
	
327 332
	/** The wordcount. */
328 333
	int wordcount = 0;
329

  
334
	
330 335
	/** The ignorecontent. */
331 336
	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
332 337
	boolean insideword = false;
......
345 350
		output = new BufferedOutputStream(new FileOutputStream(outfile))
346 351
		writer = factory.createXMLStreamWriter(output, "UTF-8")
347 352
		writer.setNamespaceContext(new PersonalNamespaceContext());
348

  
353
		
349 354
		def inputData = infile.toURI().toURL().openStream();
350 355
		def inputfactory = XMLInputFactory.newInstance();
351 356
		//inputfactory.setProperty("http://apache.org/xml/properties/input-buffer-size", new Integer(2048));
352 357
		//inputfactory.setExpandEntityReferences(false);
353
		parser = inputfactory.createXMLStreamReader(inputData, "UTF-8");
358
		XMLInputFactory.newInstance();
359
		parser = inputfactory.createXMLStreamReader(inputData);
354 360
		//println "PARSER: "+parser.getClass()
355 361
		writer.writeStartDocument("UTF-8","1.0");
356 362
		writer.writeCharacters("\n");
357

  
363
		
358 364
		int previousEvent = 0;
359 365
		boolean startProcess = false;
360 366
		if (startTag == null) // if no startTag specified we process from the start
......
363 369
		
364 370
		buffer = new StringBuffer();
365 371
		//println "process - start start tag: "+startTag+" startProcess: $startProcess"
366
//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
367
//		println "reg_outside_text_tags=$reg_outside_text_tags"
368
//		println "reg_note_content=$reg_note_content"
372
		//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
373
		//		println "reg_outside_text_tags=$reg_outside_text_tags"
374
		//		println "reg_note_content=$reg_note_content"
369 375
		try {
370 376
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
371 377
				if (!startProcess) {
......
380 386
						continue;
381 387
					}
382 388
				}
383

  
389
				
384 390
				if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
385 391
					processWord(); // tokenize now!
386 392
					buffer.setLength(0);
......
391 397
					localname = parser.getLocalName()
392 398
					if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
393 399
						//println "Found pretagged word";
394
						wordcount++;
395
						donothing(event, wordcount);
396
						//ignorecontent = true;
397
						insideword = true;
400
						
401
						if (retokenize) {
402
							retokenizedWordProperties.clear()
403
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
404
								retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
405
							}
406
						} else {
407
							wordcount++;
408
							donothing(event, wordcount);
409
							//ignorecontent = true;
410
							insideword = true;
411
						}
398 412
					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
399 413
						// ignore the tag only
400 414
						donothing(event, null); // write the tag
......
414 428
					//println "Close: "+localname;
415 429
					localname = parser.getLocalName()
416 430
					if (reg_word_tags.matcher(localname).matches()) {
417
						//ignorecontent = false;
418
						insideword = false;
419
						writer.writeEndElement();
420
						writer.writeCharacters("\n");
431
						if (retokenize) {
432
							retokenizedWordProperties.clear()
433
						} else {
434
							//ignorecontent = false;
435
							insideword = false;
436
							writer.writeEndElement();
437
							writer.writeCharacters("\n");
438
						}
421 439
					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
422 440
						ignorecontent = false;
423 441
						donothing(event, null);
......
448 466
				}
449 467
				previousEvent = event;
450 468
			}
451

  
469
			
452 470
			parser.close()
453 471
			writer.close();
454 472
			output.close();
......
464 482
		}
465 483
		return true;
466 484
	}
467

  
485
	
486
	public void setRetokenize(boolean retokenize) {
487
		this.retokenize = retokenize
488
	}
489
	
468 490
	/**
469 491
	 * Set the element and content to ignore
470 492
	 * 
......
526 548
		if (regSplitWhiteSpaces != null) {
527 549
			for (String s : regSplitWhiteSpaces.split(text)) {		// separate with unicode white spaces
528 550
				//	if (DEBUG){println "process $s"}
529
					iterate(s);
530
				}
551
				iterate(s);
552
			}
531 553
		} else {
532 554
			iterate(text);
533 555
		}
534 556
	}
535

  
557
	
536 558
	/**
537 559
	 * Iterate. a String, should be called when a word is found in a String
538 560
	 *
......
541 563
	 */
542 564
	protected iterate(String s) {
543 565
		while (s != null && s.length() > 0) {
544
		//	if (DEBUG){println "  > $s"}
566
			//	if (DEBUG){println "  > $s"}
545 567
			s = standardChecks(s);
546 568
		}
547 569
	}
548

  
570
	
549 571
	/**
550 572
	 * Standard checks.
551 573
	 *
......
556 578
	public String standardChecks(String s) {
557 579
		Matcher m;
558 580
		if (fclitics != null && (m = s =~ regFClitics) ) {
559
		//	if (DEBUG) println "CLITIC found: $s ->"+ m
581
			//	if (DEBUG) println "CLITIC found: $s ->"+ m
560 582
			iterate(m.group(1))
561 583
			
562 584
			wordcount++;
......
583 605
			
584 606
			return "";
585 607
		} else if (regElision != null && (m = s =~ regElision) ) {
586
		//	if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
608
			//	if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
587 609
			//iterate(m.group(1))
588

  
589
//			int sep = s.indexOf("'");
590
//			if (sep < 0)
591
//				sep = s.indexOf("’");
592
//			if (sep < 0)
593
//				sep = s.indexOf("‘");
594

  
610
			
611
			//			int sep = s.indexOf("'");
612
			//			if (sep < 0)
613
			//				sep = s.indexOf("’");
614
			//			if (sep < 0)
615
			//				sep = s.indexOf("‘");
616
			
595 617
			wordcount++;
596 618
			writer.writeStartElement(word_element_to_create);
597 619
			writeWordAttributes();// id
......
599 621
			writer.writeCharacters(m.group(1));
600 622
			writer.writeEndElement();
601 623
			writer.writeCharacters("\n");
602

  
624
			
603 625
			iterate(m.group(2))
604 626
			
605 627
			return "";
606 628
		} else if (reg3pts != null && (m = s =~ reg3pts) )	{
607
		//	if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
629
			//	if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
608 630
			iterate(m.group(1))
609

  
631
			
610 632
			wordcount++;
611 633
			writer.writeStartElement(word_element_to_create);
612 634
			writeWordAttributes();// id
......
614 636
			writer.writeCharacters("...");
615 637
			writer.writeEndElement();
616 638
			writer.writeCharacters("\n");
617

  
639
			
618 640
			return m.group(3);
619 641
		} else if (regPunct != null && (m = s =~ regPunct) ) {
620 642
			if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
621 643
			iterate(m.group(1));
622

  
644
			
623 645
			wordcount++;
624 646
			writer.writeStartElement(word_element_to_create);
625 647
			writeWordAttributes();// id
......
627 649
			writer.writeCharacters(m.group(2));
628 650
			writer.writeEndElement();
629 651
			writer.writeCharacters("\n");
630

  
652
			
631 653
			return m.group(3);
632 654
		} else {
633
	//		if(DEBUG){println "Other found: "+s}
655
			//		if(DEBUG){println "Other found: "+s}
634 656
			wordcount++;
635 657
			writer.writeStartElement(word_element_to_create);
636 658
			writeWordAttributes();// id
637
//			if (reg_punct_other.matcher(s).matches())
638
//				writer.writeAttribute("type","pon");
639
//			else
640
//				writer.writeAttribute("type","w");
659
			//			if (reg_punct_other.matcher(s).matches())
660
			//				writer.writeAttribute("type","pon");
661
			//			else
662
			//				writer.writeAttribute("type","w");
641 663
			writer.writeCharacters(s);
642 664
			writer.writeEndElement();
643 665
			writer.writeCharacters("\n");
644

  
666
			
645 667
			return "";
646 668
		}
647 669
	}
648

  
670
	
649 671
	/**
650 672
	 * Write word attributes.
651 673
	 *
......
654 676
	protected writeWordAttributes() {
655 677
		writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
656 678
		writer.writeAttribute("n",""+wordcount);
679
		for (String attr : retokenizedWordProperties.keySet()) {
680
			if ("id" == attr) {
681
				writer.writeAttribute("previous-id", retokenizedWordProperties[attr]);
682
			} else if ("n" == attr) {
683
				writer.writeAttribute("previous-n", retokenizedWordProperties[attr]);
684
			} else {
685
				writer.writeAttribute(attr, retokenizedWordProperties[attr]);
686
			}
687
		}
657 688
	}
658

  
689
	
659 690
	public void setStartTag(String tag)
660 691
	{
661 692
		this.startTag = tag;
662 693
		this.reg_startTag = Pattern.compile(startTag);
663 694
	}
664

  
695
	
665 696
	/**
666 697
	 * Tokenize.
667 698
	 *
......
672 703
	{
673 704
		return str.tokenize()	// cut by whitespace
674 705
	}
675

  
706
	
676 707
	/**
677 708
	 * The main method.
678 709
	 *
......
680 711
	 */
681 712
	public static void main(String[] args)
682 713
	{
683
		File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
684
		Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
685
		File inputFile = new File(SVNDIR, "baye1.xml")
686
		File outputFile = new File(SVNDIR, "baye1-t.xml")
687
		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
688
		tokenizer.DEBUG = true
689
		tokenizer.setNote("note")
690
		//tokenizer.setOutSideTextTags("teiHeader")
691
		tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
692
		println outputFile.toString() + " : "+tokenizer.process();
714
		//		File SVNDIR = new File(System.getProperty("user.home"), "xml/tokenizernum");
715
		//		Runtime.getRuntime().exec("svn", "update", SVNDIR.getAbsolutePath());
716
		//		File inputFile = new File(SVNDIR, "baye1.xml")
717
		//		File outputFile = new File(SVNDIR, "baye1-t.xml")
718
		//		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, "fr")
719
		//		tokenizer.DEBUG = true
720
		//		tokenizer.setNote("note")
721
		//		//tokenizer.setOutSideTextTags("teiHeader")
722
		//		tokenizer.setOutSideTextTagsAndKeepContent("teiHeader")
723
		//		println outputFile.toString() + " : "+tokenizer.process();
724
		
725
		
693 726
		// FIXME: tests UTF-8 XML ranges
694 727
		//		File inputDir = new File("C:/Tools/Textometrie/___corpus/xml/test_bug_import_chinois/")
695 728
		//	File inputFile = new File(inputDir, "problem2.xml");
696

  
729
		
697 730
		/*
698
		File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
699
		File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
700
		File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
701
		File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
702
		File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
703
		outputDir.deleteDir()
704
		outputDir2.deleteDir()
705
		outputDir3.deleteDir()
706
		outputDir.mkdir()
707
		outputDir2.mkdir()
708
		outputDir3.mkdir()
709
		long time;
710
		for (String lang : ["none"]) {
711
			println "LANG: $lang"
712

  
713
			time = System.currentTimeMillis();
714
			File outputFile = new File(outputDir, "${lang}.xml")
715
			SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
716
			tokenizer.process();
717
			println "FAST $lang: "+(System.currentTimeMillis()-time)
718

  
719
//			time = System.currentTimeMillis();
720
//			File outputFile3 = new File(outputDir3, "${lang}.xml")
721
//			FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
722
//			tokenizer3.process();
723
//			println "FAST2 $lang: "+(System.currentTimeMillis()-time)
724
//			
725
//			time = System.currentTimeMillis();
726
//			File outputFile2 = new File(outputDir2, "${lang}.xml")
727
//			OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
728
//			tokenizer2.process();
729
//			println "SLOW $lang: "+(System.currentTimeMillis()-time)
730
		}
731
*/
731
		 File inputDir = new File("/home/mdecorde/xml/bvhepistemon2016")
732
		 File outputDir = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast")
733
		 File outputDir2 = new File("/home/mdecorde/xml/bvhepistemon2016/out-slow")
734
		 File outputDir3 = new File("/home/mdecorde/xml/bvhepistemon2016/out-fast2")
735
		 File inputFile = new File(inputDir, "1542_RabelaisGrgt.xml");
736
		 outputDir.deleteDir()
737
		 outputDir2.deleteDir()
738
		 outputDir3.deleteDir()
739
		 outputDir.mkdir()
740
		 outputDir2.mkdir()
741
		 outputDir3.mkdir()
742
		 long time;
743
		 for (String lang : ["none"]) {
744
		 println "LANG: $lang"
745
		 time = System.currentTimeMillis();
746
		 File outputFile = new File(outputDir, "${lang}.xml")
747
		 SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inputFile, outputFile, lang)
748
		 tokenizer.process();
749
		 println "FAST $lang: "+(System.currentTimeMillis()-time)
750
		 //			time = System.currentTimeMillis();
751
		 //			File outputFile3 = new File(outputDir3, "${lang}.xml")
752
		 //			FastSimpleTokenizerXml tokenizer3 = new FastSimpleTokenizerXml(inputFile, outputFile3, lang)
753
		 //			tokenizer3.process();
754
		 //			println "FAST2 $lang: "+(System.currentTimeMillis()-time)
755
		 //			
756
		 //			time = System.currentTimeMillis();
757
		 //			File outputFile2 = new File(outputDir2, "${lang}.xml")
758
		 //			OldSimpleTokenizerXml tokenizer2 = new OldSimpleTokenizerXml(inputFile, outputFile2, lang)
759
		 //			tokenizer2.process();
760
		 //			println "SLOW $lang: "+(System.currentTimeMillis()-time)
761
		 }
762
		 */
732 763
		println ""
733 764
		println "Done"
734

  
765
		
735 766
		//		String lang = "en"
736 767
		//		File inDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant")
737 768
		//		File outDir = new File("/home/mdecorde/xml/ny911/cycle1/Data_Modif_Pour_Import_XML_Cycle1/xml_files/1xml_par_participant-t")
......
750 781
		//		}
751 782
		//		println ""
752 783
		//		println "Done"
753

  
784
		
785
		String lang = "fr"
786
		File inFile = new File(System.getProperty("user.home"), "xml/vocapia/test.trs")
787
		File outFile = new File(System.getProperty("user.home"), "xml/vocapia/test-retokenized.trs")
788
		
789
		println "processing "+inFile
790
		
791
		SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(inFile, outFile, lang)
792
		tokenizer.setRetokenize(true)
793
		//tokenizer.setDEBUG false
794
		tokenizer.process();
795
		
796
		println "Done"
797
		
754 798
	}
755 799
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/13nov/FixTranscription.groovy (revision 3003)
1
package org.txm.macro.transcription
2

  
3
import javax.xml.stream.*
4

  
5
import org.txm.importer.PersonalNamespaceContext
6
import org.txm.xml.IdentityHook
7
import org.txm.xml.*
8

  
9
import java.io.BufferedOutputStream
10
import java.io.FileOutputStream
11
import java.io.IOException
12
import java.net.URL
13
import java.util.*
14
import java.util.Map.Entry
15
import java.util.regex.Pattern
16

  
17
class FixTranscription extends XMLProcessor {
18
	
19
	LocalNamesHookActivator activator;
20
	IdentityHook hook;
21
	boolean retokenizeWords
22
	
23
	public FixTranscription(File xmlfile, boolean retokenizeWords) {
24
		super(xmlfile)
25
		this.retokenizeWords = retokenizeWords
26
		
27
		activator = new LocalNamesHookActivator<>(hook, ["w", "Turn", "Sync"]);
28
		
29
		hook = new IdentityHook("word_hook", activator, this) {
30
					
31
					boolean inTurn = false;
32
					
33
					boolean inW = false;
34
					StringBuilder wordBuffer = new StringBuilder();
35
					
36
					String currentTime;
37
					LinkedHashMap turnInfos = new LinkedHashMap()
38
					LinkedHashMap wInfos = new LinkedHashMap()
39
					boolean other
40
					
41
					@Override
42
					public boolean deactivate() {
43
						return true;
44
					}
45
					
46
					@Override
47
					public boolean _activate() {
48
						return true;
49
					}
50
					
51
					@Override
52
					protected void processStartElement() throws XMLStreamException, IOException {
53
						if (localname.equals("Turn")) {
54
							// store values
55
							inTurn = true;
56
							turnInfos.clear()
57
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
58
								turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
59
							}
60
							currentTime = turnInfos["startTime"]
61
							super.processStartElement();
62
						} else if (localname.equals("Sync")) {
63
							currentTime = parser.getAttributeValue(null, "time")
64
							super.processStartElement();
65
						} else if (localname.equals("w")) {
66
							// store values
67
							inW = true;
68
							wInfos.clear()
69
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
70
								wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
71
							}
72
							String time = parser.getAttributeValue(null, "time")
73
							if (time != null && time.length() > 0) {
74
								currentTime = time
75
							}
76
							wordBuffer.setLength(0);
77
							return; // write w later
78
						}
79
						else {
80
							super.processStartElement();
81
						}
82
					}
83
					
84
					@Override
85
					protected void processCharacters() throws XMLStreamException {
86
						if (inW) {
87
							wordBuffer.append(parser.getText())
88
						}
89
						else {
90
							super.processCharacters();
91
						}
92
					}
93
					
94
					@Override
95
					protected void processEndElement() throws XMLStreamException {
96
						if (localname.equals("w")) {
97
							
98
							inW = false
99
							String word = wordBuffer.toString().trim()
100
							if (!other && word.startsWith("*")) {
101
								//close current Turn and start a 'other' Turn
102
								writer.writeEndElement() // current Turn
103
								writer.writeCharacters("\n")
104
								
105
								def tmpInfos = new LinkedHashMap()
106
								for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr]
107
								tmpInfos["orig-speaker"] = turnInfos["speaker"]
108
								tmpInfos["speaker"] = "other"
109
								tmpInfos["startTime"] = currentTime
110
								writer.writeStartElement("Turn")
111
								for (String attr : tmpInfos.keySet()) {
112
									writer.writeAttribute(attr, tmpInfos[attr])
113
								}
114
								
115
								other = true
116
								word = word.substring(1)
117
							}
118
							
119
							boolean shouldCloseOtherTurn = false;
120
							if (other && word.endsWith("*")) {
121
								shouldCloseOtherTurn = true;
122
								
123
								word = word.substring(0, word.length()-1)
124
								other = false
125
							}
126
							
127
							// split before re-writing the word
128
							def puncts = []
129
							if (retokenizeWords) {
130
//								while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
131
//									puncts << word.substring(0, 1)
132
//									word = word.substring(1, word.length())
133
//								}
134
								//						//vocapia fix
135
								//						Pattern reg = Pattern.compile("([^']+')(.+)")
136
								//						def m = reg.matcher(word)
137
								//						while (word.length() > 0 && m.matches()) {
138
								//							puncts << m.group(1)
139
								//							word = m.group(2)
140
								//							m = reg.matcher(word)
141
								//						}
142
							}
143
							
144
							for (def punct : puncts) { // pre-retokenize if any
145
								writer.writeStartElement("w")
146
								for (String attr : wInfos.keySet()) {
147
									writer.writeAttribute(attr, wInfos[attr])
148
								}
149
								writer.writeCharacters(punct)
150
								writer.writeEndElement() // w
151
								writer.writeCharacters("\n")
152
							}
153
							
154
							puncts = []
155
							if (retokenizeWords) {
156
//								while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
157
//									puncts << word.substring(word.length()-1, word.length())
158
//									word = word.substring(0, word.length()-1)
159
//								}
160
							}
161

  
162
							if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
163
								writer.writeStartElement("event") // start the initial word
164
								writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
165
								writer.writeAttribute("type", "unknown")
166
								writer.writeAttribute("extent", "instantaneous")
167
								writer.writeEndElement() // event
168
								word = "" // don't write the word
169
							}
170
														
171
							if (word.length() > 0) {
172
								
173
								writer.writeStartElement("w") // start the initial word
174
								for (String attr : wInfos.keySet() ) {
175
									writer.writeAttribute(attr, wInfos[attr])
176
								}
177
								writer.writeCharacters(word)
178
								writer.writeEndElement() // w
179
							}
180
							
181
							for (String punct : puncts) {  // post-retokenize if any
182
								writer.writeStartElement("w")
183
								for (String attr : wInfos.keySet() ) {
184
									writer.writeAttribute(attr, wInfos[attr])
185
								}
186
								writer.writeCharacters(punct)
187
								writer.writeEndElement() // w
188
							}
189
							
190
							if (shouldCloseOtherTurn) {
191
								shouldCloseOtherTurn = false;
192
								//close the current 'other' Turn and restart the actual Turn
193
								writer.writeEndElement() // current 'other' Turn
194
								
195
								writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
196
								turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
197
								for (String attr : turnInfos.keySet()) {
198
									writer.writeAttribute(attr, turnInfos[attr])
199
								}
200
								
201
								other = false
202
							}
203
						} else {
204
							super.processEndElement();
205
						}
206
					}
207
				}
208
	}
209
	
210
	public static void main(String[] args) {
		// Manual test entry point: repair a sample transcription file
		// and print whether processing succeeded.
		File inputFile = new File("/home/mdecorde/xml/vocapia", "test.trs")
		File fixedFile = new File("/home/mdecorde/xml/vocapia", "test-fixed.trs")
		FixTranscription fixer = new FixTranscription(inputFile, true)
		println(fixer.process(fixedFile))
	}
216
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/13nov/FixTranscriptionsMacro.groovy (revision 3003)
1
package org.txm.macro.transcription
// NOTE(review): the file lives under org/txm/macro/projects/13nov/ but declares
// package org.txm.macro.transcription — confirm which location is intended.

import java.time.LocalTime
import java.time.format.DateTimeFormatter

import org.txm.utils.*
import org.txm.utils.logger.*

// Macro: repairs Vocapia/Transcriber XML files with FixTranscription and writes
// the fixed copies into resultDirectory. Input is either a single file
// (trsFile) or a whole directory of .trs files (trsDirectory); the directory,
// when it exists, takes precedence.

@Field @Option(name="trsFile", usage="A single vocapia XML file", widget="FileOpen", required=false, def="")
		File trsFile;

@Field @Option(name="trsDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="")
		File trsDirectory;

@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="")
		File resultDirectory;

if (!ParametersDialog.open(this)) return;

// FIX: all three parameters are optional (required=false), so any of them may
// be null. The original called trsDirectory.equals(...), trsFile.getParentFile()
// and resultDirectory.mkdirs() unconditionally, throwing NullPointerException.
// Groovy's '==' is null-safe equals and '?.' short-circuits on null.
// Also report WHY we abort instead of silently returning false.
if (resultDirectory == null) {
	println "No result directory given"
	return false
}
if (trsDirectory == resultDirectory || trsFile?.getParentFile() == resultDirectory) {
	println "The result directory must be different from the input directory: $resultDirectory"
	return false;
}

resultDirectory.mkdirs();

// Collect the .trs files to process (case-insensitive extension match).
def trsFiles = []
if (trsDirectory != null && trsDirectory.exists()) {
	
	println "Processing TRS directory: $trsDirectory"
	for (File file : trsDirectory.listFiles()) {
		if (file.getName().toLowerCase().endsWith(".trs")) {
			trsFiles << file
		}
	}
} else if (trsFile != null && trsFile.exists()) {
	println "Processing TRS file: $trsFile"
	trsFiles << trsFile
}

if (trsFiles.size() == 0) {
	println "No XML file found for parameters trsFile=$trsFile and trsDirectory=$trsDirectory"
	return false
}

// Process each file, stopping at the first failure.
// FIX: the loop variable used to be named 'trsFile', shadowing the @Field
// parameter of the same name; renamed to 'file'.
ConsoleProgressBar cpb = new ConsoleProgressBar(trsFiles.size())
for (File file : trsFiles) {
	cpb.tick()
	FixTranscription fixer = new FixTranscription(file)
	String name = FileUtils.stripExtension(file)
	File outFile = new File(resultDirectory, name+".trs")
	
	if (!fixer.process(outFile)) {
		println "WARNING: ERROR WHILE PROCESSING: "+file
		return false
	}
}
cpb.done()

println "Done: "+trsFiles.size()+" files processed. Result files in $resultDirectory"
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3003)
36 36
		XMLStreamReader parser = factory.createXMLStreamReader(inputData);
37 37
		
38 38
		boolean flagWord = false
39
		def winfos = [:]
40
		def turninfos = [:]
39
		def winfos = new LinkedHashMap()
40
		def turninfos = new LinkedHashMap()
41 41
		boolean other = false;
42 42
		String word = ""
43 43
		try {
......
89 89
								writer.writeStartElement("Episode")
90 90
							//<Section type="report" startTime="0" endTime="3617.593">
91 91
								writer.writeStartElement("Section")
92
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
93
									writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
94
								}
92 95
								break;
93 96
							
94 97
							case "SpeechSegment": // <SpeechSegment ch="1" sconf="1.00" stime="9.94" etime="43.81" spkid="Enquêtrice" lang="fre" lconf="1.00" trs="1">
95 98
								writer.writeStartElement("Turn")
96
								
97
								turninfos = ["speaker":parser.getAttributeValue(null, "spkid"),
98
									"startTime":parser.getAttributeValue(null, "stime"),
99
									"endTime":parser.getAttributeValue(null, "etime"),
100
								]
101
								for (String attr : turninfos.keySet()) {
102
									writer.writeAttribute(attr, turninfos[attr])
99
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
100
									String name = parser.getAttributeLocalName(i)
101
									if (name == "stime") name = "startTime"
102
									else if (name == "etime") name = "endTime"
103
									else if (name == "spkid") name = "speaker"
104
									
105
									writer.writeAttribute(name, parser.getAttributeValue(i))
103 106
								}
104
								
107
							
105 108
								writer.writeCharacters("\n")
106 109
								writer.writeStartElement("Sync")
107 110
								writer.writeAttribute("time", parser.getAttributeValue(null, "stime"))
......
114 117
							
115 118
							// store w infos in case the word must be splited
116 119
								def endValue = String.format(Locale.US, "%.2f", (Double.parseDouble(parser.getAttributeValue(null, "stime")) + Double.parseDouble(parser.getAttributeValue(null, "dur"))))
117
								winfos = ["time":parser.getAttributeValue(null, "stime"), "start": parser.getAttributeValue(null, "stime"), "end":endValue]
120
								def startValue = parser.getAttributeValue(null, "stime");
121
								winfos.clear()
122
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
123
									String name = parser.getAttributeLocalName(i)
124
									if (name == "stime") continue
125
									
126
										winfos[name] = parser.getAttributeValue(i)
127
								}
128
								winfos["time"] = startValue
129
								winfos["start"] = startValue
130
								winfos["end"] = endValue
131
							
132
							// write <w> at the end of <Word> tag
118 133
								break
119 134
						}
120 135
						break;
......
154 169
							case "Word":
155 170
								flagWord = false
156 171
								word = word.trim()
157
								if (word.startsWith("*")) {
158
									//close current Turn and start a 'other' Turn
159
									writer.writeEndElement() // current Turn
160
									writer.writeStartElement("Turn")
161
									writer.writeAttribute("speaker", "other")
162
									writer.writeAttribute("startTime", winfos["time"])
163
									writer.writeAttribute("orig-speaker", turninfos["speaker"])
164
									writer.writeCharacters("\n")
165
									other = true
166
									word = word.substring(1)
167
								}
168 172
							
169
								String otherAttributeValue = Boolean.toString(other) // set now
170
							
171
								if (other && word.endsWith("*")) {
172
									
173
									//close the current 'other' Turn and restart the actual Turn
174
									writer.writeEndElement() // current 'other' Turn
175
									
176
									writer.writeStartElement("Turn") // rebuild the orig Turn with its infos
177
									turninfos["startTime"] = winfos["end"] // fix the startTime using the current word end time
178
									for (String attr : turninfos.keySet()) {
179
										writer.writeAttribute(attr, turninfos[attr])
180
									}
181
									writer.writeCharacters("\n")
182
									
183
									word = word.substring(0, word.length()-1)
184
									other = false
185
								}
186
							
187 173
							// split before the word
188 174
								def puncts = []
189 175
								if (retokenizeWords) {
176
									
190 177
									while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
191
										puncts << word.substring(0, 1)
192
										word = word.substring(1, word.length())
178
										puncts << word.substring(0 ,1)
179
										word = word.substring(1)
193 180
									}
194 181
									
182
									// fix "d'abord" like words
195 183
									Pattern reg = Pattern.compile("([^']+')(.+)")
196 184
									def m = reg.matcher(word)
197 185
									while (word.length() > 0 && m.matches()) {
......
200 188
										m = reg.matcher(word)
201 189
									}
202 190
								}
203
								
191
							
204 192
								for (def punct : puncts) { // pre-retokenize if any
205 193
									writer.writeStartElement("w")
206 194
									for (String attr : winfos.keySet()) {
......
214 202
								puncts = []
215 203
								if (retokenizeWords) {
216 204
									while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
217
										puncts << word.substring(word.length()-1, word.length())
205
										puncts.add(0, word.substring(word.length()-1, word.length()))
218 206
										word = word.substring(0, word.length()-1)
219 207
									}
220 208
								}
221 209
							
222
								writer.writeStartElement("w") // start the initial word
223
								writer.writeAttribute("time", winfos["time"])
224
								writer.writeAttribute("start", winfos["start"])
225
								writer.writeAttribute("end", winfos["end"])
226
								writer.writeAttribute("other", otherAttributeValue)
210
								if (word.length() > 0) {
211
									writer.writeStartElement("w") // start the initial word
212
									for (String attr : winfos.keySet()) {
213
										writer.writeAttribute(attr, winfos[attr])
214
									}
215
									writer.writeCharacters(word)
216
									writer.writeEndElement() // w
217
									writer.writeCharacters("\n")
218
								}
227 219
							
228
								writer.writeCharacters(word)
229
								writer.writeEndElement() // w
230
								writer.writeCharacters("\n")
231
							
232
								for (String punct : puncts) {  // post-retokenize if any
220
								for (String punct : puncts) {  // write post-retokenize if any
233 221
									writer.writeStartElement("w")
234
									writer.writeAttribute("time", winfos["time"])
235
									writer.writeAttribute("start", winfos["end"])
236
									writer.writeAttribute("end", winfos["end"])
222
									for (String attr : winfos.keySet()) {
223
										writer.writeAttribute(attr, winfos[attr])
224
									}
237 225
									writer.writeCharacters(punct)
238 226
									writer.writeEndElement() // w
239 227
									writer.writeCharacters("\n")
......
269 257
		File infile = new File("/home/mdecorde/xml/vocapia","test.xml")
270 258
		File outfile = new File("/home/mdecorde/xml/vocapia","test.trs")
271 259
		def processor = new Vocapia2Transcriber(infile)
272
		println processor.process(outfile)
260
		println processor.process(outfile, true)
273 261
	}
274 262
}

Formats disponibles : Unified diff