/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3545

     	/**
     	 * Prepares the CoNLL-U sources, converts them to XML-TEI and then runs the parent import step.
     	 *
     	 * When the IMPORT_USE_NEW_DOC_ID preference is enabled, the input files are first split
     	 * into a sibling "conllu" directory by splitCoNLLUFiles; the import stops if that split fails.
     	 * The (possibly split) files are then converted into a "conllu2tei" directory, which replaces
     	 * inputDirectory before super.process() is called.
     	 */
     	@Override
     	public void process() {
     		File connluSrcDirectory = inputDirectory
     		// Parse the preference explicitly: assigning the String straight to a boolean applies
     		// Groovy truth, under which any non-empty value (including "false") is true.
     		// NOTE(review): assumes the preference is stored as "true"/"false" - confirm against UDPreferences.
     		// THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE //
     		boolean usenewdocid = Boolean.parseBoolean(UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID))
     		if (usenewdocid) {
     			connluSrcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu")
     			// start from an empty directory: the split step opens its output files in append mode
     			connluSrcDirectory.deleteDir();
     			connluSrcDirectory.mkdirs();
     			println "Convert CoNLL-U to XML-TEI..."
     			if (!splitCoNLLUFiles(inputDirectory, connluSrcDirectory, project)) {
     				return;
     			}
     		}
     		File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
     		srcDirectory.deleteDir();
     		srcDirectory.mkdirs();
     		println "Convert CoNLL-U to XML-TEI..."
     		// Only the renamed converter is called; the former convert(inputDirectory, ...) call is
     		// superseded by it and would bypass the split directory.
     		convertCoNLLU2TEI(connluSrcDirectory, srcDirectory, project)
     		inputDirectory = srcDirectory // switch source directory
     		super.process();
     	}
     	public static def convert(File inputDirectory, File srcDirectory, def project) {
     	/**
     	 * Splits the ".conllu" files of inputDirectory into one file per "# newdoc id = " value.
     	 *
     	 * Each line of a source file is appended to the file named after the current document id
     	 * (id + ".conllu") in srcDirectory. Until the first "# newdoc id = " line is met, the source
     	 * file name is used as the document id. Files not ending with ".conllu" are ignored.
     	 *
     	 * @param inputDirectory the directory containing the CoNLL-U files to split
     	 * @param srcDirectory the directory receiving the split files
     	 * @param project not used by this method
     	 * @return false if inputDirectory can not be listed, true otherwise
     	 */
     	public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) {
     		def files = inputDirectory.listFiles()
     		if (files == null) {
     			println "Aborting. No CONLL file found in $inputDirectory."
     			return false
     		}
     		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     		println "Splitting CoNLL-U files..."
     		for (File master : files) {
     			cpb_texts.tick()
     			if (!master.getName().endsWith(".conllu")) {
     				continue;
     			}
     			String current_text_id = master.getName()
     			File conlluFile = new File(srcDirectory, current_text_id+".conllu")
     			// append mode: several source files may contribute to the same document id
     			def writer = conlluFile.newWriter("UTF-8", true)
     			try {
     				master.eachLine("UTF-8") { line ->
     					if (line.startsWith("# newdoc id = ")) {
     						String text_id = line.substring("# newdoc id = ".length())
     						if (!text_id.equals(current_text_id)) {
     							// a new document starts: switch the output file
     							writer.close()
     							current_text_id = text_id
     							conlluFile = new File(srcDirectory, current_text_id+".conllu")
     							writer = conlluFile.newWriter("UTF-8", true)
     						}
     					}
     					writer.println(line)
     				}
     			} finally {
     				// close the last opened writer so that its buffered lines are flushed to disk
     				writer.close()
     			}
     		}
     		cpb_texts.done()
     		return true
     	}
     	public static def convertCoNLLU2TEI(File inputDirectory, File srcDirectory, def project) {
     		def files = inputDirectory.listFiles()
     		if (files == null) {
-...
     			return false
+    		}
     		def content = new LinkedHashMap(); // /text/par/sent
     		def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES)
     		String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX));
-...
     		boolean keepContractions =  UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS)
     		boolean usenewdocid =  false; // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE // UDPreferences.getInstance().getString(UDPreferences.IMPORT_USE_NEW_DOC_ID)
     		def headPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT).split(",") as Set
     		def depsPropertiesToProject = UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT).split(",") as Set
     		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     		def docToMaster = [:]
     		println "Parsing CoNLL-U files..."
     		for (File master : files) {
-...
     				continue;
+    			}
     			def content = []; // list of sentence
     			String text_id = master.getName();
     			String sent_id = "";
     			String par_id = "1";
     			def comments = []; // /text/par/sent
     			def words = []
     			master.eachLine("UTF-8") { line ->
     				if (usenewdocid && line.startsWith("# newdoc id = ")) {
     					text_id = line.substring("# newdoc id = ".length())
     					if (!docToMaster.containsKey(text_id)) docToMaster[text_id] = new HashSet();
     					docToMaster[text_id].add(master.getName())
     				if (line.startsWith("# newdoc id = ")) {
     					// already set or ignored
     				} else if (line.startsWith("# sent_id = ")) {
     					sent_id = line.substring("# sent_id = ".length())
     				} else if (line.startsWith("# newpar id = ")) {
     					par_id = line.substring("# newpar id = ".length())
     				} else if (line.startsWith("#")) {
     					//sent_id = line.substring("# sent_id = ".length())
     					comments << line
     				} else if (line.trim().isEmpty()) {
     					if (words.size() > 0) {
     						def sentence = [par_id, sent_id, words, comments]
     						content.add(sentence)
+    					}
     				} else {
     					if (text_id != null && sent_id != null) {
     						if (!content.containsKey(text_id)) {
     							content[text_id] = new LinkedHashMap()
     					HashMap<String, String> wProperties = new HashMap<String, String>()
     					def split = line.split("\t")
     					if (split.size() == properties.size()) {
     						String id = split[0]
     						for (int i = 0 ; i < split.size() ; i++) {
     							wProperties[properties[i]] = split[i]
+    						}
     						def text = content[text_id]
     						if (!text.containsKey(par_id)) {
     							text[par_id] = new LinkedHashMap()
+    						}
     						if (!text[par_id].containsKey(sent_id)) {
     							LinkedHashMap<String, HashMap<String, String>> sentenceProperties = new LinkedHashMap<String, HashMap<String, String>>();
     							text[par_id][sent_id] = sentenceProperties
+    						}
     						HashMap<String, String> wProperties = new HashMap<String, String>()
     						def split = line.split("\t")
     						if (split.size() == properties.size()) {
     							String id = split[0]
     							for (int i = 0 ; i < split.size() ; i++) {
     								wProperties[properties[i]] = split[i]
     						if (wProperties.get("id").equals("1")) { // it's a new sentence, store the current if any and starts a new sentence
     							if (words.size() > 0) {
     								def sentence = [par_id, sent_id, words, comments]
     								content.add(sentence)
+    							}
     							text[par_id][sent_id][id] = wProperties
     							sent_id = "";
     							par_id = "1";
     							comments = [];
     							words = []
+    						}
     						words << wProperties
     					} else {
     						//println "Warning: not a line: "+line
+    					}
+    				}
+    			}
+    		}
     		cpb_texts.done()
     		println "Writing XML files..."
     		cpb_texts = new ConsoleProgressBar(content.size())
     		for (def text_id2 : content.keySet()) {
     			cpb_texts.tick()
     			if (content.size() == 0) {
     				continue;
+    			}
     			File xmlFile = new File(srcDirectory, text_id2+".xml")
     			File xmlFile = new File(srcDirectory, text_id+".xml")
     			// println "xmlFile=$xmlFile"
     			BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
     			XMLOutputFactory factory = XMLOutputFactory.newInstance()
-...
     			writer.writeEndElement()
     			writer.writeCharacters("\n")
     			writer.writeStartElement ("text")
     			if (docToMaster.containsKey(text_id2)) {
     				writer.writeAttribute ("filename", docToMaster[text_id2].join(", "))
+    			}
     			writer.writeCharacters("\n")
     			def text = content[text_id2]
     			for (def par_id2 : text.keySet()) { // for all paragraph of the current text
     			String current_par_id = null
     			for (def sentence : content) { // for all paragraph of the current text
     				writer.writeStartElement ("p");
     				writer.writeAttribute("id", par_id2)
     				par_id = sentence[0]
     				sent_id = sentence[1]
     				words = sentence[2]
     				comments = sentence[3]
     				if (current_par_id == null || par_id != current_par_id) {
     					if (current_par_id != null) {
     						writer.writeEndElement() // p
+    					}
     					writer.writeStartElement ("p");
     					writer.writeAttribute("id", par_id)
     					writer.writeCharacters("\n")
     					current_par_id = par_id
+    				}
     				writer.writeStartElement ("s")
     				writer.writeAttribute("id", sent_id)
     				writer.writeCharacters("\n")
     				for (def sent_id2 : text[par_id2].keySet()) { // for all sentence of the current paragraph
     					writer.writeStartElement ("s")
     					writer.writeAttribute("id", sent_id2)
     				for (def comment : comments) {
     					writer.writeComment(comment)
     					writer.writeCharacters("\n")
     					def sentence = text[par_id2][sent_id2]
     					if (!keepContractions) { // merge properties in the "-" word and remove the parts
     						for (String wordid : sentence.keySet()) {
     							def word = sentence[wordid]
     							if (word == null) continue;
+    				}
     				if (!keepContractions) { // merge properties in the "-" word and remove the parts
     					for (int i = 0 ; i < words.size() ; i++) {
     						def word = words[i]
     						String id = word[0]
     						if (id.contains("-")) { // multi-word line
     							int index = id.indexOf("-")
     							String id1 = id.substring(0, index)
     							String id2 = id.substring(index+1)
     							def token1 = sentence[id1]
     							def token2 = sentence[id2]
     							String id = word[0]
     							if (token1 == null || token2 == null) {
     								println "Error: text $text_id paragraph $par_id sent $sent_id word $id has wrong token ids $id1 and $id2 -> $token1 and $token2"
     								continue
+    							}
     							if (id.contains("-")) { // multi-word line
     								int index = id.indexOf("-")
     								String id1 = id.substring(0, index)
     								String id2 = id.substring(index+1)
     								def token1 = sentence[id1]
     								def token2 = sentence[id2]
     								if (token1 == null || token2 == null) {
     									println "Error: text $text_id2 paragraph $par_id2 sent $sent_id2 word $id has wrong token ids $id1 and $id2 -> $token1 and $token2"
     									continue
+    								}
     								for (String p : properties) {
     									if (p == "id") continue // don't merge the form property
     										if (p == "form") continue // don't merge the form property
     										word[p] =  token1[p] + "+" + token2[p]
+    								}
     								sentence.remove(id1) // remove the token
     								sentence.remove(id2) // remove the token
     							for (String p : properties) {
     								if (p == "id") continue // don't merge the form property
     									if (p == "form") continue // don't merge the form property
     									word[p] =  token1[p] + "+" + token2[p]
+    							}
     							words.remove(i+1) // remove the token
     							words.remove(i+1) // remove the token
+    						}
+    					}
+    				}
     				if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
     					LinkedHashMap sentencehash = new LinkedHashMap()
     					//println "WORDS="+words
     					for (def word : words) {
     						sentencehash[word["id"]] = word
+    					}
     					//println "SENTENCE="+sentencehash
     					ImportCoNLLUAnnotations.buildPropertiesProjections(sentencehash, headPropertiesToProject, depsPropertiesToProject)
+    				}
     				for (def word : words) {
     					if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
     						ImportCoNLLUAnnotations.buildPropertiesProjections(sentence, headPropertiesToProject, depsPropertiesToProject)
     					def id = word["id"]
     					writer.writeStartElement ("w")
     					for (String p : word.keySet()) {
     						if (p == "feats") word[p] = "|"+word[p]+"|"
     						//println "WORD="+word
     						writer.writeAttribute(prefix+p, word[p])
+    					}
     					for (String wordid : sentence.keySet()) {
     						def word = sentence[wordid]
     						def id = word["id"]
     						writer.writeStartElement ("w")
     						for (String p : word.keySet()) {
     							if (p == "feats") word[p] = "|"+word[p]+"|"
     							//println "WORD="+word
     							writer.writeAttribute(prefix+p, word[p])
+    						}
     						writer.writeCharacters(word["form"])
     						writer.writeEndElement() // w
     						writer.writeCharacters(" ")
+    					}
     					writer.writeCharacters("\n")
     					writer.writeEndElement() // s
     					writer.writeCharacters(word["form"])
     					writer.writeEndElement() // w
     					writer.writeCharacters(" ")
+    				}
     				writer.writeCharacters("\n")
     				writer.writeEndElement() // s
+    			}
     			if (current_par_id != null) {
     				writer.writeEndElement() // p
     				writer.writeCharacters("\n")
+    			}
     			writer.writeEndElement() // text
-...
     		return true
+    	}
+    }

TXM/trunk/org.txm.connlu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3545)
48	48	// build the TIGER-XML file
49	49	CallUD2TigerPerlScript cutps = new CallUD2TigerPerlScript();
50	50
	51	boolean buildtigerindexes = UDPreferences.getInstance().getString(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES); // THE conllu -> Tiger XSL MUST HAVE THE SAME BEHAVIOR BEFORE //
	52	if (!buildtigerindexes) {
	53	println "Skipping TIGER conversion step."
	54	return;
	55	}
	56
51	57	if (cutps.canBuildTSFiles()) {
52	58
53	59	println "Converting CoNLL-U files to TIGER-XML files..."
...	...
108	114	TIGERSearchEngine.buildTIGERCorpus(tigerXMLDirectory, this.binaryDirectory, corpusName);
109	115	}
110	116	} else {
111		println "Skipping TIGER conversion step."
	117	println "Can not do the TIGER indexes step."
112	118	}
113	119	}
114	120	}

TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 3545)
662	662
663	663	String interpvalue = null;
664	664	def tooltipProperties = pager.project.getEditionDefinition("default").get(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, "*");
665		println tooltipProperties
666	665	if (tooltipProperties.equals("*")) {
667	666	interpvalue = "- "+anaValues.entrySet().join("\n- ")+"\n- "+wordid
668	667	} else {

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3545