/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

     		try {
     			PrintWriter pw = new PrintWriter(file);
     			SimpleParse p = SimpleParse.fromConll(conll);
     			// Fixing multiwords 6th position if equals to "_"
     			ArrayList<String> conll2 = new ArrayList<>();
     			for (int i = 0 ; i < conll.size() ; i++) {
     				String l = conll.get(i);
     				String split[] = l.split("\t");
     //				if (split[6].equals("_") && split[0].contains("-")) {
     //					if (conll.size() > i + 1) {
     //						String splitNext[] = conll.get(i+1).split("\t");
     //						if (split[0].startsWith(splitNext[0]+ "-")) {
     //							split[6] = splitNext[6];
     //						}
     //					}
     //				}
     //
     				if (split[6].equals("_")) { // 6th col is broken and we don't have the tokens of the multiwords
     					//split[6] = "0";
     					continue;
+    				}
     				conll2.add(l);
     //				l = "";
     //				for (String s : split) l += s+"\t";
     //
     //				conll2.add(l.substring(0, l.length() - 1));
+    			}
     			for (String l : conll2) System.out.println(l);
     			SimpleParse p = SimpleParse.fromConll(conll2);
     			List<SimpleWord> words = p.getWords();
     			List<String> labels = p.getVerticesLabels().get("SYN");
-...
     	public static void main(String[] args) throws IOException {
     		File file = File.createTempFile("txm", ".svg", new File("/home/mdecorde"));
     		File file = new File("/home/mdecorde/deptreeviz.svg");
     		List<String> conll = Arrays.asList(
     				"1	Ce	ce	PRON	_	Number=Sing|Person=3|PronType=Dem	2	nsubj	_	_",
     				"2	doit	devoir	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_",
     				"3	être	être	AUX	_	VerbForm=Inf	5	cop	_	_",
     				"4	un	un	DET	_	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	5	det	_	_",
     				"5	général	général	NOUN	_	Gender=Masc|Number=Sing	2	xcomp	_	_",
     				"6	étranger	étranger	ADJ	_	Gender=Masc|Number=Sing	5	amod	_	_",
     				"7	.	.	PUNCT	_	_	2	punct	_	_");
     		String all = """
 	[Pour]	pour	ADP	_	_	3	case	_	start_char=4|end_char=8|ner=O|XmlId=w_1959_1
 	la	le	DET	_	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	3	det	_	start_char=9|end_char=11|ner=O|XmlId=w_1959_3
 	métropole	métropole	NOUN	_	Gender=Fem|Number=Sing	15	obl:mod	_	start_char=12|end_char=21|ner=O|XmlId=w_1959_4
 	française	français	ADJ	_	Gender=Fem|Number=Sing	3	amod	_	start_char=22|end_char=31|ner=O|XmlId=w_1959_5
 	,	,	PUNCT	_	_	3	punct	_	start_char=31|end_char=32|ner=O|XmlId=w_1959_6
 	pour	pour	ADP	_	_	8	case	_	start_char=33|end_char=37|ner=O|XmlId=w_1959_7
 	l'	le	DET	_	Definite=Def|Number=Sing|PronType=Art	8	det	_	start_char=38|end_char=40|ner=O|XmlId=w_1959_8
 	Algérie	Algérie	PROPN	_	Number=Sing	15	obl:mod	_	start_char=40|end_char=47|ner=S-LOC|XmlId=w_1959_9
 	,	,	PUNCT	_	_	8	punct	_	start_char=47|end_char=48|ner=O|XmlId=w_1959_10
 	pour	pour	ADP	_	_	12	case	_	start_char=49|end_char=53|ner=O|XmlId=w_1959_11
 	la	le	DET	_	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	12	det	_	start_char=54|end_char=56|ner=O|XmlId=w_1959_12
 	communauté	communauté	NOUN	_	Gender=Fem|Number=Sing	15	obl:mod	_	start_char=57|end_char=67|ner=O|XmlId=w_1959_13
 	,	,	PUNCT	_	_	12	punct	_	start_char=67|end_char=68|ner=O|XmlId=w_1959_14
 	je	il	PRON	_	Number=Sing|Person=1|PronType=Prs	15	nsubj	_	start_char=69|end_char=71|ner=O|XmlId=w_1959_15
 	forme	former	VERB	_	Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin	0	root	_	start_char=72|end_char=77|ner=O|XmlId=w_1959_16
 	des	un	DET	_	Definite=Ind|Number=Plur|PronType=Art	17	det	_	start_char=78|end_char=81|ner=O|XmlId=w_1959_17
 	voeux	vœu	NOUN	_	Gender=Masc|Number=Plur	15	obj	_	start_char=83|end_char=88|ner=O|XmlId=w_1959_18
 	ardents	ardent	ADJ	_	Gender=Masc|Number=Plur	17	amod	_	start_char=90|end_char=97|ner=O|XmlId=w_1959_19
 	et	et	CCONJ	_	_	20	cc	_	start_char=98|end_char=100|ner=O|XmlId=w_1959_20
 	confiants	confiant	ADJ	_	Gender=Masc|Number=Plur	18	conj	_	start_char=101|end_char=110|ner=O|XmlId=w_1959_21
 -22	au	_	_	_	_	_	_	_	start_char=111|end_char=113|ner=O|XmlId=w_1959_22
 	à	à	ADP	_	_	24	case	_	XmlId=w_1959_23
 	le	le	DET	_	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	24	det	_	XmlId=w_1959_24
 	premier	premier	ADJ	_	Gender=Masc|Number=Sing	24	amod	_	start_char=114|end_char=121|ner=O|XmlId=w_1959_25
 	jour	jour	NOUN	_	Gender=Masc|Number=Sing	15	obl:mod	_	start_char=122|end_char=126|ner=O|XmlId=w_1959_26
 	de	de	ADP	_	_	26	case	_	start_char=127|end_char=129|ner=O|XmlId=w_1959_27
 	1960	1960	NUM	_	Number=Plur	24	nmod	_	start_char=130|end_char=134|ner=O|XmlId=w_1959_28
 	.	.	PUNCT	_	_	15	punct	_	start_char=134|end_char=135|ner=O|XmlId=w_1959_29
     """;
     		List<String> conll = Arrays.asList(all.split("\n"));
     		System.out.println("FILE: " + print(file, conll, null, null));
+    	}

     		super(module)
+    	}
     	/**
     	 * Merges two CoNLL-U field values into a single value.
     	 *
     	 * A value that is null, empty or "_" counts as "no value":
     	 * - both without value: returns "_"
     	 * - only one with a value: returns that value
     	 * - both equal: returns the value once
     	 * - otherwise: returns the two values joined with "|"
     	 *
     	 * @param s1 the first value
     	 * @param s2 the second value
     	 * @return the merged value, never null
     	 */
     	public final String merge (String s1, String s2) {
     		boolean blank1 = (s1 == null || s1 == "" || s1 == "_")
     		boolean blank2 = (s2 == null || s2 == "" || s2 == "_")

     		// must be tested before the equality shortcut, otherwise two empty values yield "" instead of "_"
     		if (blank1 && blank2) return "_"
     		if (blank1) return s2
     		if (blank2) return s1
     		if (s1 == s2) return s1 // Groovy == compares String content

     		return s1+"|"+s2
     	}
     	@Override
     	public void process() {
-...
     				return
+    			}
+    		}
     		def files = conlluSrcDirectory.listFiles()
     		files.sort()
     		// Keep or not contractions
     		boolean keepContractions =  "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getString(UDPreferences.KEEP_MULTIWORD_TOKENS)));
     			println "Remove multiword tokens (keepContractions=$keepContractions)  & add XmlId if necessary"
     			ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     			for (File conlluFile : files) {
     				cpb_texts.tick()
     				if (conlluFile.getName().endsWith(".conllu")) {
     					String textid = FileUtils.stripExtension(conlluFile)
     					int wcounter = 1;
     					ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
     					for (int i = 0 ; i < lines.size() ; i++) {
     						String line = lines[i]
     						if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
     						def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     						if (split[-1] != null && !split[-1].contains("XmlId=")) {
     							if (split[-1] == "_") {
     								split[-1] = "XmlId=w_"+textid+"_"+(wcounter++)
     							} else {
     								split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
+    							}
+    						}
     						if (!keepContractions && split[0] != null && split[0].contains("-")) {
     							println "REMOVE"
     							println "\t"+lines[i]
     							println "\t"+lines[i+1]
     							println "\t"+lines[i+2]
     							def split1 = lines[i+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     							def split2 = lines[i+2].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     							if (split[2] == "_") split[2] = merge(split1[2],split2[2])
     							if (split[3] == "_") split[3] = merge(split1[3], split2[3])
     							if (split[4] == "_") split[4] = merge(split1[4], split2[4])
     							if (split[5] == "_") split[5] = merge(split1[5], split2[5])
     							if (split[6] == "_") split[6] = split1[6]
     							if (split[7] == "_") split[7] = merge(split1[7], split2[7])
     							if (split[8] == "_") split[8] = merge(split1[8], split2[8])
     							lines.remove(i+1)
     							lines.remove(i+1)
+    						}
     						lines[i] = split.join("\t")
+    					}
     					IOUtils.write(conlluFile, lines.join("\n") + "\n")
+    				}
+    			}
     			cpb_texts.done()
     //		// Fix missing XmlId in conllu files
     //		println "Setting word XmlID if necessary"
     //		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     //		for (File conlluFile : files) {
     //			cpb_texts.tick()
     //			if (conlluFile.getName().endsWith(".conllu")) {
     //				String textid = FileUtils.stripExtension(conlluFile)
     //				int wcounter = 1;
     //				ArrayList<String> lines = IOUtils.getLines(conlluFile, "UTF-8");
     //				for (int i = 0 ; i < lines.size() ; i++) {
     //					String line = lines[i]
     //					if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
     //
     //					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     //					if (split[-1] != null && !split[-1].contains("XmlId=")) {
     //						split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
     //						lines[i] = split.join("\t")
     //					}
     //
     //				}
     //				IOUtils.write(conlluFile, lines.join("\n") + "\n")
     //			}
     //		}
     //		cpb_texts.done()
     		File metadataFile = Metadatas.findMetadataFile(module.sourceDirectory)
     		File srcDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu2tei")
     		srcDirectory.deleteDir()
-...
     			println "Aborting. No CONLL file found in $inputDirectory."
     			return false
+    		}
     		files.sort()
     		ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     		println "Splitting CoNLL-U files..."
-...
     			println "Aborting. No CONLL file found in $inputDirectory."
     			return false
+    		}
     		files.sort()
     		def properties = Arrays.asList(ImportCoNLLUAnnotations.UD_PROPERTY_NAMES)
     		String prefix = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX));
     		UDPreferences.getInstance().setProjectPreferenceValue(project, UDPreferences.UDPREFIX, prefix); // copy the current preference into the corpus preference
     		boolean keepContractions =  "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_CONTRACTIONS, UDPreferences.getInstance().getString(UDPreferences.KEEP_CONTRACTIONS)));
     		def headPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT)).split(",") as Set
     		def depsPropertiesToProject = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT)).split(",") as Set
-...
     				} else {
     					HashMap<String, String> wProperties = new HashMap<String, String>()
     					LinkedHashMap<String, String> wProperties = new LinkedHashMap<String, String>()
     					def split = line.split("\t")
     					def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length)
     					if (split.size() == properties.size()) {
     						String id = split[0]
     						for (int i = 0 ; i < split.size() ; i++) {
     							wProperties[properties[i]] = split[i]
+    						}
     						if (wProperties.get("id").equals("1")) { // it's a new sentence, store the current if any and starts a new sentence
     							if (words.size() > 0) {
     								def sentence = [par_id, sent_id, words, comments]
     								content.add(sentence)
     								sent_id = ""
     								par_id = "1"
     								comments = []
     								words = []
+    							}
     //						if (wProperties.get("id").equals("1") || wProperties.get("id").startsWith("1-")) { // it's a new sentence, store the current if any and starts a new sentence
     //							if (words.size() > 0) {
     //								def sentence = [par_id, sent_id, words, comments]
     //								content.add(sentence)
     //
     //								sent_id = ""
     //								par_id = "1"
     //								comments = []
     //								words = []
     //							}
     //						}
     						if (wProperties[properties[0]].contains(".")) { // id
     							// empty node
     						} else {
     							words << wProperties
+    						}
     						words << wProperties
     					} else {
     						//println "Warning: not a line: "+line
+    					}
+    				}
+    			}
     			if (words.size() > 0) { // last sentence ?
     				def sentence = [par_id, sent_id, words, comments]
     				content.add(sentence)
+    			}
     			if (content.size() == 0) {
     				continue;
+    			}
     			//println "${content.size()} sentences found."
     			File xmlFile = new File(srcDirectory, text_id+".xml")
     			// println "xmlFile=$xmlFile"
     			BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(xmlFile))
-...
     					writer.writeCharacters("\n")
+    				}
     				if (!keepContractions) { // merge properties in the "-" word and remove the parts
     					for (int i = 0 ; i < words.size() ; i++) {
     						def word = words[i]
     						String id = word[0]
     						if (id.contains("-")) { // multi-word line
     							int index = id.indexOf("-")
     							String id1 = id.substring(0, index)
     							String id2 = id.substring(index+1)
     							def token1 = sentence[id1]
     							def token2 = sentence[id2]
     							if (token1 == null || token2 == null) {
     								println "Error: text $text_id paragraph $par_id sent $sent_id word $id has wrong token ids $id1 and $id2 -> $token1 and $token2"
     								continue
+    							}
     							for (String p : properties) {
     								if (p == "id") continue // don't merge the form property
     									if (p == "form") continue // don't merge the form property
     									word[p] =  token1[p] + "+" + token2[p]
+    							}
     							words.remove(i+1) // remove the token
     							words.remove(i+1) // remove the token
+    						}
+    					}
+    				}
     				if (headPropertiesToProject.size() > 0 || depsPropertiesToProject.size() > 0) {
     					LinkedHashMap sentencehash = new LinkedHashMap()
     					//println "WORDS="+words

     				File tigerCorpusExistingDirectory = new File(this.binaryDirectory, "tiger/"+corpusName);
     				MainCorpus corpus = this.project.getFirstChild(MainCorpus.class);
     				corpus.compute(false); // load  the corpus in CQP
     				TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory)
     				TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory, kee)
+    			}
     		} else {
     			println "Can not do the TIGER indexes step."

     	/** Preference key: prefix of the UD properties. */
     	public static String UDPREFIX = "ud_prefix";
     	/**
     	 * Preference key: keep word contractions when importing CoNLL-U files.
     	 * NOTE(review): KEEP_MULTIWORD_TOKENS looks like the newer name of the same option - confirm whether this key is still needed.
     	 */
     	public static String KEEP_CONTRACTIONS = "import_keep_contractions";
     	/**
     	 * Preference key: keep multiword tokens when importing CoNLL-U files.
     	 * NOTE(review): the stored key mixes lower and upper case unlike the other keys; do not normalize it without migrating the saved preferences.
     	 */
     	public static String KEEP_MULTIWORD_TOKENS = "import_KEEP_MULTIWORD_TOKENS";
     	/** Preference key: use the new doc id when importing CoNLL-U files (presumably the CoNLL-U "newdoc id" comment - TODO confirm). */
     	public static String IMPORT_USE_NEW_DOC_ID = "import_use_new_doc_id";
-...
     		preferences.put(UDPREFIX, "ud-");
     		preferences.put(IMPORT_HEAD_TO_PROJECT, "upos,deprel");
     		preferences.put(IMPORT_DEPS_TO_PROJECT, "upos,deprel");
     		preferences.putBoolean(KEEP_CONTRACTIONS, true);
     		preferences.putBoolean(KEEP_MULTIWORD_TOKENS, false);
     		preferences.putBoolean(IMPORT_USE_NEW_DOC_ID, true);
     		preferences.putBoolean(IMPORT_BUILD_TIGERSEARCH_INDEXES, true);
+    	}

     					if (buffer.length() > 0) buffer.append("\t");
     					if (prop.equals("form") && startOfSubMatch) buffer.append("[");
     					buffer.append(values.get(prefix+prop)[p]);
     					String v = values.get(prefix+prop)[p];
     					if (v.startsWith("|") && v.endsWith("|")) v = v.substring(1, v.length() - 1); // remove | for CQP multi value
     					buffer.append(v);
     					if (prop.equals("form") && endOfSubMatch) buffer.append("]");
+    				}

     import org.txm.conllu.core.function.UDTreeSearch;
     import org.txm.conllu.core.preferences.UDPreferences;
     import org.txm.core.engines.EngineType;
     import org.txm.core.preferences.TBXPreferences;
     import org.txm.core.preferences.TXMPreferences;
     import org.txm.core.results.TXMResult;
     import org.txm.objects.CorpusBuild;
-...
     import org.txm.searchengine.core.SearchEngine;
     import org.txm.searchengine.core.SearchEngineProperty;
     import org.txm.searchengine.core.Selection;
     import org.txm.searchengine.cqp.CQPPreferences;
     import org.txm.searchengine.cqp.CQPSearchEngine;
     import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
     import org.txm.searchengine.cqp.corpus.CQPCorpus;
-...
     	 * @return A CQL query to build a Subcorpus of UD sentences
     	 */
     	public static String getUDSCQPQueryString(String prefix) {
     		return "["+prefix+"id=\"1\"] ["+prefix+"id !=\"1\"]+ [:"+prefix+"id=\"1|__UNDEF__\":]";  //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
     		String emptyvalue = TBXPreferences.getInstance().getString(TBXPreferences.EMPTY_PROPERTY_VALUE_CODE);
     		return "["+prefix+"id=\"1(-.+)?\"] []* [:"+prefix+"id=\"1(-.+)?|"+emptyvalue+"\":]";  //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+    	}
     	public String hasAdditionalDetailsForResult(TXMResult result) {

     		gdata2.colspan = 4; // one line
     		useNewDocIdButton.setLayoutData(gdata2);
     		keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep word contractions when importing CoNLL-U files", SWT.CHECK);
     		keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK);
     		gdata2 = getButtonLayoutData();
     		gdata2.colspan = 4; // one line
     		keepWordContractionsButton.setLayoutData(gdata2);
-...
     		buildTIGERIndexesButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES))); //$NON-NLS-1$
     		useNewDocIdButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID))); //$NON-NLS-1$
     		keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_CONTRACTIONS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_CONTRACTIONS))); //$NON-NLS-1$
     		keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS))); //$NON-NLS-1$
     		udPropertiesPrefixButton.setText(customNode.get(UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX))); //$NON-NLS-1$
     		headPropertiesText.setText(customNode.get(UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT))); //$NON-NLS-1$
     		depsPropertiesText.setText(customNode.get(UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT))); //$NON-NLS-1$
-...
     			Preferences customNode = project.getPreferencesScope().getNode(UDPreferences.getInstance().getPreferencesNodeQualifier());
     			customNode.putBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, buildTIGERIndexesButton.getSelection());
     			customNode.putBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, useNewDocIdButton.getSelection());
     			customNode.putBoolean(UDPreferences.KEEP_CONTRACTIONS, keepWordContractionsButton.getSelection());
     			customNode.putBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, keepWordContractionsButton.getSelection());
     			customNode.put(UDPreferences.UDPREFIX, udPropertiesPrefixButton.getText());
     			customNode.put(UDPreferences.IMPORT_HEAD_TO_PROJECT, headPropertiesText.getText());
     			customNode.put(UDPreferences.IMPORT_DEPS_TO_PROJECT, depsPropertiesText.getText());

     		// checkbox bound to the IMPORT_USE_NEW_DOC_ID preference
     		this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_USE_NEW_DOC_ID, "Use new doc id when importing CoNLL-U files", this.getFieldEditorParent()));
     		this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, "Build TIGERSearch indexes as well", this.getFieldEditorParent()));
     		this.addField(new BooleanFieldEditor(UDPreferences.KEEP_CONTRACTIONS, "Keep word contractions when importing CoNLL-U files", this.getFieldEditorParent()));
     		this.addField(new BooleanFieldEditor(UDPreferences.KEEP_MULTIWORD_TOKENS, "Keep multiword tokens when importing CoNLL-U files", this.getFieldEditorParent()));
     		udPrefixField = new StringFieldEditor(UDPreferences.UDPREFIX, "UD properties prefix", this.getFieldEditorParent());
     		this.addField(udPrefixField);
     		this.addField(new StringFieldEditor(UDPreferences.IMPORT_HEAD_TO_PROJECT, "UD head properties to project (comma separated list)", this.getFieldEditorParent()));

Laboratoire ICAR » Plateforme TXM

Révision 3994