/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

     import org.eclipse.swt.SWT;
     import org.eclipse.swt.widgets.Button;
     import org.eclipse.swt.widgets.Composite;
     import org.eclipse.swt.widgets.Group;
     import org.eclipse.swt.widgets.Label;
     import org.eclipse.swt.widgets.Text;
     import org.eclipse.ui.forms.events.ExpansionAdapter;
-...
     import org.txm.objects.Project;
     import org.txm.rcp.editors.imports.ImportFormEditor;
     import org.txm.rcp.editors.imports.sections.ImportEditorSection;
     import org.txm.rcp.swt.widget.RadioGroup;
     public class CoNLLUSection extends ImportEditorSection {
-...
     	Button buildTIGERIndexesButton;
     	Button useNewDocIdButton;
     	Button keepWordContractionsButton;
     	RadioGroup keepWordContractionsButton;
     	Text udPropertiesPrefixButton;
     	private Text headPropertiesText;
     	private Text depsPropertiesText;
-...
     		gdata2.colspan = 4; // one line
     		useNewDocIdButton.setLayoutData(gdata2);
     		keepWordContractionsButton = toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK);
     		keepWordContractionsButton = new RadioGroup(sectionClient, SWT.NONE, "Multiwords management", new String[][] {{UDPreferences.MULTIWORDS, "Multiwords"}, {UDPreferences.TOKENS, "Tokens"}, {UDPreferences.ALL, "All"}});//toolkit.create toolkit.createButton(sectionClient, "Keep multiword tokens when importing CoNLL-U files", SWT.CHECK);
     		keepWordContractionsButton.setToolTipText("Multiwords: only the multiwords are indexed, Tokens: Only the multiwords tokens are indexed, All: both are indexed");
     		gdata2 = getButtonLayoutData();
     		gdata2.colspan = 4; // one line
     		keepWordContractionsButton.setLayoutData(gdata2);
-...
     		buildTIGERIndexesButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES))); //$NON-NLS-1$
     		useNewDocIdButton.setSelection(customNode.getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, UDPreferences.getInstance().getBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID))); //$NON-NLS-1$
     		keepWordContractionsButton.setSelection(customNode.getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS))); //$NON-NLS-1$
     		keepWordContractionsButton.setSelection(customNode.get(UDPreferences.MULTIWORDS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.MULTIWORDS_MANAGEMENT))); //$NON-NLS-1$
     		udPropertiesPrefixButton.setText(customNode.get(UDPreferences.UDPREFIX, UDPreferences.getInstance().getString(UDPreferences.UDPREFIX))); //$NON-NLS-1$
     		headPropertiesText.setText(customNode.get(UDPreferences.IMPORT_HEAD_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_HEAD_TO_PROJECT))); //$NON-NLS-1$
     		depsPropertiesText.setText(customNode.get(UDPreferences.IMPORT_DEPS_TO_PROJECT, UDPreferences.getInstance().getString(UDPreferences.IMPORT_DEPS_TO_PROJECT))); //$NON-NLS-1$
-...
     			Preferences customNode = project.getPreferencesScope().getNode(UDPreferences.getInstance().getPreferencesNodeQualifier());
     			customNode.putBoolean(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, buildTIGERIndexesButton.getSelection());
     			customNode.putBoolean(UDPreferences.IMPORT_USE_NEW_DOC_ID, useNewDocIdButton.getSelection());
     			customNode.putBoolean(UDPreferences.KEEP_MULTIWORD_TOKENS, keepWordContractionsButton.getSelection());
     			customNode.put(UDPreferences.MULTIWORDS_MANAGEMENT, keepWordContractionsButton.getSelection());
     			customNode.put(UDPreferences.UDPREFIX, udPropertiesPrefixButton.getText());
     			customNode.put(UDPreferences.IMPORT_HEAD_TO_PROJECT, headPropertiesText.getText());
     			customNode.put(UDPreferences.IMPORT_DEPS_TO_PROJECT, depsPropertiesText.getText());

     		// Boolean import options of the CoNLL-U module.
     		this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_USE_NEW_DOC_ID, "Use new doc id when importing CoNLL-U files", this.getFieldEditorParent()));
     		this.addField(new BooleanFieldEditor(UDPreferences.IMPORT_BUILD_TIGERSEARCH_INDEXES, "Build TIGERSearch indexes as well", this.getFieldEditorParent()));
     		this.addField(new BooleanFieldEditor(UDPreferences.KEEP_MULTIWORD_TOKENS, "Keep multiword tokens when importing CoNLL-U files", this.getFieldEditorParent()));
     		// MULTIWORDS_MANAGEMENT stores one of the string values MULTIWORDS, TOKENS or ALL.
     		// A boolean editor would write "true"/"false" into it, so a choice editor is used instead.
     		// ComboFieldEditor entries are {label, value} pairs.
     		this.addField(new org.eclipse.jface.preference.ComboFieldEditor(UDPreferences.MULTIWORDS_MANAGEMENT, "Multiwords management",
     				new String[][] { { "Multiwords", UDPreferences.MULTIWORDS }, { "Tokens", UDPreferences.TOKENS }, { "All", UDPreferences.ALL } },
     				this.getFieldEditorParent()));
     		// Free-text options.
     		udPrefixField = new StringFieldEditor(UDPreferences.UDPREFIX, "UD properties prefix", this.getFieldEditorParent());
     		this.addField(udPrefixField);
     		this.addField(new StringFieldEditor(UDPreferences.IMPORT_HEAD_TO_PROJECT, "UD head properties to project (comma separated list)", this.getFieldEditorParent()));

     import java.nio.channels.FileChannel;
     import java.util.ArrayList;
     import java.util.HashMap;
     import java.util.LinkedHashMap;
     import java.util.LinkedHashSet;
     import java.util.List;
-...
     	HashMap<CorpusBuild, TSCorpus> corpora = null;
     	public TSCorpus getTSCorpus(CorpusBuild corpus) {
     		CorpusBuild root = corpus.getRootCorpusBuild();
     		TSCorpus tscorpus = corpora.get(root);
     		if (tscorpus != null) {
-...
     		TSCorpus tcorpus = this.getTSCorpus(corpus);
     		TSResult result = queryTIGER(corpus, query);
     		if (result == null) { // no result
     			return new EmptySelection(query);
+    		}
     		return fromTIGERMatchToSelection(tcorpus, result, query, corpus);
+    	}
     	public TSResult queryTIGER(CorpusBuild corpus, IQuery query) throws Exception {
     		TSCorpus tcorpus = this.getTSCorpus(corpus);
-...
     				return result;
+    			}
     			int[] cpos = { matches.get(0).getStart(), matches.get(matches.size() - 1).getEnd() };
     			//apply shifts on start and end positions
     			MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped();
     			if (offsetsMapped != null) {
     				cpos[0] -= offsetsMapped.getInt(cpos[0] * Integer.BYTES);
     				cpos[1] -= offsetsMapped.getInt(cpos[1] * Integer.BYTES);
+    			}
     			int[] starts = tcorpus.getSentenceStartPositions();
     			int sent_max = starts.length;
     			int sent_min = 0;
-...
     					break;
+    				}
+    			}
     			Log.finest("QUERYING sentences: " + sent_min + " -> " + sent_max); //$NON-NLS-1$ //$NON-NLS-2$
     			result = tcorpus.query(query.getQueryString().replace("\n", " "), sent_min, sent_max, -1); //$NON-NLS-1$ //$NON-NLS-2$
+    		}
-...
     		return new TIGERSelection(query, result2);
+    	}
     	@Override
     	public Query newQuery() {
     		return new TIGERQuery();
-...
     		PrintWriter writer = IOUtils.getWriter(logprop, "UTF-8"); //$NON-NLS-1$
     		writer.println("# Default log configuration of the TIGERSearch suite"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
     				"log4j.rootLogger=SEVERE,Logfile"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
     				"log4j.rootLogger=INFO,Logfile"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
     				"log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=SEVERE"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
     				"log4j.appender.Logfile=org.apache.log4j.RollingFileAppender"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
     				"log4j.appender.Logfile.File="+logprop.getAbsolutePath()+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
-...
     				"log4j.appender.Logfile.MaxBackupIndex=1"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
     				"log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
     				"log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"); //$NON-NLS-1$
     		writer.close();
     		BasicConfigurator.configure();
     		String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME);
     		File master = new File(sourceDirectory, driverFilename);
     		if (!master.exists()) {
     //			File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() {
     //				public boolean accept(File file) {
     //					if (file.isDirectory()) return false;
     //					if (file.isHidden()) return false;
     //					String filename = file.getName();
     //					if (filename.equals("import.xml")) return false;
     //					if (!filename.endsWith(".xml")) return false;
     //
     //					return true;
     //				}
     //			});
     //
     //			if (xmlFiles == null) {
     				System.out.println("No master file found in: "+sourceDirectory);
     				return false;
     //			}
     //			master = xmlFiles[0];
     			//			File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() {
     			//				public boolean accept(File file) {
     			//					if (file.isDirectory()) return false;
     			//					if (file.isHidden()) return false;
     			//					String filename = file.getName();
     			//					if (filename.equals("import.xml")) return false;
     			//					if (!filename.endsWith(".xml")) return false;
     			//
     			//					return true;
     			//				}
     			//			});
     			//
     			//			if (xmlFiles == null) {
     			System.out.println("No master file found in: "+sourceDirectory);
     			return false;
     			//			}
     			//			master = xmlFiles[0];
+    		}
     		String uri = master.getAbsolutePath(); // TIGER corpus source root file
     		File tigerBinDir = new File(tigerDir, corpusName);
     		tigerBinDir.mkdir();
     		try {
     			LinkedHashMap<String, ArrayList<String>> errorsPerTIGERSubcorpus = new LinkedHashMap<>();
     			IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath()) {
     				public void setMessage(String message) { }
     				String currentSubcorpus = "";
     				public void setMessage(String message) {
     					if (message.startsWith("Reading subcorpus ")) {
     						currentSubcorpus = message.substring("Reading subcorpus ".length());
+    					}
+    				}
     				public void newSentenceError(String sid, String message) {
     					if (!errorsPerTIGERSubcorpus.containsKey(currentSubcorpus)) errorsPerTIGERSubcorpus.put(currentSubcorpus, new ArrayList<String>());
     					errorsPerTIGERSubcorpus.get(currentSubcorpus).add("Error at "+sid+": "+message);
+    				}
     				public void newSentenceWarning(String sid, String message) {
     					if (!errorsPerTIGERSubcorpus.containsKey(currentSubcorpus)) errorsPerTIGERSubcorpus.put(currentSubcorpus, new ArrayList<String>());
     					errorsPerTIGERSubcorpus.get(currentSubcorpus).add("Warning at "+sid+": "+message);
+    				}
     				public void setNumberOfSentences(int number) { }
     				public void setProgressBar(int value) { }
     			};
     			XMLIndexing indexing = new XMLIndexing(corpusName, uri, tigerBinDir.getAbsolutePath(), handler, false);
     			indexing.startIndexing();
     			File logs = new File(tigerBinDir, "indexing.log"); //$NON-NLS-1$
     			String txt = IOUtils.getText(logs);
     			if (txt.contains("Error in corpus graph ")) {
     				Log.warning("Error while importing TIGER corpus: "+txt);
     				return false;
     			if (errorsPerTIGERSubcorpus.size() > 0) {
     				for (String s : errorsPerTIGERSubcorpus.keySet()) {
     					Log.info("In "+s);
     					for (String s2 : errorsPerTIGERSubcorpus.get(s)) {
     						Log.info("	"+s2);
+    					}
+    				}
+    			}
+    		}
     		catch (Exception e) {

     		super(module)
+    	}
     	public final String merge (String s1, String s2) {
     		if (s1 == s2) return s1
     		if (s1 == "") return s2
     		if (s2 == "") return s1
     		if (s1 == "_") return s2
     		if (s2 == "_") return s1
     		if (s1 == ""  && s2 == "") return "_"
     		if (s1 == "_" && s2 == "_") return "_"
     		return s1+"|"+s2
     	/**
     	 * Merges the values of one column taken from several token lines.
     	 * The values "" and "_" are treated as "no value".
     	 *
     	 * @param orig value returned unchanged when no element of sss carries a value
     	 * @param sss the column values to merge (a collection of strings)
     	 * @return orig when every element of sss is "" or "_" (or sss is empty);
     	 *         the single distinct value when all elements are identical;
     	 *         otherwise all elements, in order, joined with " + "
     	 */
     	public final String merge(String orig, def sss) {
     		// nothing to merge: keep the value already present on the line
     		if (!sss?.any { String s -> s != "" && s != "_" }) return orig

     		def distinct = new HashSet(sss)
     		// all tokens agree: do not repeat the same value
     		if (distinct.size() == 1) return distinct.join(" + ")
     		return sss.join(" + ")
     	}
     	@Override
-...
     			conlluSrcDirectory.deleteDir()
     			conlluSrcDirectory.mkdirs()
     			println "Split CoNLL-U files per text..."
     			if (!splitCoNLLUFiles(inputDirectory, conlluSrcDirectory, project)) {
     				return
+    			}
-...
     		files.sort()
     		// Keep or not contractions
     		boolean keepContractions =  "true".equals(UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.KEEP_MULTIWORD_TOKENS, UDPreferences.getInstance().getString(UDPreferences.KEEP_MULTIWORD_TOKENS)));
     		String keepContractions =  UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.MULTIWORDS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.MULTIWORDS_MANAGEMENT));
     			println "Remove multiword tokens (keepContractions=$keepContractions)  & add XmlId if necessary"
     			println "Multiwords managment ($keepContractions) & add XmlId if necessary & remove empty nodes"
     			ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size())
     			for (File conlluFile : files) {
     				cpb_texts.tick()
-...
     							} else {
     								split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
+    							}
+    						}
     						if (!keepContractions && split[0] != null && split[0].contains("-")) {
     							println "REMOVE"
     							println "\t"+lines[i]
     							println "\t"+lines[i+1]
     							println "\t"+lines[i+2]
     							def split1 = lines[i+1].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     							def split2 = lines[i+2].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     							if (split[2] == "_") split[2] = merge(split1[2],split2[2])
     							if (split[3] == "_") split[3] = merge(split1[3], split2[3])
     							if (split[4] == "_") split[4] = merge(split1[4], split2[4])
     							if (split[5] == "_") split[5] = merge(split1[5], split2[5])
     							if (split[6] == "_") split[6] = split1[6]
     							if (split[7] == "_") split[7] = merge(split1[7], split2[7])
     							if (split[8] == "_") split[8] = merge(split1[8], split2[8])
     							lines.remove(i+1)
     							lines.remove(i+1)
     						if (split[0].contains(".")) {
     							//println "REMOVE EMPTY NODE: $split : "+
     							lines.remove(i)
     							i--
     							continue; // next !
+    						}
     						lines[i] = split.join("\t")
     						// Apply the multiword policy selected for the import (UDPreferences.MULTIWORDS_MANAGEMENT).
     						if (keepContractions == UDPreferences.ALL) {
     							// nothing to do: the range line and its token lines are both kept
     						} else if (keepContractions == UDPreferences.TOKENS) {
     							if (split[0].contains("-")) {
     								// drop the multiword range line ("n1-n2"), its token lines are kept
     								lines.remove(i)
     								i--
     								continue; // next line
     							}
     						} else if (keepContractions == UDPreferences.MULTIWORDS) {
     							if (split[0].contains("-")) {
     								// Read both bounds of the "n1-n2" range BEFORE the id is overwritten,
     								// otherwise n2 would be parsed from the new id and always equal n1.
     								String range = split[0]
     								int dash = range.indexOf("-")
     								int n1 = Integer.parseInt(range.substring(0, dash));
     								int n2 = Integer.parseInt(range.substring(dash + 1));
     								split[0] = ""+n1 // the multiword takes the id of its first token
     								int n =  1 + n2 - n1 // number of token lines covered by the range
     								
     								// collect the covered token lines, dropping empty nodes (ids containing ".") on the way
     								def splits = []
     								for (int j = 1 ; j <= n && i + j < lines.size() ; j++) {
     									def tmp = lines[i+j].split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
     									if (tmp[0].contains(".")) {
     										lines.remove(i+j)
     										j--
     										continue // next token
     									}
     									splits << tmp
     								}
     								
     								// fold columns 2 to 7 of the tokens into the multiword line
     								for (int j = 2 ; j < 8 ; j++) {
     									split[j] = merge(split[j], splits.collect(){it[j]})
     								}
     								
     								// keep a trace of the token forms in the last column
     								if (split[9].length() > 0) split[9] += "|"
     								split[9] += "expand="+splits.collect(){it[1]}.join("_")
     								
     								// remove the token lines that were folded into the multiword line
     								for (int j = 0 ; j < splits.size() ; j++) {
     									lines.remove(i+1)
     								}
     							}
     						}
     						lines[i] = split.join("\t") // rebuild the line
+    					}
     					IOUtils.write(conlluFile, lines.join("\n") + "\n")
     					IOUtils.write(conlluFile, lines.join("\n") + "\n") // CoNLLU needs the last line
+    				}
+    			}
     			cpb_texts.done()
-...
+    	}
     	public static def splitCoNLLUFiles(File inputDirectory, File srcDirectory, def project) {
     		def files = inputDirectory.listFiles()
     		def files = inputDirectory.listFiles(new FilenameFilter() {
     			boolean accept(File dir, String name) {
     				return name.toLowerCase().endsWith(".conllu")
+    			}
     		});
     		if (files == null) {
     			println "Aborting. No CONLL file found in $inputDirectory."

     			$commentlines++;
     			next;
+    		}
     		# Added by AL for contractions
     		elsif ( $words[$w] =~ /^\d+-\d+/ ) {
     			#	print LOG "Contraction line loop 1: $words[$w]\n";
     			$commentlines++;
     			#	$contractions++;
     			next;
+    		}
     	#	# Added by AL for contractions
     	#	elsif ( $words[$w] =~ /^\d+-\d+/ ) {
+    	#
     	#		#	print LOG "Contraction line loop 1: $words[$w]\n";
     	#		$commentlines++;
+    	#
     	#		#	$contractions++;
     	#		next;
     	#	}
     		else {
     			if ( defined($opt_c) ) {
     				$words[$w] =~ s/coord(\d+)-//g;
-...
     			next;
+    		}
     		#Added AL for contractions
     		if ( $words[$i] =~ /^\d+-\d+/ ) {
     #		#Added AL for contractions
     #		if ( $words[$i] =~ /^\d+-\d+/ ) {
+    #
     #			#       print LOG "Contraction loop 2 : $words[$i]\n";
     #			next;
     #		}
     			#       print LOG "Contraction loop 2 : $words[$i]\n";
     			next;
+    		}
     		else {
     			@cols = split( /\t/, $words[$i] );
-...
       <value name="case:pred">case:pred</value> <!-- Welsh -->
       <value name="case:voc">case:voc</value> <!-- Irish, Scottish Gaelic -->
       <value name="case">case marking</value>
       <value name="case2">case2 marking</value>
       <value name="cc:nc">cc:nc</value> <!-- Old French -->
       <value name="cc:nc">Coordinated conjunct : non coordonant</value>
       <value name="cc:preconj">cc:preconj</value> <!-- Arabic, English, Erzya, Estonian, Faroese, Finnish, German, Indonesian, Komi Permyak, Komi Zyrian, Moksha, North Sami, Persian, Polish, Portuguese, Romanian, Slovenian, Spanish, Thai, Turkish -->

     		File mainFile = new File(tigerXMLDirectory, "main.xml");
     		File perlScript = BundleUtils.getFile("org.txm.conllu.core", "groovy", "/org/txm/scripts/importer/conllu/", "conll2tiger-ud.pl");
     		System.out.println("PERL: "+perlScript);
     		DeleteDir.deleteDirectory(tigerXMLDirectory);
     		//println "clean&mkdir $tigerXMLDirectory"
     		tigerXMLDirectory.mkdir();

     	/** Preference key: prefix of the UD properties (default set in this class: "ud-"). */
     	public static String UDPREFIX = "ud_prefix";
     	/** Preference key (boolean): keep multiword tokens when importing CoNLL-U files (default set in this class: false). */
     	public static String KEEP_MULTIWORD_TOKENS = "import_KEEP_MULTIWORD_TOKENS";
     	/**
     	 * Preference key (string): how multiword tokens are handled at import.
     	 * The value is one of {@link #MULTIWORDS}, {@link #TOKENS} or {@link #ALL}
     	 * (default set in this class: {@link #TOKENS}).
     	 */
     	public static String MULTIWORDS_MANAGEMENT = "import_MULTIWORDS_MANAGEMENT";
     	/** {@link #MULTIWORDS_MANAGEMENT} value: only the multiwords are indexed. */
     	public static String MULTIWORDS = "multiwords";
     	/** {@link #MULTIWORDS_MANAGEMENT} value: only the tokens of the multiwords are indexed. */
     	public static String TOKENS = "tokens";
     	/** {@link #MULTIWORDS_MANAGEMENT} value: both the multiwords and their tokens are indexed. */
     	public static String ALL = "all";
     	/** Preference key (boolean): use the new doc id when importing CoNLL-U files (default set in this class: true). */
     	public static String IMPORT_USE_NEW_DOC_ID = "import_use_new_doc_id";
     	/** Preference key: comma separated list of UD head properties to project (default set in this class: "upos,deprel"). */
     	public static String IMPORT_HEAD_TO_PROJECT = "import_head_to_project";
-...
     		preferences.put(UDPREFIX, "ud-");
     		preferences.put(IMPORT_HEAD_TO_PROJECT, "upos,deprel");
     		preferences.put(IMPORT_DEPS_TO_PROJECT, "upos,deprel");
     		preferences.putBoolean(KEEP_MULTIWORD_TOKENS, false);
     		preferences.put(MULTIWORDS_MANAGEMENT, TOKENS); // boths, multiwords, tokens
     		preferences.putBoolean(IMPORT_USE_NEW_DOC_ID, true);
     		preferences.putBoolean(IMPORT_BUILD_TIGERSEARCH_INDEXES, true);
+    	}

     			// change statistical computing
     			if (TBXPreferences.getInstance().getBoolean(TBXPreferences.EXPERT_USER)) {
     //				modeText = new Text(extendedParametersArea, SWT.BORDER); // not yet ready
     				modeText = new Text(extendedParametersArea, SWT.BORDER); // not yet ready
+    			}
     			// Result area

Laboratoire ICAR » Plateforme TXM

Révision 4001