/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 460

     <?xml version="1.0" encoding="UTF-8"?>
     <projectDescription>
     	<name>TreeTagger</name>
     	<comment></comment>
     	<projects>
     	</projects>
     	<buildSpec>
     		<buildCommand>
     			<name>org.eclipse.jdt.core.javabuilder</name>
     			<arguments>
     			</arguments>
     		</buildCommand>
     		<buildCommand>
     			<name>org.eclipse.pde.ManifestBuilder</name>
     			<arguments>
     			</arguments>
     		</buildCommand>
     		<buildCommand>
     			<name>org.eclipse.pde.SchemaBuilder</name>
     			<arguments>
     			</arguments>
     		</buildCommand>
     	</buildSpec>
     	<natures>
     		<nature>org.eclipse.pde.PluginNature</nature>
     		<nature>org.eclipse.jdt.core.javanature</nature>
     	</natures>
     </projectDescription>

     package treetagger;
     import org.eclipse.jface.resource.ImageDescriptor;
     import org.eclipse.ui.plugin.AbstractUIPlugin;
     import org.osgi.framework.BundleContext;
     /**
      * The activator class controls the plug-in life cycle
      */
     public class Activator extends AbstractUIPlugin {
     	// The plug-in ID
     	public static final String PLUGIN_ID = "TreeTagger"; //$NON-NLS-1$
     	// The shared instance
     	private static Activator plugin;
     	/**
     	 * The constructor
     	 */
     	public Activator() {
+    	}
     	/*
     	 * (non-Javadoc)
     	 * @see org.eclipse.ui.plugin.AbstractUIPlugin#start(org.osgi.framework.BundleContext)
     	 */
     	public void start(BundleContext context) throws Exception {
     		super.start(context);
     		plugin = this;
+    	}
     	/*
     	 * (non-Javadoc)
     	 * @see org.eclipse.ui.plugin.AbstractUIPlugin#stop(org.osgi.framework.BundleContext)
     	 */
     	public void stop(BundleContext context) throws Exception {
     		plugin = null;
     		super.stop(context);
+    	}
     	/**
     	 * Returns the shared instance
+    	 *
     	 * @return the shared instance
     	 */
     	public static Activator getDefault() {
     		return plugin;
+    	}
     	/**
     	 * Returns an image descriptor for the image file at the given
     	 * plug-in relative path
+    	 *
     	 * @param path the path
     	 * @return the image descriptor
     	 */
     	public static ImageDescriptor getImageDescriptor(String path) {
     		return imageDescriptorFromPlugin(PLUGIN_ID, path);
+    	}
+    }

     package org.txm.treetagger.commands;
     import java.io.BufferedOutputStream;
     import java.io.BufferedReader;
     import java.io.BufferedWriter;
     import java.io.File;
     import java.io.FileOutputStream;
     import java.io.OutputStreamWriter;
     import java.io.PrintStream;
     import java.io.PrintWriter;
     import java.util.ArrayList;
     import java.util.Arrays;
     import java.util.HashMap;
     import java.util.HashSet;
     import java.util.LinkedHashMap;
     import java.util.LinkedHashSet;
     import java.util.List;
     import org.eclipse.core.commands.AbstractHandler;
     import org.eclipse.core.commands.ExecutionEvent;
     import org.eclipse.core.commands.ExecutionException;
     import org.eclipse.core.runtime.IProgressMonitor;
     import org.eclipse.core.runtime.IStatus;
     import org.eclipse.core.runtime.Status;
     import org.eclipse.jface.viewers.ISelection;
     import org.eclipse.jface.viewers.IStructuredSelection;
     import org.eclipse.ui.IWorkbenchWindow;
     import org.eclipse.ui.handlers.HandlerUtil;
     import org.kohsuke.args4j.Option;
     import org.txm.core.preferences.TBXPreferences;
     import org.txm.core.preferences.TXMPreferences;
     import org.txm.index.core.functions.Index;
     import org.txm.index.core.functions.Line;
     import org.txm.rcp.swt.widget.parameters.ParametersDialog;
     import org.txm.rcp.utils.JobHandler;
     import org.txm.searchengine.cqp.AbstractCqiClient;
     import org.txm.searchengine.cqp.CQPEngine;
     import org.txm.searchengine.cqp.corpus.Corpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     import org.txm.searchengine.cqp.corpus.Property;
     import org.txm.searchengine.cqp.corpus.query.Match;
     import org.txm.searchengine.cqp.corpus.query.Query;
     import org.txm.utils.DeleteDir;
     import org.txm.utils.io.IOUtils;
     import org.txm.utils.logger.Log;
     import org.txm.utils.treetagger.TreeTagger;
     /**
      * Our sample handler extends AbstractHandler, an IHandler base class.
      * @see org.eclipse.core.commands.IHandler
      * @see org.eclipse.core.commands.AbstractHandler
      */
     public class Train extends AbstractHandler {
     	public Corpus corpus = null;
     	@Option(name="model", usage="The model file to create", widget="CreateFile", required=true, def="fr.par")
     	public File model = null;
     	@Option(name="posProperty", usage="The pos property", widget="String", required=true, def="frpos")
     	public String posProperty = null;
     	@Option(name="sentenceTag", usage="The pos property", widget="String", required=true, def="SENT")
     	public String sentenceTag = null;
     	@Option(name="lemmaProperty", usage="The lemma property", widget="String", required=true, def="frlemma")
     	public String lemmaProperty = null;
     	@Option(name="lexique", usage="Lexicon file", widget="File", required=true, def="lexicon.txt")
     	public File lexique = null;
     	@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="")
     	public String options = null;
     	/**
+    	 *
     	 */
     	public Object execute(ExecutionEvent event) throws ExecutionException {
     		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
     		ISelection isel = window.getActivePage().getSelection();
     		if (isel instanceof IStructuredSelection) {
     			IStructuredSelection sel = (IStructuredSelection)isel;
     			Object first = sel.getFirstElement();
     			if (first instanceof Corpus) {
     				corpus = (Corpus)first;
     				if (ParametersDialog.open(this)) {
     					train(corpus, model, lexique, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split("  "));
     					return corpus;
+    				}
+    			}
+    		}
     		System.out.println("Wrong selection.");
     		return null;
+    	}
     	public static void train(final Corpus corpus, final File model, final File lexique, final String[] properties, final String sentenceTag, final String[] options) {
     		JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") {
     			@Override
     			protected IStatus run(IProgressMonitor monitor) {
     				this.runInit(monitor);
     				try {
     					File lexique2 = lexique;
     					MainCorpus mainCorpus = corpus.getMainCorpus();
     					File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
     					System.out.println("TRAIN : "+corpus+" with "+lexique2+" to create "+model+ " with properties "+Arrays.toString(properties));
     					if (properties == null || properties.length != 2) {
     						System.out.println("Error can't continue with selected word properties: "+Arrays.toString(properties));
     						return Status.CANCEL_STATUS;
+    					}
     					for (String p : properties) {
     						Property prop = corpus.getProperty(p);
     						if (prop == null) {
     							System.out.println("Missing property in corpus: "+p);
     							return Status.CANCEL_STATUS;
+    						}
+    					}
     					Property pos = corpus.getProperty(properties[0]);
     					Property lemma = corpus.getProperty(properties[1]);
     					// Prepare temporary directory
     					File treetaggerSrcDirectory = new File(mainCorpus.getBaseDirectory(), "treetagger");
     					DeleteDir.deleteDirectory(treetaggerSrcDirectory);
     					treetaggerSrcDirectory.mkdirs();
     					HashMap<String, HashSet<String>> simplified_lexicon = null;
     					HashMap<String, HashSet<String>> simplified_lexicon_errors = null;
     					int error_counter = 0;
     					// Create Lexicon file from an Index
     					if (lexique2 == null || !lexique2.exists()) {
     						System.out.println("Warning: no lexicon file or given lexicon file does not exist ("+lexique2+"). Using corpus Index...");
     						File lexiconfile = new File(treetaggerSrcDirectory, "lexicon.txt");
     						List<Property> corpusProperties = new ArrayList<Property>();
     						corpusProperties.add(mainCorpus.getProperty("word"));
     						for (String p : properties) {
     							Property prop = mainCorpus.getProperty(p);
     							if (prop == null) {
     								System.out.println("Error, a property is missing: "+p);
     								return Status.CANCEL_STATUS;
+    							}
     							corpusProperties.add(prop);
+    						}
     						Index index = new Index(mainCorpus, new Query("[]"), corpusProperties);
     						List<Line> lines = index.getAllLines();
     						LinkedHashMap<String, ArrayList<String>> lex = new LinkedHashMap<String, ArrayList<String>>();
     						HashMap<String, HashSet<String>> allPosValues = new HashMap<String, HashSet<String>>();
     						for (Line l : lines) {
     							List<List<String>> values = l.getUnitsProperties();
     							String form = values.get(0).get(0);
     							if (!lex.containsKey(form)) {
     								ArrayList<String> pairs = new ArrayList<String>();
     								HashSet<String> posValues = new HashSet<String>();
     								allPosValues.put(form, posValues);
     								lex.put(form, pairs);
+    							}
     							ArrayList<String> pairs = lex.get(form);
     							HashSet<String> posValues = allPosValues.get(form);
     							String posValue = values.get(1).get(0);
     							String lemmaValue = values.get(2).get(0);
     							if (posValues.contains(posValue)) {
     							} else {
     								posValues.add(posValue);
     								pairs.add(posValue);
     								pairs.add(lemmaValue);
+    							}
+    						}
     						BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(lexiconfile), "UTF-8"));
     						for (String form : lex.keySet()) {
     							writer.write(form);
     							boolean tab = true;
     							for (String v : lex.get(form)) {
     								if (tab) writer.write("\t"+v);
     								else writer.write(" "+v);
     								tab = !tab;
+    							}
     							writer.write("\n");
+    						}
     						writer.close();
     						lexique2 = lexiconfile;
     					} else { // diagnose lexicon content
     						simplified_lexicon = new HashMap<String, HashSet<String>>();
     						simplified_lexicon_errors = new HashMap<String, HashSet<String>>();
     						BufferedReader reader = IOUtils.getReader(lexique2);
     						String line = reader.readLine();
     						while (line != null) {
     							String[] split = line.split("\t", 2);
     							HashSet<String> posValues = new HashSet<String>();
     							simplified_lexicon.put(split[0], posValues);
     							for (String poslemme : split[1].split("\t")) {
     								String[] split2 = poslemme.split(" ", 2);
     								posValues.add(split2[0]);
+    							}
     							line = reader.readLine();
+    						}
     						reader.close();
+    					}
     					// create TT SRC file from CWB indexes
     					File ttSrcFile = new File(treetaggerSrcDirectory, mainCorpus.getName()+".tt");
     					System.out.println("TT SRC file: "+ttSrcFile.getAbsolutePath());
     					BufferedOutputStream fos = new BufferedOutputStream(new FileOutputStream(ttSrcFile));
     					PrintStream ps = new PrintStream(fos);
     					LinkedHashSet<Integer> positions = new LinkedHashSet<Integer>();
     					Property word = corpus.getProperty("word");
     					AbstractCqiClient CQI = CQPEngine.getCqiClient();
     					for (Match m : corpus.getMatches()) {
     						for (int i = m.getStart() ; i <= m.getEnd() ; i++) { // end match must be included
     							positions.add(i);
     							if (positions.size() >= 1000) { // avoid too big array
     								int[] positions_array = new int[positions.size()];
     								int ip = 0;
     								for (int p : positions) positions_array[ip++] = p;
     								String[] words = CQI.cpos2Str(word.getQualifiedName(), positions_array);
     								String[] values = CQI.cpos2Str(pos.getQualifiedName(), positions_array);
     								for (int iW = 0 ; iW < words.length ; iW++) {
     									String w = words[iW];
     									if (w != null) {
     										String s = w+"\t"+values[iW];
     										ps.println(s);
     										if (simplified_lexicon != null) { // check given lexicon
     											if (simplified_lexicon.containsKey(w)) {
     												if (!simplified_lexicon.get(w).contains(values[iW])) {
     													//System.out.println("Lexicon error: cannot find pos="+values[iW]+" for form="+w);
     													if (!simplified_lexicon_errors.containsKey(w)) simplified_lexicon_errors.put(w, new HashSet<String>());
     													HashSet<String> error_values = simplified_lexicon_errors.get(w);
     													error_values.add(values[iW]);
     													error_counter++;
+    												}
     											} else {
     												//System.out.println("Lexicon error: cannot find form="+w);
     												if (!simplified_lexicon_errors.containsKey(w)) simplified_lexicon_errors.put(w, new HashSet<String>());
     												HashSet<String> error_values = simplified_lexicon_errors.get(w);
     												error_values.add("#"+values[iW]);
     												error_counter++;
+    											}
+    										}
+    									}
+    								}
     								positions.clear();
+    							}
+    						}
+    					}
     					if (positions.size() > 0) { // write last words
     						int[] positions_array = new int[positions.size()];
     						int ip = 0;
     						for (int p : positions) positions_array[ip++] = p;
     						String[] words = CQI.cpos2Str(word.getQualifiedName(), positions_array);
     						String[] values = CQI.cpos2Str(pos.getQualifiedName(), positions_array);
     						for (int iW = 0 ; iW < words.length ; iW++) {
     							String w = words[iW];
     							if (w != null) {
     								String s = w+"\t"+values[iW];
     								ps.println(s);
+    							}
+    						}
     						positions.clear();
+    					}
     					ps.close();
     					if (simplified_lexicon_errors != null && simplified_lexicon_errors.size() > 0) {
     						File error_file = new File(treetaggerSrcDirectory, "errors.txt");
     						PrintWriter errorwriter = IOUtils.getWriter(error_file);
     						int c = 0;
     						System.out.println("Warning, lexicon errors ("+error_counter+") found with words:");
     						for (String w : simplified_lexicon_errors.keySet()) {
     							errorwriter.println(w+"="+simplified_lexicon_errors.get(w));
     							if (c < 10) {
     								System.out.println(w+"="+simplified_lexicon_errors.get(w));
     								c++;
     								if (c == 10) System.out.println("... errors display is trucated, see "+error_file.getAbsolutePath());
+    							}
+    						}
     						errorwriter.close();
     						//System.out.println("Cannot apply train-treetagger if lexicon is missing words and pos.");
     						//return Status.CANCEL_STATUS;
     						File lexique3 = new File(lexique2.getParentFile(), lexique2.getName()+".fix");
     						BufferedReader reader = IOUtils.getReader(lexique2);
     						PrintWriter writer = IOUtils.getWriter(lexique3);
     						String line = reader.readLine();
     						while (line != null) {
     							String w = line.split("\t", 2)[0];
     							if (simplified_lexicon_errors.containsKey(w)) {
     								for (String p : simplified_lexicon_errors.get(w)) {
     									if (!p.startsWith("#"))
     										line += ("\t"+p+" <no_lemma>"); // append missing value
+    								}
     								simplified_lexicon_errors.remove(w);
+    							}
     							writer.println(line);
     							line = reader.readLine();
+    						}
     						// write missing words
     						for (String w2 : simplified_lexicon_errors.keySet()) {
     							writer.print(w2);
     							for (String p : simplified_lexicon_errors.get(w2)) {
     								writer.print("\t"+p+" <no_lemma>");
+    							}
     							writer.println("");
+    						}
     						reader.close();
     						writer.close();
     						System.out.println("Adding words to a temporary lexicon: "+lexique3);
     						lexique2 = lexique3;
+    					}
     					// Create open class file : contains all pos values
     					File openclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
     					PrintWriter openClassFileWriter = IOUtils.getWriter(openclassfile);
     //					Lexicon poslexicon = corpus.getLexicon(pos);
     //					String[] posValues = poslexicon.getForms();
     //					for (int iV = 0 ; iV < posValues.length ; iV++) {
     //						if (iV == 0) openClassFileWriter.print(posValues[iV]);
     //						else openClassFileWriter.print(" "+posValues[iV]);
     //					}
     					openClassFileWriter.close();
     					// Call treetagger-train
     					if (ttSrcFile.exists() && lexique2.exists() && openclassfile.exists()) {
     						System.out.println("Running ");
     						String treetaggerBinDirectory = new File(TXMPreferences.getString(TBXPreferences.TREETAGGER_INSTALL_PATH, TBXPreferences.PREFERENCES_NODE), "bin").getAbsolutePath();
     						if (!treetaggerBinDirectory.endsWith("/")) treetaggerBinDirectory += "/";
     						TreeTagger tt = new TreeTagger(treetaggerBinDirectory, options);
     						tt.settoken();
     						tt.setquiet();
     						tt.setlemma();
     						tt.setsgml();
     						tt.setst(sentenceTag);
     						tt.setproto();
     						tt.setutf8();
     						tt.debug(true);
     						tt.traintreetagger(lexique2.getAbsolutePath(), openclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
     						System.out.println("Done: "+model.getAbsolutePath());
     					} else {
     						System.out.println("Aborting.");
+    					}
     					return Status.OK_STATUS;
     				} catch (Exception e) {
     					System.out.println("Error while training TT: "+e);
     					Log.printStackTrace(e);
+    				}
     				return Status.CANCEL_STATUS;
+    			}
     		};
     		job.schedule();
+    	}
+    }

     package org.txm.treetagger.commands;
     import java.io.File;
     import java.io.FileFilter;
     import java.util.Arrays;
     import java.util.HashMap;
     import org.eclipse.core.commands.AbstractHandler;
     import org.eclipse.core.commands.ExecutionEvent;
     import org.eclipse.core.commands.ExecutionException;
     import org.eclipse.core.runtime.IProgressMonitor;
     import org.eclipse.core.runtime.IStatus;
     import org.eclipse.core.runtime.Status;
     import org.eclipse.jface.viewers.ISelection;
     import org.eclipse.jface.viewers.IStructuredSelection;
     import org.eclipse.ui.IWorkbenchWindow;
     import org.eclipse.ui.handlers.HandlerUtil;
     import org.kohsuke.args4j.Option;
     import org.txm.rcp.commands.workspace.UpdateCorpus;
     import org.txm.rcp.swt.widget.parameters.ParametersDialog;
     import org.txm.rcp.utils.JobHandler;
     import org.txm.scripts.teitxm.Annotate;
     import org.txm.searchengine.cqp.corpus.Corpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     /**
      * Our sample handler extends AbstractHandler, an IHandler base class.
      * @see org.eclipse.core.commands.IHandler
      * @see org.eclipse.core.commands.AbstractHandler
      */
     public class Apply extends AbstractHandler {
     	@Option(name="model", usage="Model file", widget="File", required=true, def="model.par")
     	public File model = null;
     	@Option(name="posProperty", usage="The pos property", widget="String", required=true, def="frpos")
     	public String posProperty = null;
     	@Option(name="lemmaProperty", usage="The lemma property", widget="String", required=true, def="frlemma")
     	public String lemmaProperty = null;
     	@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="")
     	public String options = null;
     	/**
+    	 *
     	 */
     	public Object execute(ExecutionEvent event) throws ExecutionException {
     		Corpus corpus = null;
     		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
     		ISelection isel = window.getActivePage().getSelection();
     		if (isel instanceof IStructuredSelection) {
     			IStructuredSelection sel = (IStructuredSelection)isel;
     			Object first = sel.getFirstElement();
     			if (first instanceof Corpus) {
     				corpus = (Corpus)first;
     				if (ParametersDialog.open(this)) {
     					apply(corpus, model, new String[]{posProperty, lemmaProperty}, options.split("  "));
     					return corpus;
+    				}
+    			}
+    		}
     		System.out.println("Wrong selection.");
     		return null;
+    	}
     	public static void apply(Corpus corpus, final File model, final String[] properties, final String[] options) {
     		final MainCorpus mainCorpus = corpus.getMainCorpus();
     		final File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
     		final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName());
     		if (!txmDirectory.exists()) {
     			System.out.println("Can't apply TreeTagger to a corpus with no XML-TXM files.");
+    		}
     		final File[] files = txmDirectory.listFiles(new FileFilter() {
     			@Override
     			public boolean accept(File file) {
     				return file.isFile() && file.canWrite() && file.getName().endsWith(".xml");
+    			}
     		});
     		if (files == null || files.length == 0) {
     			System.out.println("Can't apply TreeTagger to a corpus with no XML-TXM files in "+txmDirectory);
+    		}
     		String lang = model.getName();
     		if (!lang.endsWith(".par")) {
     			System.out.println("Model file name must ends with the '.par' extension");
     			return;
+    		}
     		lang = lang.substring(0, lang.indexOf(".par"));
     		final HashMap<String, String> hash = new HashMap<String, String>();
     		for (File txmFile : files) {
     			hash.put(txmFile.getName(), lang);
+    		}
     		for (int i = 0 ; i < properties.length ; i++) properties[i] = properties[i].trim();
     		System.out.println("APPLY : "+model+" to "+corpus+" updating "+Arrays.toString(properties)+ " with options "+Arrays.toString(options));
     		JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") {
     			@Override
     			protected IStatus run(IProgressMonitor monitor) {
     				this.runInit(monitor);
     				Annotate annotator = new Annotate();
     				annotator.setModelsDirectory(model.getParentFile());
     				annotator.setDebug();
     				if (!annotator.run(corpusBinaryDirectory, txmDirectory, hash, true, properties, options)) {
     					System.out.println("Fail to apply TreeTagger with "+txmDirectory+" files.");
     					return Status.CANCEL_STATUS;
+    				}
     				System.out.println("Done. Updating corpus...");
     				if (UpdateCorpus.update(mainCorpus) == null) {
     					System.out.println("Fail to update corpus indexes and editions.");
+    				}
     				System.out.println("Done.");
     				return Status.OK_STATUS;//frppos
+    			}
     		};
     		job.schedule();
+    	}
+    }

     package org.txm.treetagger.commands;
     import java.io.BufferedReader;
     import java.io.File;
     import java.io.FileFilter;
     import java.io.PrintWriter;
     import java.util.Arrays;
     import java.util.Collections;
     import java.util.HashSet;
     import java.util.LinkedHashMap;
     import java.util.LinkedHashSet;
     import org.apache.commons.lang.StringUtils;
     import org.eclipse.core.commands.AbstractHandler;
     import org.eclipse.core.commands.ExecutionEvent;
     import org.eclipse.core.commands.ExecutionException;
     import org.eclipse.core.runtime.IProgressMonitor;
     import org.eclipse.core.runtime.IStatus;
     import org.eclipse.core.runtime.Status;
     import org.eclipse.jface.viewers.ISelection;
     import org.eclipse.jface.viewers.IStructuredSelection;
     import org.eclipse.ui.IWorkbenchWindow;
     import org.eclipse.ui.handlers.HandlerUtil;
     import org.kohsuke.args4j.Option;
     import org.txm.Toolbox;
     import org.txm.core.preferences.TBXPreferences;
     import org.txm.core.preferences.TXMPreferences;
     import org.txm.rcp.swt.widget.parameters.ParametersDialog;
     import org.txm.rcp.utils.JobHandler;
     import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
     import org.txm.searchengine.cqp.corpus.Corpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     import org.txm.searchengine.cqp.corpus.Property;
     import org.txm.stat.utils.ConsoleProgressBar;
     import org.txm.utils.DeleteDir;
     import org.txm.utils.io.FileCopy;
     import org.txm.utils.io.IOUtils;
     import org.txm.utils.Tuple;
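     /**
      * Command handler that creates or updates a lemma property in the XML-TXM
      * files of the selected corpus, using rules loaded from a dictionary file
      * and from an optional extra rules file.
      * @see org.eclipse.core.commands.IHandler
      * @see org.eclipse.core.commands.AbstractHandler
      */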
     public class LemmaProjection extends AbstractHandler {
     	// Source name under which the rules of the extra rules file are registered;
     	// execute() puts it first in the source priority set when the file exists.
     	protected static final String EXTRA = "extra";
     	// Dictionary read line by line as tab separated columns.
     	// NOTE(review): the usage lists four columns but apply() rejects lines that
     	// do not have exactly 5 tab separated columns - confirm the expected format.
     	@Option(name="dictionary", usage="TSV Dictionary file with form, msd, lemma, source columns", widget="File", required=true, def="frolex.tsv")
     	public File dictionary = null;
     	// Optional file of tab separated (form, pos, lemma) rules.
     	@Option(name="extrarules", usage="form+pos rules files", widget="File", required=false, def="extrarules.tsv")
     	public File extrarules = null;
     	// Name of the pos property, looked up in the main corpus by apply().
     	@Option(name="posproperty", usage="The lexicon property to read", widget="String", required=true, def="frpos")
     	public String posproperty = null;
     	// Name of the property created/updated by the projection.
     	@Option(name="lemmaproperty", usage="The property to create/update in the corpus", widget="String", required=true, def="plemma")
     	public String lemmaproperty = null;
     	// Comma separated list of pos values, split by execute().
     	@Option(name="formAsLemmaPosList", usage="Pos values lemma exceptions", widget="String", required=false, def="NOMPro")
     	public String formAsLemmaPosList = null;
     	// Comma separated list of source names, split by execute() in priority order.
     	// NOTE(review): the usage text is the same as the lemmaproperty one - looks copy-pasted, confirm.
     	@Option(name="sourcePriorityList", usage="The property to create/update in the corpus", widget="String", required=true, def="TL")
     	public String sourcePriorityList = null;
     	/**
+    	 *
     	 */
     	public Object execute(ExecutionEvent event) throws ExecutionException {
     		Corpus corpus = null;
     		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
     		ISelection isel = window.getActivePage().getSelection();
     		if (isel instanceof IStructuredSelection) {
     			IStructuredSelection sel = (IStructuredSelection)isel;
     			Object first = sel.getFirstElement();
     			if (first instanceof Corpus) {
     				corpus = (Corpus)first;
     				if (ParametersDialog.open(this)) {
     					LinkedHashSet<String> formAsLemmaPosSet = new LinkedHashSet<String>();
     					formAsLemmaPosSet.addAll(Arrays.asList(formAsLemmaPosList.split(",")));
     					LinkedHashSet<String> sourcePrioritySet = new LinkedHashSet<String>();
     					if (extrarules != null && extrarules.exists()) sourcePrioritySet.add(EXTRA); // extra must be the first source
     					sourcePrioritySet.addAll(Arrays.asList(sourcePriorityList.split(",")));
     					System.out.println("formAsLemmaPosSet="+formAsLemmaPosSet);
     					System.out.println("sourcePrioritySet="+sourcePrioritySet);
     					apply(corpus, dictionary, extrarules, posproperty, lemmaproperty, formAsLemmaPosSet, sourcePrioritySet);
     					return corpus;
+    				}
+    			}
+    		}
     		System.out.println("Wrong selection.");
     		return null;
+    	}
     	public static void apply(final Corpus corpus, final File dictionary, final File extrarules, final String posproperty,
     			final String targetproperty, final LinkedHashSet<String> formAsLemmaPosList, final LinkedHashSet<String> sourceprioritylist) {
     		final MainCorpus mainCorpus = corpus.getMainCorpus();
     		final File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
     		final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName());
     		if (!txmDirectory.exists()) {
     			System.out.println("Can't process a corpus with no XML-TXM files directory: "+txmDirectory);
     			return;
+    		}
     		final File[] files = txmDirectory.listFiles(new FileFilter() {
     			@Override
     			public boolean accept(File file) {
     				return file.isFile() && file.canWrite() && file.getName().endsWith(".xml");
+    			}
     		});
     		Property pos = null;
     		try {
     			pos = mainCorpus.getProperty(posproperty);
     		} catch (CqiClientException e1) {
     			// TODO Auto-generated catch block
     			e1.printStackTrace();
+    		}
     		if (pos == null) {
     			System.out.println("No pos property found with name="+posproperty);
     			return;
+    		}
     		if (files == null || files.length == 0) {
     			System.out.println("Can't process a corpus with no XML-TXM files in "+txmDirectory);
     			return;
+    		}
     		System.out.println("APPLYING : "+dictionary+" to "+mainCorpus+": creating/updating "+targetproperty+ " property with lexicon "+dictionary);
     		JobHandler job = new JobHandler("Creating/Updating "+targetproperty+" property.") {
     			@Override
     			protected IStatus run(IProgressMonitor monitor) {
     				this.runInit(monitor);
     				Tuple t;
     				LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = new LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>>();
     				HashSet<String> formAsLemmaPosSet = new HashSet<String>();
     				try {
     					// load rules
     					BufferedReader reader = IOUtils.getReader(dictionary);
     					String line = reader.readLine();
     					while (line != null) {
     						String[] splitTab = line.split("\t");
     						if (splitTab.length != 5) {
     							System.out.println("Error in dictionary files with line='"+line+"': length is not 5. Found: "+Arrays.toString(splitTab));
     							line = reader.readLine();
     							reader.close();
     							return Status.CANCEL_STATUS;
+    						}
     						String form = splitTab[0];
     						String pos = splitTab[1];//.replace("<no_pos>|", "").replace("|<no_pos>|", "").replace("|<no_pos>", "");
     						String lemma = splitTab[2];//.replace("<no_lemma>|", "").replace("|<no_lemma>|", "").replace("|<no_lemma>", "");
     						String source = splitTab[3];
     						if (! rules.containsKey(form)) rules.put(form, new LinkedHashMap<String, LinkedHashMap<String, String>>());
     						LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(form);
     						if (!lemma.equals("<no_lemma>")) {
     							if (!posHash.containsKey(pos)) posHash.put(pos, new LinkedHashMap<String, String>());
     							LinkedHashMap<String, String> sourceHash = posHash.get(pos);
     							sourceHash.put(source,  lemma);
+    						}
     						line = reader.readLine();
+    					}
     					reader.close();
     					System.out.println("Dictionary rules loaded: "+rules.size());
     					if (extrarules.exists()) {
     						reader = IOUtils.getReader(extrarules);
     						line = reader.readLine();
     						while (line != null) {
     							String[] splitTab = line.split("\t");
     							if (splitTab.length != 3) {
     								System.out.println("Error in extra rule files with line='"+line+"': length is not 3.");
     								line = reader.readLine();
     								continue;
+    							}
     							String form = splitTab[0];
     							String pos = splitTab[1];
     							String lemma = splitTab[2];
     							if (! rules.containsKey(form)) rules.put(form, new LinkedHashMap<String, LinkedHashMap<String, String>>());
     							LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(form);
     							if (!posHash.containsKey(pos)) posHash.put(pos, new LinkedHashMap<String, String>());
     							LinkedHashMap<String, String> sourceHash = posHash.get(pos);
     							sourceHash.put(EXTRA,  lemma);
+    						}
     						reader.close();
     						System.out.println("Dictionary extra rules loaded: "+rules.size());
     					} else {
     						System.out.println("No extra rule loaded.");
+    					}
     					PrintWriter writer = IOUtils.getWriter("/tmp/rules.txt");
     					for (String k : rules.keySet()) {
     						writer.println("FORM="+k);
     						LinkedHashMap<String, LinkedHashMap<String, String>> rules2 = rules.get(k);
     						for (String k2 : rules2.keySet()) {
     							writer.println(" POS="+k2);
     							LinkedHashMap<String, String> rules3 = rules2.get(k2);
     							for (String k3 : rules3.keySet()) {
     								writer.println("  SOURCE="+k3);
     								String ls2 = rules3.get(k3);
     								writer.println("   LEMMA="+ls2);
+    							}
+    						}
+    					}
     					writer.close();
     					System.out.println("RULE DUMP: /tmp/rules.txt");
     					// load rules
     					for (String s : formAsLemmaPosList) {
     						formAsLemmaPosSet.add(s);
+    					}
     					System.out.println("POS exception rules loaded: "+formAsLemmaPosSet.size());
     					// save previous version of XML-TXM files
     					File previousXMLTXMDirectory = new File(txmDirectory.getAbsolutePath()+"_previous");
     					DeleteDir.deleteDirectory(previousXMLTXMDirectory);
     					FileCopy.copyFiles(txmDirectory, previousXMLTXMDirectory);
     					// work
     					File noMatchsFile = new File(TXMPreferences.getString(TBXPreferences.USER_TXM_HOME, TBXPreferences.PREFERENCES_NODE), "results/nomatch.txt");
     					HashSet<String> noMatchsSet = new HashSet<String>();
     					ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
     					for (File xmlFile : files) {
     						cpb.tick();
     						XMLLemmaProjection p = new XMLLemmaProjection(xmlFile, rules, formAsLemmaPosSet, sourceprioritylist, posproperty, targetproperty);
     						File tmpFile = new File(xmlFile.getParentFile(), "tmp_"+xmlFile.getName());
     						if (p.process(tmpFile)) {
     							if (xmlFile.delete() && tmpFile.renameTo(xmlFile)) {
     								// ok
     							} else {
     								System.out.println("Error during lemma projection: can't replace XML-TXM file: "+xmlFile);
     								return Status.CANCEL_STATUS;
+    							}
     						} else {
     							System.out.println("Error during lemma projection. Aborting.");
     							return Status.CANCEL_STATUS;
+    						}
     						if (p.getNoMatchValues().size() > 0) {
     							System.out.println("No matchs found with file "+xmlFile.getName()+": "+p.getNoMatchValues());
     							noMatchsSet.addAll(p.getNoMatchValues());
+    						}
+    					}
     					if (noMatchsSet.size() > 0) {
     						System.out.println("Missing lemma values report saved in: "+noMatchsFile);
     						IOUtils.write(noMatchsFile, StringUtils.join(noMatchsSet, "\n"));
+    					}
     					cpb.done();
     					monitor.worked(50);
     					// update corpus
     					// update corpus indexes and edition
     					//					String txmhome = Toolbox.getParam(Toolbox.USER_TXM_HOME);
     					//
     					//					BaseParameters params = corpus.getBase().getBaseParameters();
     					//					params.getKeyValueParameters().put(ImportKeys.MULTITHREAD, "false"); //too soon
     					//					params.getKeyValueParameters().put(ImportKeys.DEBUG, Log.getLevel().intValue() < Level.WARNING.intValue()); // need debug for experimental stuff
     					//					params.getKeyValueParameters().put(ImportKeys.UPDATECORPUS, "true");
     					//
     					//					monitor.setTaskName("Updating corpus");
     					//					File scriptDir = new File(txmhome, "scripts/import");
     					//					File script = new File(scriptDir, "xtzLoader.groovy");
     					//					System.out.println("Updating corpus "+corpus+" using "+params.paramFile);
     					//					boolean ret = ExecuteImportScript.executeScript(script.getAbsolutePath(), params);
     					//					if (!ret) {
     					//						System.out.println("Error during corpus re-import, check the XML-TXM files. Previous version can be restored from "+previousXMLTXMDirectory);
     					//						return Status.CANCEL_STATUS;
     					//					}
     					//					Display.getDefault().syncExec(new Runnable() {
     					//						@Override
     					//						public void run() {CloseEditorsUsing.corpus(corpus);}
     					//					});
     					//					monitor.worked(50);
     				} catch (Exception e) {
     					e.printStackTrace();
     					return Status.CANCEL_STATUS;
+    				}
     				System.out.println("Done.");
     				return Status.OK_STATUS;
+    			}
     		};
     		job.schedule();
+    	}
+    }

     package org.txm.treetagger.commands;
     import java.io.File;
     import java.io.IOException;
     import java.util.HashSet;
     import java.util.LinkedHashMap;
     import java.util.LinkedHashSet;
     import java.util.regex.Pattern;
     import javax.xml.stream.XMLStreamException;
     import org.txm.importer.StaxIdentityParser;
     public class XMLLemmaProjection extends StaxIdentityParser {
     	// form -> pos -> source -> lemma
     	protected LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = null;
     	protected HashSet<String> formAsLemmaPosList = null;
     	protected String lemmaProperty;
     	protected HashSet<String> noMatchValues = new HashSet<String>();
     	protected String posProperty;
     	protected LinkedHashSet<String> lemmaSourcePriorityList;
     	public XMLLemmaProjection(File infile, LinkedHashMap<String, LinkedHashMap<String,
     			LinkedHashMap<String, String>>> rules,
     			HashSet<String> formAsLemmaPosList,
     			LinkedHashSet<String> lemmaSourcePriorityList,
     			String posProperty, String lemmaProperty) throws IOException, XMLStreamException {
     		super(infile);
     		this.rules = rules;
     		this.formAsLemmaPosList = formAsLemmaPosList;
     		this.lemmaSourcePriorityList = lemmaSourcePriorityList;
     		this.lemmaProperty = lemmaProperty;
     		this.posProperty = posProperty;
     		// the XML-TXM files word properties name starts wit # (they are references)
     		if (!this.lemmaProperty.startsWith("#")) this.lemmaProperty = "#"+this.lemmaProperty;
     		if (!this.posProperty.startsWith("#")) this.posProperty = "#"+this.posProperty;
+    	}
     	boolean inW = false, inAna = false, inForm;
     	LinkedHashMap<String, String> anaValues = new LinkedHashMap<String, String>();
     	LinkedHashMap<String, String> anaResps = new LinkedHashMap<String, String>();
     	String typeName = null;
     	String respName = null;
     	String formValue, typeValue = null;
     	@Override
     	public void processStartElement() throws XMLStreamException, IOException {
     		if (!inW) super.processStartElement(); // don't write W content
     		if (localname.equals("w")) {
     			inW = true;
     			anaValues.clear();
     			anaResps.clear();
     			//initialize the new type to a empty value in case there is transformation rule
     			anaValues.put(lemmaProperty, "");
     			anaResps.put(lemmaProperty, "#txm_recode");
     		} else if (localname.equals("ana")) {
     			inAna = true;
     			typeName = parser.getAttributeValue(null, "type");
     			respName = parser.getAttributeValue(null, "resp");
     			anaResps.put(typeName, respName);
     			//if (typeName != null) typeName = typeName.substring(1); // remove #
     			typeValue = "";
     		} else if (localname.equals("form")) {
     			inForm = true;
     			formValue = "";
+    		}
+    	}
     	@Override
     	public void processCharacters() throws XMLStreamException {
     		if (inW && inAna) typeValue+=parser.getText();
     		else if (inW && inForm) formValue+=parser.getText();
     		else super.processCharacters();
+    	}
     	@Override
     	public void processEndElement() throws XMLStreamException {
     		if (localname.equals("w")) {
     			inW = false;
     			// write W content
     			try {
     				// get the value to test
     				String posValue = anaValues.get(posProperty);
     				if (posValue == null) {
     					posValue = "<no_pos>";
     //					anaValues.put(posProperty, "<no_pos>");
     //					anaResps.put(posProperty, "txm_recode");
+    				}
     				String value = updateAnaValuesIfMatch(formValue.trim(), posValue.trim());
     				//System.out.println("form="+formValue+" + pos="+posValue+" -> "+value);
     				anaValues.put(lemmaProperty, value);
     				anaResps.put(lemmaProperty, "#txm_recode");
     				// write the word element
     				writer.writeStartElement("txm:form");
     				writer.writeCharacters(formValue);
     				writer.writeEndElement();
     				for (String k : anaValues.keySet()) {
     					writer.writeStartElement("txm:ana");
     					writer.writeAttribute("resp", anaResps.get(k));
     					writer.writeAttribute("type", k);
     					writer.writeCharacters(anaValues.get(k));
     					writer.writeEndElement();
+    				}
     			} catch (XMLStreamException e) {
     				e.printStackTrace();
+    			}
     		} else if (localname.equals("ana")) {
     			anaValues.put(typeName, typeValue);
     			inAna = false;
     		} else if (localname.equals("form")) {
     			inForm = false;
+    		}
     		if (!inW) super.processEndElement(); // don't write W content now
+    	}
     	protected String updateAnaValuesIfMatch(String formValue, String posValue) {
     		if (posValue == null) return "";
     		if (formAsLemmaPosList.contains(posValue)) return formValue;
     		if (formValue.equals("virge")) System.out.println("testing: "+formValue+" "+posValue);
     		if (formValue.equals("virge")) System.out.println("form connue? "+rules.containsKey(formValue));
     		if (rules.containsKey(formValue)) {
     			LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(formValue);
     //			if (posHash.containsKey(posValue)) {
     //				LinkedHashMap<String, String> sourceHash = posHash.get(posValue);
     //				for (String source : lemmaSourcePriorityList) {
     //					if (sourceHash.containsKey(source)) {
     //						return sourceHash.get(source);
     //					}
     //				}
     //			}
     			if (formValue.equals("virge")) System.out.println(" tests"+posHash.keySet());
     			for (String posRegexp : posHash.keySet()) {
     				if (posValue.matches(posRegexp)) {
     					return posHash.get(posRegexp).toString();
+    				}
+    			}
+    		}
     		// try without maj
     		String formValueMin = formValue.toLowerCase();
     		if (rules.containsKey(formValueMin)) {
     			LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(formValueMin);
     //			if (posHash.containsKey(posValue)) {
     //				LinkedHashMap<String, String> sourceHash = posHash.get(posValue);
     //				for (String source : lemmaSourcePriorityList) {
     //					if (sourceHash.containsKey(source)) {
     //						return sourceHash.get(source);
     //					}
     //				}
     //			}
     			for (String posRegexp : posHash.keySet()) {
     				if (posValue.matches(posRegexp)) {
     					return posHash.get(posRegexp).toString();
+    				}
+    			}
+    		}
     		noMatchValues.add(formValue+"|"+posValue);
     		return "!"+formValue;
+    	}
     	public HashSet<String> getNoMatchValues() {
     		return noMatchValues;
+    	}
     	public static void main(String args[]) {
     		File xmlFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test.xml");
     		File tmpFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test-o.xml");
     		String posProperty = "type";
     		String newType = "lemma";
     		LinkedHashMap<Pattern[], String> rules = new LinkedHashMap<Pattern[], String>();
     		rules.put(new Pattern[]{Pattern.compile("w"), Pattern.compile("w")}, "WORD");
     		rules.put(new Pattern[]{Pattern.compile("x.+"), Pattern.compile("w")}, "XWORD");
     		rules.put(new Pattern[]{Pattern.compile("y"), Pattern.compile("w")}, "YWORD");
     		rules.put(new Pattern[]{Pattern.compile("y.*"), Pattern.compile("w")}, "YMULTIWORD");
     		//XMLPropertyProjection converter = new XMLPropertyProjection(xmlFile, rules, posProperty, newType);
     		//System.out.println(converter.process(tmpFile));
+    	}
+    }

     package org.txm.treetagger.commands;
     import java.io.BufferedReader;
     import java.io.File;
     import java.io.FileFilter;
     import java.util.Arrays;
     import java.util.HashSet;
     import java.util.LinkedHashMap;
     import java.util.LinkedHashSet;
     import org.apache.commons.lang.StringUtils;
     import org.eclipse.core.commands.AbstractHandler;
     import org.eclipse.core.commands.ExecutionEvent;
     import org.eclipse.core.commands.ExecutionException;
     import org.eclipse.core.runtime.IProgressMonitor;
     import org.eclipse.core.runtime.IStatus;
     import org.eclipse.core.runtime.Status;
     import org.eclipse.jface.viewers.ISelection;
     import org.eclipse.jface.viewers.IStructuredSelection;
     import org.eclipse.ui.IWorkbenchWindow;
     import org.eclipse.ui.handlers.HandlerUtil;
     import org.kohsuke.args4j.Option;
     import org.txm.Toolbox;
     import org.txm.core.preferences.TBXPreferences;
     import org.txm.core.preferences.TXMPreferences;
     import org.txm.rcp.swt.widget.parameters.ParametersDialog;
     import org.txm.rcp.utils.JobHandler;
     import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
     import org.txm.searchengine.cqp.corpus.Corpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     import org.txm.searchengine.cqp.corpus.Property;
     import org.txm.stat.utils.ConsoleProgressBar;
     import org.txm.utils.DeleteDir;
     import org.txm.utils.io.FileCopy;
     import org.txm.utils.io.IOUtils;
     import org.txm.utils.Tuple;
     /**
      * Our sample handler extends AbstractHandler, an IHandler base class.
      * @see org.eclipse.core.commands.IHandler
      * @see org.eclipse.core.commands.AbstractHandler
      */
     public class RemoveProperties extends AbstractHandler {
     	@Option(name="propertiesList", usage="The properties to remove", widget="String", required=true, def="plemma")
     	public String propertiesList = null;
     	/**
+    	 *
     	 */
     	public Object execute(ExecutionEvent event) throws ExecutionException {
     		Corpus corpus = null;
     		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
     		ISelection isel = window.getActivePage().getSelection();
     		if (isel instanceof IStructuredSelection) {
     			IStructuredSelection sel = (IStructuredSelection)isel;
     			Object first = sel.getFirstElement();
     			if (first instanceof Corpus) {
     				corpus = (Corpus)first;
     				if (ParametersDialog.open(this)) {
     					LinkedHashSet<String> propertiesSet = new LinkedHashSet<String>();
     					propertiesSet.addAll(Arrays.asList(propertiesList.split(",")));
     					apply(corpus, propertiesSet);
     					return corpus;
+    				}
+    			}
+    		}
     		System.out.println("Wrong selection.");
     		return null;
+    	}
     	public static void apply(final Corpus corpus, final HashSet<String> propertiesSet) {
     		final MainCorpus mainCorpus = corpus.getMainCorpus();
     		final File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
     		final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName());
     		if (!txmDirectory.exists()) {
     			System.out.println("Can't process a corpus with no XML-TXM files directory: "+txmDirectory);
     			return;
+    		}
     		final File[] files = txmDirectory.listFiles(new FileFilter() {
     			@Override
     			public boolean accept(File file) {
     				return file.isFile() && file.canWrite() && file.getName().endsWith(".xml");
+    			}
     		});
     		if (files == null || files.length == 0) {
     			System.out.println("Can't process a corpus with no XML-TXM files in "+txmDirectory);
     			return;
+    		}
     		System.out.println("Removing "+propertiesSet+" to "+mainCorpus+" XML-TXM files...");
     		JobHandler job = new JobHandler("Removing "+propertiesSet+" to "+mainCorpus+" XML-TXM files.") {
     			@Override
     			protected IStatus run(IProgressMonitor monitor) {
     				this.runInit(monitor);
     				LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = new LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>>();
     				HashSet<String> no_change_rules = new HashSet<String>();
     				try {
     					// save previous version of XML-TXM files
     					File previousXMLTXMDirectory = new File(txmDirectory.getAbsolutePath()+"_previous");
     					DeleteDir.deleteDirectory(previousXMLTXMDirectory);
     					FileCopy.copyFiles(txmDirectory, previousXMLTXMDirectory);
     					// work
     					File noMatchsFile = new File(TXMPreferences.getString(TBXPreferences.USER_TXM_HOME, TBXPreferences.PREFERENCES_NODE), "results/nomatch.txt");
     					HashSet<String> noMatchsSet = new HashSet<String>();
     					ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
     					for (File xmlFile : files) {
     						cpb.tick();
     						XMLRemoveProperties p = new XMLRemoveProperties(xmlFile, propertiesSet);
     						File tmpFile = new File(xmlFile.getParentFile(), "tmp_"+xmlFile.getName());
     						if (p.process(tmpFile)) {
     							if (xmlFile.delete() && tmpFile.renameTo(xmlFile)) {
     								// ok
     							} else {
     								System.out.println("Error during properties removal: can't replace XML-TXM file: "+xmlFile);
     								return Status.CANCEL_STATUS;
+    							}
     						} else {
     							System.out.println("Error during properties removal. Aborting.");
     							return Status.CANCEL_STATUS;
+    						}
+    					}
     					cpb.done();
     					monitor.worked(50);
     				} catch (Exception e) {
     					e.printStackTrace();
     					return Status.CANCEL_STATUS;
+    				}
     				System.out.println("Done.");
     				return Status.OK_STATUS;
+    			}
     		};
     		job.schedule();
+    	}
+    }

     package org.txm.treetagger.commands;
     import java.io.File;
     import java.io.IOException;
     import java.util.HashSet;
     import java.util.LinkedHashMap;
     import java.util.regex.Pattern;
     import javax.xml.stream.XMLStreamException;
     import org.txm.importer.StaxIdentityParser;
     /**
      * Remove XML-TXM file ana elements which 'type' attribute value is in a set
+     *
      * @author mdecorde
+     *
      */
     public class XMLRemoveProperties extends StaxIdentityParser {
     	// form -> pos -> source -> lemma
     	protected HashSet<String> propertiesSet = null;
     	/**
+    	 *
     	 * @param infile the XML-TXM file to process
     	 * @param propertiesSet the set of ana@type attributes to remove
     	 * @throws XMLStreamException
     	 * @throws IOException
     	 */
     	public XMLRemoveProperties(File infile, HashSet<String> propertiesSet) throws IOException, XMLStreamException {
     		super(infile);
     		this.propertiesSet = new HashSet<String>();
     		for (String property : propertiesSet) {
     			// the XML-TXM files word properties name starts with # (they are references)
     			if (!property.startsWith("#")) property = "#"+property;
     			this.propertiesSet.add(property);
+    		}
+    	}
     	boolean inW = false, inAna = false;
     	String typeName = null;
     	@Override
     	public void processStartElement() throws XMLStreamException, IOException {
     		if (localname.equals("w")) {
     			inW = true;
     		} else if (inW && localname.equals("ana")) {
     			inAna = true;
     			typeName = parser.getAttributeValue(null, "type");
     			if (propertiesSet.contains(typeName)) return; // don't write this element start tag
+    		}
     		super.processStartElement();
+    	}
     	@Override
     	public void processCharacters() throws XMLStreamException {
     		if (inW && typeName != null && propertiesSet.contains(typeName)) {return;} // don't write the element content
     		super.processCharacters();
+    	}
     	@Override
     	public void processEndElement() throws XMLStreamException {
     		if (localname.equals("w")) {
     			inW = false;
     		} else if (inW && localname.equals("ana")) {
     			inAna = false;
     			if (propertiesSet.contains(typeName)) {typeName = null; return;} // don't write the element end tag
     			typeName = null;
+    		}
     		super.processEndElement(); // don't write W content now
+    	}
     	public static void main(String args[]) {
     		File xmlFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test.xml");
     		File tmpFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test-o.xml");
     		String posProperty = "type";
     		String newType = "lemma";
     		LinkedHashMap<Pattern[], String> rules = new LinkedHashMap<Pattern[], String>();
     		rules.put(new Pattern[]{Pattern.compile("w"), Pattern.compile("w")}, "WORD");
     		rules.put(new Pattern[]{Pattern.compile("x.+"), Pattern.compile("w")}, "XWORD");
     		rules.put(new Pattern[]{Pattern.compile("y"), Pattern.compile("w")}, "YWORD");
     		rules.put(new Pattern[]{Pattern.compile("y.*"), Pattern.compile("w")}, "YMULTIWORD");
     		//XMLPropertyProjection converter = new XMLPropertyProjection(xmlFile, rules, posProperty, newType);
     		//System.out.println(converter.process(tmpFile));
+    	}
+    }

... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 460