/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 2941

     	 * @return the string
     	 */
     	public static String format(String str, String lang) {
     		if (lang == null)
     		if (lang == null) {
     			lang = Locale.getDefault().getLanguage();
+    		}
     		for (String punc : getNoSpaceAfter(lang)) {
     			str = str.replace(punc + " ", punc); //$NON-NLS-1$
+    		}
-...
     		return str;
+    	}
     	public static boolean isSpaceAfterNotNeeded(String str, String lang) {
     		for (String t : getNoSpaceAfter(lang)) {
     			if (str.endsWith(t)) {
     				return true;
+    			}
+    		}
     		return false;
+    	}
     	public static boolean isSpaceBeforeNotNeeded(String str, String lang) {
     		for (String t : getNoSpaceBefore(lang)) {
     			if (str.startsWith(t)) {
     				return true;
+    			}
+    		}
     		return false;
+    	}
     	/**
     	 * Format.
+    	 *

           </command>
           <command
                 categoryId="TIGERSearch4TXM.commands.category"
                 defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotations"
                 id="org.txm.tigersearch.commands.ImportCONNLUAnnotations"
                 name="Import CONNL-u Annotations...">
                 defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
                 id="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
                 name="Import CONNL-u Annotations from a directory...">
           </command>
           <command
                 categoryId="TIGERSearch4TXM.commands.category"
-...
                 id="org.txm.tigersearch.commands.ExportCorpusAsCONNLU"
                 name="Export CONNL-u Annotations...">
           </command>
           <!-- Declares the command that imports CoNLL-U annotations from a single file;
                it is handled by the ImportCONNLUAnnotationsFromFile class. -->
           <command
                 categoryId="TIGERSearch4TXM.commands.category"
                 defaultHandler="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile"
                 id="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile"
                 name="Import CONNL-u Annotations from a CoNNL-U file...">
           </command>
        </extension>
        <extension
              point="org.eclipse.core.expressions.propertyTesters">
-...
                    </visibleWhen>
                 </command>
                 <command
                       commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotations"
                       commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromFile"
                       icon="icons/functions/UDplus.png"
                       style="push">
                    <visibleWhen
-...
                       </or>
                    </visibleWhen>
                 </command>
                 <!-- Menu entry for the "import from a directory" command. It is visible when
                      either the TIGERSearchReady property test succeeds or the
                      OneMainCorpusSelected definition holds (the two are combined with <or>). -->
                 <command
                       commandId="org.txm.tigersearch.commands.ImportCONNLUAnnotationsFromDirectory"
                       icon="icons/functions/UDplus.png"
                       style="push">
                    <visibleWhen
                          checkEnabled="false">
                       <or>
                          <test
                                forcePluginActivation="true"
                                property="org.txm.rcp.testers.TIGERSearchReady"
                                value="TIGERSearchReady">
                          </test>
                          <reference
                                definitionId="OneMainCorpusSelected">
                          </reference>
                       </or>
                    </visibleWhen>
                 </command>
              </menu>
           </menuContribution>
           <menuContribution

     // Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     //
     //
     // $LastChangedDate:$
     // $LastChangedRevision:$
     // $LastChangedBy:$
     //
     package org.txm.tigersearch.commands;
     import java.io.BufferedReader;
     import java.io.File;
     import java.io.FileFilter;
     import java.io.IOException;
     import java.util.HashMap;
     import javax.xml.stream.XMLStreamException;
     import org.apache.commons.lang.StringUtils;
     import org.eclipse.core.commands.AbstractHandler;
     import org.eclipse.core.commands.ExecutionEvent;
     import org.eclipse.core.commands.ExecutionException;
     import org.eclipse.jface.viewers.IStructuredSelection;
     import org.eclipse.ui.handlers.HandlerUtil;
     import org.kohsuke.args4j.Option;
     import org.txm.core.messages.TXMCoreMessages;
     import org.txm.objects.Text;
     import org.txm.rcp.commands.workspace.UpdateCorpus;
     import org.txm.rcp.swt.widget.parameters.ParametersDialog;
     import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
     import org.txm.searchengine.cqp.corpus.CQPCorpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     import org.txm.searchengine.cqp.serverException.CqiServerError;
     import org.txm.utils.io.FileCopy;
     import org.txm.utils.io.IOUtils;
     import org.txm.utils.logger.Log;
     import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
     import cern.colt.Arrays;
     /**
      * Import CONNLU annotations into a TXM corpus
+     *
      * IF the corpus already contains CONNLU annotations, they are replaced
+     *
      * @author mdecorde.
      */
     public class ImportCONNLUAnnotations extends AbstractHandler {
     	public static final String ID = ImportCONNLUAnnotations.class.getName();
     	@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory")
     	File connluDirectory;
     	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
     	String propertiesPrefix;
     	/*
     	 * (non-Javadoc)
     	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
     	 */
     	@Override
     	public Object execute(final ExecutionEvent event) throws ExecutionException {
     		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
     		Object s = selection.getFirstElement();
     		if (!(s instanceof MainCorpus)) {
     			Log.warning("Selection is not a corpus. Aborting.");
     			return null;
+    		}
     		if (!ParametersDialog.open(this)) {
     			return null;
+    		}
     		if (connluDirectory == null || !connluDirectory.exists() || !connluDirectory.isDirectory() || connluDirectory.listFiles().length == 0) {
     			Log.warning("Error: connlu directory is empty: " + connluDirectory);
     			return null;
+    		}
     		CQPCorpus corpus = (CQPCorpus) s;
     		MainCorpus mainCorpus = corpus.getMainCorpus();
     		try {
     			return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix);
+    		}
     		catch (Exception e) {
     			Log.warning(e);
     			e.printStackTrace();
+    		}
     		return null;
+    	}
     	/**
+    	 *
     	 * if import CONNLU annotations in the corpus with the same name already exists, it is replaced
+    	 *
     	 * @param corpus
     	 * @param connluDirectory
     	 * @param propertiesPrefix
     	 * @return the number of imported annotations
     	 * @throws CqiClientException
     	 * @throws CqiServerError
     	 * @throws IOException
     	 * @throws XMLStreamException
     	 */
     	public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix) throws IOException,
     			CqiServerError, CqiClientException, XMLStreamException {
     		Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluDirectory, mainCorpus, propertiesPrefix));
     		File[] files = connluDirectory.listFiles(new FileFilter() {
     			@Override
     			public boolean accept(File file) {
     				return file.isFile() && file.getName().endsWith(".conllu");
+    			}
     		});
     		int nTextProcessed = 0;
     		int nWords = 0;
     		int nWordsInserted = 0;
     		for (File coonluFile : files) {
     			String textid = coonluFile.getName().substring(0, coonluFile.getName().length() - 7);
     			Log.info("** processing text: " + textid);
     			Text text = mainCorpus.getProject().getText(textid);
     			if (text == null) {
     				Log.warning("No text found with ID=" + textid);
     				continue;
+    			}
     			File xmltxmFile = mainCorpus.getProject().getText(textid).getXMLTXMFile();
     			File xmltxmUpdatedFile = new File(System.getProperty("java.io.tmpdir"), xmltxmFile.getName());
     			XMLTXMWordPropertiesInjection processor = new XMLTXMWordPropertiesInjection(xmltxmFile);
     			HashMap<String, HashMap<String, String>> rules = new HashMap<>();
     			processor.setProperties(rules);
     			BufferedReader reader = IOUtils.getReader(coonluFile);
     			String line = reader.readLine();
     			String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
     			int nWords2 = 0;
     			int nLine = 0;
     			String sent_id = null;
     			String newpar_id = null;
     			String newdoc_id = null;
     			while (line != null) {
     				nLine++;
     				if (line.length() == 0) {
     					line = reader.readLine();
     					continue; // comment
+    				}
     				if (line.startsWith("#")) {
     					if (line.startsWith("# sent_id = ")) {
     						sent_id = line.substring(12).trim();
+    					}
     					else if (line.startsWith("# newdoc id = ")) {
     						newdoc_id = line.substring(14).trim();
+    					}
     					else if (line.startsWith("# newpar id = ")) {
     						newpar_id = line.substring(14).trim();
+    					}
     					else {
     						// nothing for now
+    					}
     					line = reader.readLine();
     					continue; // comment
+    				}
     				String[] split = line.split("\t", 10);
     				if (split.length < 10) {
     					Log.warning("Error: line " + nLine + " : " + line + " -> " + Arrays.toString(split) + " len=" + split.length);
     					line = reader.readLine();
     					continue; // comment
+    				}
     				String id = split[9];
     				int from = id.indexOf("XmlId=") + 6;
     				if (from < 6) {
     					Log.warning("Error: line " + nLine + " with no 'XmlId=': " + line);
     					line = reader.readLine();
     					continue;
+    				}
     				id = id.substring(from);
     				// System.out.println("ID=" + id);
     				if (id.contains("-")) continue; // TODO to manage later
     				HashMap<String, String> properties = new HashMap<>();
     				for (int i = 0; i < split.length; i++) {
     					properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#'
+    				}
     				if (sent_id != null) {
     					properties.put("#ud-sentid", sent_id);
     					sent_id = ""; // reset value for next sentence
     				} else {
     					properties.put("#ud-sentid", "");
+    				}
     				if (newdoc_id != null) {
     					properties.put("#ud-newdocid", newdoc_id);
     					newdoc_id = null; // reset value for next sentence
     				} else {
     					properties.put("#ud-newdocid", "");
+    				}
     				if (newpar_id != null) {
     					properties.put("#ud-newparid", newpar_id);
     					newpar_id = null; // reset value for next sentence
     				} else {
     					properties.put("#ud-newparid", "");
+    				}
     				processor.addProperty(id, properties);
     				nWords2++;
     				line = reader.readLine();
+    			}
     			reader.close();
     			if (nWords2 == 0) {
     				Log.warning("** No annotation found in " + coonluFile);
+    			}
     			nWords += nWords2;
     			Log.info("** loading annotations from : " + coonluFile);
     			if (processor.process(xmltxmUpdatedFile)) {
     				if (xmltxmFile.delete() && FileCopy.copy(xmltxmUpdatedFile, xmltxmFile)) {
     					if (processor.getNonActivatedRules().size() > 0) {
     						Log.warning("Warning: some words were not imported: "+StringUtils.join(processor.getNonActivatedRules(), ", "));
+    					}
+    				}
     				else {
     					Log.warning("** Warning: annotation import failed for replace the corpus XML-TXM file: " + xmltxmFile + ". TEMP file: " + xmltxmUpdatedFile);
     					return 0;
+    				}
+    			}
     			else {
     				Log.warning("** Warning: annotation import failed for text: " + textid);
     				return 0;
+    			}
     			if (processor.getNInsertions() == 0) {
     				Log.warning("** No annotation imported in " + textid);
+    			}
     			nWordsInserted += processor.getNInsertions();
     			nTextProcessed++;
+    		}
     		if (nTextProcessed == 0) {
     			Log.warning("** No text to process. Aborting.");
     			return 0;
+    		}
     		if (nWords == 0) {
     			Log.warning("** No annotation to import in corpus. Aborting.");
     			return 0;
+    		}
     		if (nWordsInserted == 0) {
     			Log.warning("** No annotation imported. Aborting.");
     			return 0;
+    		}
     		Log.info("XML-TXM source files updated. Updating indexes...");
     		UpdateCorpus.update(mainCorpus);
     		Log.info("Done.");
     		return 0;
+    	}
+    }

     // Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     //
     //
     // $LastChangedDate:$
     // $LastChangedRevision:$
     // $LastChangedBy:$
     //
     package org.txm.tigersearch.commands;
     import java.io.BufferedReader;
     import java.io.File;
     import java.io.FileFilter;
     import java.io.IOException;
     import java.util.HashMap;
     import javax.xml.stream.XMLStreamException;
     import org.apache.commons.lang.StringUtils;
     import org.eclipse.core.commands.AbstractHandler;
     import org.eclipse.core.commands.ExecutionEvent;
     import org.eclipse.core.commands.ExecutionException;
     import org.eclipse.jface.viewers.IStructuredSelection;
     import org.eclipse.ui.handlers.HandlerUtil;
     import org.kohsuke.args4j.Option;
     import org.txm.core.messages.TXMCoreMessages;
     import org.txm.objects.Text;
     import org.txm.rcp.commands.workspace.UpdateCorpus;
     import org.txm.rcp.swt.widget.parameters.ParametersDialog;
     import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
     import org.txm.searchengine.cqp.corpus.CQPCorpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     import org.txm.searchengine.cqp.serverException.CqiServerError;
     import org.txm.utils.io.FileCopy;
     import org.txm.utils.io.IOUtils;
     import org.txm.utils.logger.Log;
     import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
     import cern.colt.Arrays;
     /**
      * Import CONNLU annotations into a TXM corpus
+     *
      * IF the corpus already contains CONNLU annotations, they are replaced
+     *
      * @author mdecorde.
      */
     public class ImportCONNLUAnnotationsFromFile extends AbstractHandler {
     	public static final String ID = ImportCONNLUAnnotationsFromFile.class.getName();
     	@Option(name = "connluFile", usage = "CoNLL-U file", widget = "FileOpen", required = true, def = "file.conllu")
     	File connluFile;
     	@Option(name = "textId", usage = "Identifier of the text to update", widget = "String", required = true, def = "text-id")
     	String textId;
     	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
     	String propertiesPrefix;
     	/*
     	 * (non-Javadoc)
     	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
     	 */
     	@Override
     	public Object execute(final ExecutionEvent event) throws ExecutionException {
     		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
     		Object s = selection.getFirstElement();
     		if (!(s instanceof MainCorpus)) {
     			Log.warning("Selection is not a corpus. Aborting.");
     			return null;
+    		}
     		if (!ParametersDialog.open(this)) {
     			return null;
+    		}
     		if (connluFile == null || !connluFile.exists() || !connluFile.isFile()) {
     			Log.warning("Error: cannot access to the connlu file: " + connluFile);
     			return null;
+    		}
     		CQPCorpus corpus = (CQPCorpus) s;
     		MainCorpus mainCorpus = corpus.getMainCorpus();
     		try {
     			return ImportCONNLUAnnotationsFromDirectory.importAnnotationsFromCoNNLUFile(mainCorpus, connluFile, propertiesPrefix, textId);
+    		}
     		catch (Exception e) {
     			Log.warning(e);
     			e.printStackTrace();
+    		}
     		return null;
+    	}
+    }

     	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
     	String propertiesPrefix;
     	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "options")
     	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comments options")
     	Boolean separator = false;
     	// @Option(name = "useUDForms", usage = "use the ud form property instead of CQP 'word' property", widget = "Boolean", required = true, def = "true")
     	Boolean useUDForms = true;
     	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
     	Boolean insertParagraphs = false;
     	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
     	Boolean detectGap = false;
     	// @Option(name = "transfertAllWords", usage = "Transfert word not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true")
     	Boolean transfertAllWords = true;
     	@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options")
     	Boolean separator3 = false;
     	@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the Connlu corpus", widget = "Boolean", required = true, def = "true")
     	Boolean insertNoSpaceAfter = true;
     	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the Connlu corpus", widget = "Boolean", required = true, def = "true")
     	Boolean insertParagraphs = false;
     	@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
     	Boolean insertTokenWithoutUdAnnotations;
     	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
     	Boolean detectGap = false;
     	@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
     	String defaultFormPropertyName;
     	@Option(name = "formCorrPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
     	String formCorrPropertyName;
     	@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
     	String defaultLemmaPropertyName;
     	@Option(name = "lemmaCorrPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
     	String lemmaCorrPropertyName;
     	@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
     	String defaultUposPropertyName;
     	@Option(name = "uposCorrPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
     	String uposCorrPropertyName;
     	@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
     	String defaultXposPropertyName;
     	@Option(name = "xposCorrPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
     	String xposCorrPropertyName;
     	@Option(name = "punctStrong", usage = "optional strong punct tag to fix sentence limits", widget = "String", required = true, def = "PONfrt")
     	String punctStrong;
     	@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
     	Boolean separator2 = false;
     	@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]")
     	String openingPunct;
     	/**
     	 * The UD property suffixes. Each suffix is appended to the properties prefix
     	 * to build the name of the matching CQP property (prefix + suffix).
     	 */
     	public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
     	/*
-...
     		MainCorpus mainCorpus = corpus.getMainCorpus();
     		try {
     			return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct,
     					formCorrPropertyName, lemmaCorrPropertyName, uposCorrPropertyName, xposCorrPropertyName, punctStrong,
     			return exportAnnotationsAsCorpus(mainCorpus, connluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
     					defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
     					detectGap, insertParagraphs, insertNoSpaceAfter);
+    		}
     		catch (Exception e) {
-...
+    	}
     	/**
     	 * export the corpus in a directory of conllu files (one per text)
+    	 *
     	 * If a TIGER corpus with the same name already exists, it is replaced.
+    	 *
     	 * @param corpus
     	 * @param tigerCorpusDirectory
     	 * @return the number of imported annotations
     	 * @throws IndexException
     	 * @throws QueryIndexException
     	 * @param mainCorpus
     	 * @param conlluResultDirectory
     	 * @param prefix
     	 * @param openingPunct
     	 * @param insertTokenWithoutUdAnnotations
     	 * @param defaultFormPropertyName
     	 * @param defaultLemmaPropertyName
     	 * @param defaultUposPropertyName
     	 * @param defaultXposPropertyName
     	 * @param detectGap
     	 * @param insertParagraphs
     	 * @param insertNoSpaceAfter
     	 * @return the number of annotation exported
     	 * @throws UnexpectedAnswerException
     	 * @throws IOException
     	 * @throws CqiServerError
     	 * @throws CqiClientException
     	 * @throws CqiServerError
     	 * @throws IOException
     	 * @throws UnexpectedAnswerException
     	 * @throws InvalidCqpIdException
     	 */
     	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, String formCorrPropertyName, String lemmaCorrPropertyName,
     			String uposCorrPropertyName, String xposCorrPropertyName, String punctStrongRegex, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter) throws UnexpectedAnswerException,
     	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations,
     			String defaultFormPropertyName, String defaultLemmaPropertyName,
     			String defaultUposPropertyName, String defaultXposPropertyName, boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
     			throws UnexpectedAnswerException,
     			IOException,
     			CqiServerError,
     			CqiClientException, InvalidCqpIdException {
-...
     		int[] start_limits = mainCorpus.getTextStartLimits();
     		int[] end_limits = mainCorpus.getTextEndLimits();
     		HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang()));
     		HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang()));
     		String lang = mainCorpus.getLang();
     		// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang()));
     		// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang()));
     		for (String p : propNames) {
     			WordProperty wp = mainCorpus.getProperty(prefix + p);
-...
+    			}
+    		}
     		if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) {
     			Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true");
     			return 0;
+    		}
     		for (int iText = 0; iText < start_limits.length; iText++) {
     			// Build corpus positions
-...
     			HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, "ud-sentid");
     			HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, "ud-newdocid");
     			// build sentence, first pass using UD word sentence positions
     			ArrayList<ArrayList<Integer>> sentences = new ArrayList<>();
     			ArrayList<Integer> tmpSentence = new ArrayList<>();
-...
     					tmpSentence.clear();
+    				}
     				tmpSentence.add(p);
     				if (insertTokenWithoutUdAnnotations) {
     					tmpSentence.add(p); // insert all tokens
+    				}
     				else if (idValues[p] != 0) {
     					tmpSentence.add(p); // insert all tokens
+    				}
+    			}
     			positions = null; // free memory
-...
     				String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
     				String[] words = null;
     				if (formCorrPropertyName != null && formCorrPropertyName.length() > 0) {
     					words = getDefaultValues(mainCorpus, formCorrPropertyName, sentencePositions);
     				if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) {
     					words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions);
+    				}
     				String[] lemmas = null;
     				if (lemmaCorrPropertyName != null && lemmaCorrPropertyName.length() > 0) {
     					lemmas = getDefaultValues(mainCorpus, lemmaCorrPropertyName, sentencePositions);
     				if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) {
     					lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions);
+    				}
     				String[] upos = null;
     				if (uposCorrPropertyName != null && uposCorrPropertyName.length() > 0) {
     					upos = getDefaultValues(mainCorpus, uposCorrPropertyName, sentencePositions);
     				if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) {
     					upos = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
+    				}
     				String[] xpos = null;
     				if (xposCorrPropertyName != null && xposCorrPropertyName.length() > 0) {
     					xpos = getDefaultValues(mainCorpus, xposCorrPropertyName, sentencePositions);
     				if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
     					xpos = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
+    				}
     				// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
-...
     				// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
     				// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
     				HashMap<Integer, Integer> sentIds = new HashMap<>();
     				// fix ud properties using CQP values
     				for (int ip = 0; ip < sentence.size(); ip++) {
     					int p = sentence.get(ip);
     					if (idValues[p] != 0) { // store "old id -> new id"
     						sentIds.put(idValues[p], (ip + 1)); // from 1 to N
+    					}
     					// new word
     					if (miscValues[p].equals("_")) {
     						miscValues[p] = "XmlId=" + ids[ip];
-...
     					if (xpos != null && xposValues[p].equals("_")) {
     						xposValues[p] = xpos[ip];
+    					}
     					if (insertNoSpaceAfter) { // inserttino activated
+    				}
     				if (insertNoSpaceAfter) {
     					for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed
     						int p = sentence.get(ip);
     						// insertion activated
     						if (!miscValues[p].contains("SpaceAfter=")) { // only update if not present
     							if (afterSpacesRules.contains(formValues[p])) {
     								miscValues[p] += " SpaceAfter=Yes";
     							if (LangFormater.isSpaceAfterNotNeeded(formValues[p], lang)) {
     								miscValues[p] += "|SpaceAfter=No";
+    							}
     							else if (formValues.length < p + 1 && beforeSpacesRules.contains(formValues[p + 1])) {
     								miscValues[p] += " SpaceAfter=Yes";
     							else if (formValues.length > (p + 1) && LangFormater.isSpaceBeforeNotNeeded(formValues[p + 1], lang)) {
     								// if the next token takes no space before it, mark the current token with SpaceAfter=No
     								miscValues[p] += "|SpaceAfter=No";
+    							}
+    						}
+    					}
+    				}
     				// fixing sentence punct limits
     				while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
     					System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
     					int p2 = sentence.remove(0);
     					sentences.get(iSentence - 1).add(p2);
+    				}
     				//
     				while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
     					System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
     					int p2 = sentence.remove(sentence.size() - 1);
     					sentences.get(iSentence + 1).add(0, p2);
+    				}
     				// fixing sentence punct limits
     //					while (sentence.size() > 0 && iSentence > 0 && xpos[0].matches(punctStrongRegex)) {
     //						System.out.println("FIXING: first punctStrong position " + xposValues[sentence.get(0)] + " in " + iSentence);
     //						int p2 = sentence.remove(0);
     //						sentences.get(iSentence - 1).add(p2);
     //					}
     					while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
     						System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
     						int p2 = sentence.remove(0);
     						sentences.get(iSentence - 1).add(p2);
+    					}
     //
     					while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
     						System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
     						int p2 = sentence.remove(sentence.size() - 1);
     						sentences.get(iSentence + 1).add(0, p2);
+    					}
     				if (sentence.size() == 0) { // sentence was depleted after fixing it
     					sentences.remove(iSentence);
     					iSentence--;
     					continue;
+    				}
+    			}
     			for (int s = 0; s < sentences.size(); s++) {
     				// fix only ud sentences limits
     				ArrayList<Integer> sentence = sentences.get(s);
     				HashMap<Integer, Integer> oldToNewIds = new HashMap<>();
     				for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids
     					int p = sentence.get(ip);
     					if (idValues[p] != 0) { // store "old id -> new id"
     						oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N
+    					}
+    				}
     				// fixing head and set missing head to 0 and root
     				for (int ip = 0; ip < sentence.size(); ip++) {
     					int p = sentence.get(ip);
-...
     					idValues[p] = (ip + 1);  // from 1 to N
     					// fixing head values
     					if (sentIds.containsKey(headValues[p])) {
     						headValues[p] = sentIds.get(headValues[p]);
     					if (oldToNewIds.containsKey(headValues[p])) {
     						headValues[p] = oldToNewIds.get(headValues[p]);
+    					}
     					else { // new word, set to default values
     						headValues[p] = -1;
     					else if (headValues[p] != 0) { // new word, set to default values
     						headValues[p] = 0;
     						deprelValues[p] = "_";
     						depsValues[p] = "_";
+    					}
-...
     					gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
+    				}
     				String[] tokens = new String[sentence.size()];
     				for (int ip = 0; ip < sentence.size(); ip++) {
     					tokens[ip] = formValues[sentence.get(ip)];
-...
     					writer.println("# newdoc id = " + textIds[iText]);
+    				}
     				if (sentidStartPositions.containsKey(sentence.get(0))) {
     					writer.println("# sent_id = " + sentidStartPositions.get(sentence.get(0)));
     				boolean foundSentId = false;
     				for (int ip : sentence) {
     					if (!foundSentId && sentidStartPositions.containsKey(ip)) {
     						writer.println("# sent_id = " + sentidStartPositions.get(ip));
     						foundSentId = true;
+    					}
+    				}
     				else {
     				if (!foundSentId) { // no sent_id found
     					writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
+    				}
-...
     					iParagraph++;
+    				}
     				for (int ip = 0 ; ip < sentence.size() ; ip++) {
     				for (int ip = 0; ip < sentence.size(); ip++) {
     					int p = sentence.get(ip);
     					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
     					writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
     							+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
-...
     			numberOfTextsWritten++;
+    		}
     		System.out.println("N words written: " + numberOfWordsWritten);
     		System.out.println("N sentences written: " + numberOfSentencesWritten);
     		System.out.println("N texts written: " + numberOfTextsWritten);
     		System.out.println("# words written: " + numberOfWordsWritten);
     		System.out.println("# sentences written: " + numberOfSentencesWritten);
     		System.out.println("# texts written: " + numberOfTextsWritten);
     		return numberOfWordsWritten;
+    	}
-...
+    		}
+    	}
+    }

     // Copyright © 2010-2020 ENS de Lyon, University of Franche-Comté
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     //
     //
     // $LastChangedDate:$
     // $LastChangedRevision:$
     // $LastChangedBy:$
     //
     package org.txm.tigersearch.commands;
     import java.io.BufferedReader;
     import java.io.File;
     import java.io.FileFilter;
     import java.io.IOException;
     import java.util.HashMap;
     import javax.xml.stream.XMLStreamException;
     import org.apache.commons.lang.StringUtils;
     import org.eclipse.core.commands.AbstractHandler;
     import org.eclipse.core.commands.ExecutionEvent;
     import org.eclipse.core.commands.ExecutionException;
     import org.eclipse.jface.viewers.IStructuredSelection;
     import org.eclipse.ui.handlers.HandlerUtil;
     import org.kohsuke.args4j.Option;
     import org.txm.core.messages.TXMCoreMessages;
     import org.txm.objects.Text;
     import org.txm.rcp.commands.workspace.UpdateCorpus;
     import org.txm.rcp.swt.widget.parameters.ParametersDialog;
     import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
     import org.txm.searchengine.cqp.corpus.CQPCorpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     import org.txm.searchengine.cqp.serverException.CqiServerError;
     import org.txm.utils.io.FileCopy;
     import org.txm.utils.io.IOUtils;
     import org.txm.utils.logger.Log;
     import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
     import cern.colt.Arrays;
     /**
      * Import CONNLU annotations into a TXM corpus
+     *
      * IF the corpus already contains CONNLU annotations, they are replaced
+     *
      * @author mdecorde.
      */
     public class ImportCONNLUAnnotationsFromDirectory extends AbstractHandler {
     	public static final String ID = ImportCONNLUAnnotationsFromDirectory.class.getName();
     	@Option(name = "connluDirectory", usage = "connluDirectory", widget = "Folder", required = true, def = "connlu-directory")
     	File connluDirectory;
     	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
     	String propertiesPrefix;
     	/*
     	 * (non-Javadoc)
     	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
     	 */
     	@Override
     	public Object execute(final ExecutionEvent event) throws ExecutionException {
     		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
     		Object s = selection.getFirstElement();
     		if (!(s instanceof MainCorpus)) {
     			Log.warning("Selection is not a corpus. Aborting.");
     			return null;
+    		}
     		if (!ParametersDialog.open(this)) {
     			return null;
+    		}
     		if (connluDirectory == null || !connluDirectory.exists() || !connluDirectory.isDirectory() || connluDirectory.listFiles().length == 0) {
     			Log.warning("Error: connlu directory is empty: " + connluDirectory);
     			return null;
+    		}
     		CQPCorpus corpus = (CQPCorpus) s;
     		MainCorpus mainCorpus = corpus.getMainCorpus();
     		try {
     			return importAnnotations(mainCorpus, connluDirectory, propertiesPrefix);
+    		}
     		catch (Exception e) {
     			Log.warning(e);
     			e.printStackTrace();
+    		}
     		return null;
+    	}
     	/**
+    	 *
     	 * if import CONNLU annotations in the corpus with the same name already exists, it is replaced
+    	 *
     	 * @param corpus
     	 * @param connluDirectory
     	 * @param propertiesPrefix
     	 * @return the number of imported annotations
     	 * @throws CqiClientException
     	 * @throws CqiServerError
     	 * @throws IOException
     	 * @throws XMLStreamException
     	 */
     	public static int importAnnotations(MainCorpus mainCorpus, File connluDirectory, String propertiesPrefix) throws IOException,
     			CqiServerError, CqiClientException, XMLStreamException {
     		Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluDirectory, mainCorpus, propertiesPrefix));
     		File[] files = connluDirectory.listFiles(new FileFilter() {
     			@Override
     			public boolean accept(File file) {
     				return file.isFile() && file.getName().endsWith(".conllu");
+    			}
     		});
     		int nTextProcessed = 0;
     		int nWordsInserted = 0;
     		for (File coonluFile : files) {
     			nWordsInserted += _importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null);
     			nTextProcessed++;
+    		}
     		if (nTextProcessed == 0) {
     			Log.warning("** No text to process. Aborting.");
     			return 0;
+    		}
     		if (nWordsInserted == 0) {
     			Log.warning("** No annotation imported. Aborting.");
     			return 0;
+    		}
     		Log.info("XML-TXM source files updated. Updating indexes...");
     		UpdateCorpus.update(mainCorpus);
     		Log.info("Done.");
     		return 0;
+    	}
     	/**
+    	 *
     	 * if import CONNLU annotations in the corpus with the same name already exists, it is replaced
+    	 *
     	 * @param corpus
     	 * @param connluFile
     	 * @param propertiesPrefix
     	 * @return the number of imported annotations
     	 * @throws CqiClientException
     	 * @throws CqiServerError
     	 * @throws IOException
     	 * @throws XMLStreamException
     	 */
     	public static int importAnnotationsFromCoNNLUFile(MainCorpus mainCorpus, File connluFile, String propertiesPrefix, String textId) throws IOException,
     			CqiServerError, CqiClientException, XMLStreamException {
     		Log.info(TXMCoreMessages.bind("Importing CONNL-u annotations of {0} in {1} using the ''{2}'' prefix...", connluFile, mainCorpus, propertiesPrefix));
     		int nWordsInserted = _importAnnotations(connluFile, mainCorpus, propertiesPrefix, textId);
     		if (nWordsInserted == 0) {
     			Log.warning("** No annotation imported. Aborting.");
     			return 0;
+    		}
     		Log.info("XML-TXM source files updated. Updating indexes...");
     		UpdateCorpus.update(mainCorpus);
     		Log.info("Done.");
     		return 0;
+    	}
     	private static int _importAnnotations(File coonluFile, MainCorpus mainCorpus, String propertiesPrefix, String textId) throws IOException, XMLStreamException {
     		if (textId == null || textId.length() == 0) { // no text id provided, using the connlu file name
     			textId = coonluFile.getName().substring(0, coonluFile.getName().length() - 7);
+    		}
     		Log.info("** processing text: " + textId);
     		Text text = mainCorpus.getProject().getText(textId);
     		if (text == null) {
     			Log.warning("No text found with ID=" + textId);
     			return 0;
+    		}
     		File xmltxmFile = mainCorpus.getProject().getText(textId).getXMLTXMFile();
     		File xmltxmUpdatedFile = new File(System.getProperty("java.io.tmpdir"), xmltxmFile.getName());
     		XMLTXMWordPropertiesInjection processor = new XMLTXMWordPropertiesInjection(xmltxmFile);
     		HashMap<String, HashMap<String, String>> rules = new HashMap<>();
     		processor.setProperties(rules);
     		BufferedReader reader = IOUtils.getReader(coonluFile);
     		String line = reader.readLine();
     		String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
     		int nWords2 = 0;
     		int nLine = 0;
     		String sent_id = null;
     		String newpar_id = null;
     		String newdoc_id = null;
     		while (line != null) {
     			nLine++;
     			if (line.length() == 0) {
     				line = reader.readLine();
     				continue; // comment
+    			}
     			if (line.startsWith("#")) {
     				if (line.startsWith("# sent_id = ")) {
     					sent_id = line.substring(12).trim();
+    				}
     				else if (line.startsWith("# newdoc id = ")) {
     					newdoc_id = line.substring(14).trim();
+    				}
     				else if (line.startsWith("# newpar id = ")) {
     					newpar_id = line.substring(14).trim();
+    				}
     				else {
     					// nothing for now
+    				}
     				line = reader.readLine();
     				continue; // comment
+    			}
     			String[] split = line.split("\t", 10);
     			if (split.length < 10) {
     				Log.warning("Error: line " + nLine + " : " + line + " -> " + Arrays.toString(split) + " len=" + split.length);
     				line = reader.readLine();
     				continue; // comment
+    			}
     			String misc = split[9];
     			String[] miscValues = misc.split("\\|");
     			String id = null;
     			for (String miscValue : miscValues) {
     				if (miscValue.startsWith("XmlId=")) {
     					id = miscValue.substring(6);
+    				}
+    			}
     			HashMap<String, String> properties = new HashMap<>();
     			for (int i = 0; i < split.length; i++) {
     				properties.put("#" + propertiesPrefix + propNames[i], split[i]); // add the property name using the prefix ; XML-TXM types are prefixed with '#'
+    			}
     			if (sent_id != null) {
     				properties.put("#ud-sentid", sent_id);
     				sent_id = ""; // reset value for next sentence
+    			}
     			else {
     				properties.put("#ud-sentid", "");
+    			}
     			if (newdoc_id != null) {
     				properties.put("#ud-newdocid", newdoc_id);
     				newdoc_id = null; // reset value for next sentence
+    			}
     			else {
     				properties.put("#ud-newdocid", "");
+    			}
     			if (newpar_id != null) {
     				properties.put("#ud-newparid", newpar_id);
     				newpar_id = null; // reset value for next sentence
+    			}
     			else {
     				properties.put("#ud-newparid", "");
+    			}
     			if (id == null) {
     				Log.warning("No 'XmlId=' found for UD line: " + line);
+    			}
     			else {
     				processor.addProperty(id, properties);
     				nWords2++;
+    			}
     			line = reader.readLine();
+    		}
     		reader.close();
     		if (nWords2 == 0) {
     			Log.warning("** No annotation to import in " + coonluFile);
     			return 0;
+    		}
     		Log.info("** loading annotations from : " + coonluFile);
     		if (processor.process(xmltxmUpdatedFile)) {
     			if (xmltxmFile.delete() && FileCopy.copy(xmltxmUpdatedFile, xmltxmFile)) {
     				if (processor.getNonActivatedRules().size() > 0) {
     					Log.warning("Warning: some words were not imported: " + StringUtils.join(processor.getNonActivatedRules(), ", "));
+    				}
+    			}
     			else {
     				Log.warning("** Warning: annotation import failed for replace the corpus XML-TXM file: " + xmltxmFile + ". TEMP file: " + xmltxmUpdatedFile);
     				return 0;
+    			}
+    		}
     		else {
     			Log.warning("** Warning: annotation import failed for text: " + textId);
     			return 0;
+    		}
     		if (processor.getNInsertions() == 0) {
     			Log.warning("** No annotation imported in " + textId);
+    		}
     		return processor.getNInsertions();
+    	}
+    }

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 2941