/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3539

     import org.txm.searchengine.cqp.CqpDataProxy;
     import org.txm.searchengine.cqp.corpus.*
     import org.txm.searchengine.cqp.corpus.query.CQLQuery
     import org.txm.utils.ConsoleProgressBar
     public class ToCoNLL2009 {
     	boolean debug = true
     	/**
     	 * To CoNLL2009.
-...
     	 * @param encoding the encoding of the outfile
     	 * @return true, if successful
     	 */
     	public boolean process(File outfile, CQPCorpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding) {
     	public boolean process(File outdir, CQPCorpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding) {
     		def matches = corpus.query(new CQLQuery("<s> []+ </s>"), "TMPEXPORTCONLL", false).getMatches()
     		if (debug) println matches.size()
     		int npositions = 0;
     		for (def match : matches) npositions += match.size() +1
     		if (debug) println "npositions= $npositions"
     		int[] positions = new int[npositions+1]
     		int i = 0;
     		for (def match : matches) {
     			for (int p : match.getRange())
     				positions[i++] = p
+    		}
     		List<String> words = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(word.getQualifiedName(), positions);
     		List<String> lemmes = null;
     		if (lemme != null) {
     			lemmes = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(lemme.getQualifiedName(), positions);
+    		}
     		List<String> poss = null;
     		if (pos != null) {
     			poss =  CorpusManager.getCorpusManager().getCqiClient().cpos2Str(pos.getQualifiedName(), positions);
+    		}
     		Writer writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); //$NON-NLS-1$;
     		String lemmestr;
     		String posstr;
     		int id = 0;
     		int iposition = 0;
     		for (def match : matches) {
     			id = 0 // first position in sentence
     			for (i = match.start ; i < match.end ; i++) { // loop n times, n = mathc length
     				if (lemme != null)
     					lemmestr = lemmes.get(iposition);
     				else
     					lemmestr = "_";
     				if (pos != null)
     					posstr = poss.get(iposition);
     				else
     					posstr = "_";
     				//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
     				writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
     				id++; // next word
     				iposition++ // nextposition
     		String sstruct = sentenceUnit.getStructuralUnit().getName()
     		def texts = corpus.getCorpusTextIdsList();
     		def idProperty = corpus.getProperty("id")
     		ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size());
     		for (def text : texts) {
     			cpb.tick()
     			def matches = corpus.query(new CQLQuery("[_.text_id=\"$text\"] expand to $sstruct"), "TMPEXPORTCONLL", false).getMatches()
     			if (debug) println matches.size()
     			int npositions = 0;
     			for (def match : matches) npositions += match.size() +1
     			if (debug) println "npositions= $npositions"
     			int[] positions = new int[npositions+1]
     			int i = 0;
     			for (def match : matches) {
     				for (int p : match.getRange())
     					positions[i++] = p
+    			}
     			List<String> words = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(word.getQualifiedName(), positions);
     			List<String> idsList = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(idProperty.getQualifiedName(), positions);
     			List<String> lemmes = null;
     			if (lemme != null) {
     				lemmes = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(lemme.getQualifiedName(), positions);
+    			}
     			List<String> poss = null;
     			if (pos != null) {
     				poss =  CorpusManager.getCorpusManager().getCqiClient().cpos2Str(pos.getQualifiedName(), positions);
+    			}
     			File outfile = new File(outdir, text+".conllu")
     			Writer writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); //$NON-NLS-1$;
     			String lemmestr;
     			String posstr;
     			int id = 0;
     			int iposition = 0;
     			for (def match : matches) {
     				writer.write("\n");
     				writer.write("# newdoc id=$text\n");
     				writer.write("# sent_id=$match\n");
     				id = 0 // first position in sentence
     				for (i = match.start ; i < match.end ; i++) { // loop n times, n = mathc length
     					if (lemme != null)
     						lemmestr = lemmes.get(iposition);
     					else
     						lemmestr = "_";
     					if (pos != null)
     						posstr = poss.get(iposition);
     					else
     						posstr = "_";
     					//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
     					writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\tXmlID="+idsList.get(iposition)+"\n");
     					id++; // next word
     					iposition++ // nextposition
+    				}
+    			}
     			writer.flush();
     			writer.close();
+    		}
     		writer.flush();
     		writer.close();
     		cpb.done()
     		return true;
+    	}
     	/**
     	 * Ad-hoc entry point that instantiates a ToCoNLL2009 processor and calls
     	 * process() with hard-coded settings.
     	 *
     	 * As written, the corpus and every property are null: the lookups that would
     	 * resolve them from a corpus are commented out, as are the matching null checks.
     	 *
     	 * @param args command-line arguments (not read by this method)
     	 */
     	public static void main(String[] args) {
     		// No corpus is loaded here; the property lookups below are disabled accordingly.
     		CQPCorpus corpus = null
     		// Hard-coded, developer-specific output path.
     		File outfile = new File("/home/mdecorde/Bureau/conn_export.tsv")
     		String encoding = "UTF-8"
     		// "<structure>_<property>" form, split on the first underscore below.
     		String sentenceProperty = "s_id"
     		String posProperty = "frpos"
     		String lemmaProperty = "frlemma"
     		// split is only referenced by the commented-out lookup on the next line.
     		def split = sentenceProperty.split("_", 2)
     		StructuralUnitProperty s = null;//corpus.getStructuralUnit(split[0]).getProperty(split[1])
     		Property word = null;//corpus.getProperty("word")
     		Property lemma = null;//corpus.getProperty(lemmaProperty)
     		Property pos = null;//corpus.getProperty(posProperty)
     		// Disabled sanity checks for the properties above.
     		//		if (s == null) { println "Error sentence property: $sentenceProperty"; return}
     		//		if (word == null) { println "Error no word property"; return}
     		//		if (pos == null) { println "Error pos property: $posProperty"; return}
     		//		if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}
     		ToCoNLL2009 processor = new ToCoNLL2009()
     		// NOTE(review): a single file is passed as the first argument; confirm this
     		// matches what process() currently expects for that parameter.
     		processor.process(outfile, corpus, s, word, lemma, pos, encoding)
+    	}

     import org.txm.searchengine.cqp.corpus.CQPCorpus;
     import org.txm.searchengine.cqp.corpus.MainCorpus;
     import org.txm.searchengine.cqp.serverException.CqiServerError;
     import org.txm.utils.ConsoleProgressBar;
     import org.txm.utils.logger.Log;
     /**
-...
     	public static int importAnnotations(MainCorpus mainCorpus, File conlluDirectory, String propertiesPrefix, Boolean normalizeWordIds,
     			Set<String> headPropertiesToProject, Set<String> depsPropertiesToProject, Set<String> udPropertiesToImport) throws IOException, CqiServerError, CqiClientException, XMLStreamException {
     		Log.info(TXMCoreMessages.bind("Importing CONLL-u annotations of {0} in {1} using the ''{2}'' prefix...", conlluDirectory, mainCorpus, propertiesPrefix));
     		Log.info(TXMCoreMessages.bind("Importing CONLL-U annotations of {0} in {1} using the ''{2}'' prefix...", conlluDirectory, mainCorpus, propertiesPrefix));
     		File[] files = conlluDirectory.listFiles(new FileFilter() {
     			@Override
-...
     		int nTextProcessed = 0;
     		int nWordsInserted = 0;
     		ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
     		for (File coonluFile : files) {
     			cpb.tick();
     			nWordsInserted += ImportCoNLLUAnnotations._importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null, normalizeWordIds, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
     			nTextProcessed++;
+    		}
     		cpb.done();
     		if (nTextProcessed == 0) {
     			Log.warning("** No text to process. Aborting.");

     			if (line.startsWith("#")) {
     				if (line.startsWith("# sent_id = ")) {
     					// write previous sentence word properties
     					if (sentenceProperties.size() > 0) {
     						storeSentenceProperties(sentenceProperties, processor, propertiesPrefix, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
+    					}
     					sent_id = line.substring(12).trim();
+    				}
     				else if (line.startsWith("# newdoc id = ")) {
     					newdoc_id = line.substring(14).trim();
-...
     				continue; // comment
+    			}
     			if (split[0].equals("1")) { // a new sentence begins, write the previous if any
     				// write previous sentence word properties
     				if (sentenceProperties.size() > 0) {
     					storeSentenceProperties(sentenceProperties, processor, propertiesPrefix, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
+    				}
+    			}
     			String misc = split[9];
     			String[] miscValues = misc.split("\\|");
     			String wId = null;

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3539