/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

     // $LastChangedRevision:$
     // $LastChangedBy:$
     //
     package org.txm.tokenizer;
     package org.txm.scripts.filters.Tokeniser;
     import static groovy.transform.TypeCheckingMode.SKIP
     import groovy.transform.CompileStatic
-...
     import javax.xml.stream.*
     import org.eclipse.ui.internal.dialogs.NewContentTypeDialog
     import org.txm.importer.PersonalNamespaceContext
     import org.txm.tokenizer.StringTokenizer
     import org.txm.tokenizer.SimpleStringTokenizer
-...
     import org.txm.xml.XPathHookActivator
     import org.w3c.dom.Node
     @CompileStatic
     public class ChunkTokenizerXml extends XMLProcessor {
     	XPathHookActivator activator;
-...
     	/** The prefix. */
     	String prefix;
     	String filename;
     	int wordcount = 0;
     	Pattern regLN;
     	Pattern regCTRL;
-...
     	 */
     	public ChunkTokenizerXml(File infile, TokenizerClasses tc) {
     		super(infile)
     		this.lang = tc.lang;
     		this.stringTokenizer = new SimpleStringTokenizer(lang);
-...
     		regLN = Pattern.compile("/\n/");
     		regCTRL = Pattern.compile("/\\p{C}/");
     		activator = new XPathHookActivator<>(hook, "//div|p|ab");
     		activator = new XPathHookActivator<>(hook, "//(div|p|ab|note|s|list|head|front|body|back|text)");
     		hook = new DOMIdentityHook("in_text_hook", activator, this) {
     			String id;
     			boolean inAna = false;
     			boolean inForm = false;
     			boolean inW = false;
     			ArrayList<String[]> anaValues = new ArrayList<>();
     			ArrayList<String[]> formValues = new ArrayList<>();
     			StringBuilder value = new StringBuilder();
     			String resp = "";
     			String type = "";
     			/**
     					String id;
     					boolean inAna = false;
     					boolean inForm = false;
     					boolean inW = false;
     					ArrayList<String[]> anaValues = new ArrayList<>();
     					ArrayList<String[]> formValues = new ArrayList<>();
     					StringBuilder value = new StringBuilder();
     					String resp = "";
     					String type = "";
     					/**
     			 * extends this method to process the DOM before it is written
     			 */
     			public void processDom() {
     				ArrayList<Node> textNodes = getTextNodes(dom);
     				if (textNodes.size() == 0) return; // easy
     				StringBuilder buffer = new StringBuilder(); // build a string to tokenize
     				for (Node textNode : textNodes) {
     					buffer.append(" "+textNode.getTextContent());
+    				}
     				int nNode = 0;
     				Node currentTextNode = textNodes.get(0);
     				String currentText = currentTextNode.getTextContent();
     				int curentTextIndex = 0;
     				StringBuilder currentNewText = new StringBuilder()
     				ArrayList<String> currentWords = new ArrayList<String>()
     				List<List<String>> sentences = stringTokenizer.processText(buffer.toString());
     				//println "text="+buffer.toString()
     				println "sentences=$sentences"
     				for (List<String> sent : sentences) {
     					if (nNode >= textNodes.size()) { // all nodes are updated
     						break;
+    					}
     					for (String word : sent) {
     						if (nNode >= textNodes.size()) { // all nodes are updated
     							break;
     					public void processDom() {
     						//println "Processing DOM with $stringTokenizer"
     						ArrayList<Node> textNodes = getTextNodes(dom);
     						if (textNodes.size() == 0) return; // easy
     						StringBuilder buffer = new StringBuilder(); // build a string to tokenize
     						for (Node textNode : textNodes) {
     							buffer.append(" "+textNode.getTextContent());
+    						}
     						int idx = currentText.indexOf(word, curentTextIndex);
     						if (idx >= 0) {
     							curentTextIndex = idx + word.length();
     						} else {
     							println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords"
     							currentTextNode.setTextContent("");
     							for (String w : currentWords) {
     								Node newChild = dom.getOwnerDocument().createElementNS(null, "w");
     								newChild.setAttribute("id", "W_ID")
     								newChild.setTextContent(w);
     						int nNode = 0;
     						Node currentTextNode = textNodes.get(0);
     						String currentText = currentTextNode.getTextContent();
     						int curentTextIndex = 0;
     						StringBuilder currentNewText = new StringBuilder()
     						ArrayList<String> currentWords = new ArrayList<String>()
     						//println "text="+buffer.toString()
     						List<List<List<String>>> sentences = stringTokenizer.processText(buffer.toString());
     						//println "sentences=$sentences"
     						for (List<List<String>> sent : sentences) {
     							//println "sent: $sent"
     							for (int iWord = 0 ; iWord < sent.size() ; iWord++) {
     								currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)
     								List<String> word = sent.get(iWord)
     								int idx = currentText.indexOf(word[0], curentTextIndex);
     								//println "TEST: $word at $curentTextIndex in $currentText"
     								if (idx >= 0) { // the whole word is in the current node text content
     									curentTextIndex = idx + word[0].length();
     									currentWords.add(word)
     									continue // NEXT WORD
+    								}
     								// see if the word is partially in the current text node
     //								String partialWord = ""
     //								for (int c = 0 ; c < word.length() - 1 ; c++) {
     //									idx = currentText.indexOf(word.substring(0, word.length() - c), curentTextIndex);
     //									if (idx >= 0) { // the partial word is in the current node text content
     //										partialWord = word.substring(0, word.length() - c)
     //										break;
     //									}
     //								}
     //								if (partialWord.length() > 0) { // yes, the word is partially in the text node
     //									//println "partialword=$partialWord"
     //									def sword = word.substring(partialWord.length())
     //									//println "sword=$sword"
     //									if (sword.length() > 0) { // should always happen
     //										sent.set(iWord, sword)
     //										//iWord-- // to re-process the word
     //									}
     //									curentTextIndex = idx + partialWord.length()
     //									currentWords.add(partialWord.toString())
     //								}
     								if (currentWords.size() > 0) {
     									writeWords(nNode, currentText, curentTextIndex, currentTextNode, currentWords)
+    								}
     								if (nNode < textNodes.size() - 1) {
     									currentNewText = new StringBuilder()
     									curentTextIndex = 0;
     									nNode++;
     									currentTextNode = textNodes.get(nNode);
     									currentText = currentTextNode.getTextContent();
     									iWord--; //
     								} else {
     									currentWords.add(word)
+    								}
+    							}
     							currentTextNode.getParentNode().removeChild(currentTextNode)
     							currentNewText = new StringBuilder()
     							currentWords.clear();
     							curentTextIndex = 0;
     							nNode++;
     							if (nNode < textNodes.size()) {
     								currentTextNode = textNodes.get(nNode);
     								currentText = currentTextNode.getTextContent();
     							if (currentWords.size() > 0) {
     								writeWords(nNode, currentText, curentTextIndex, currentTextNode, currentWords)
+    							}
     							if (stringTokenizer.doSentences()) {
     								Node newChild = dom.getOwnerDocument().createProcessingInstruction("txm", "</s>")
     								currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)//appendChild(newChild)//
+    							}
+    						}
     						currentWords.add(word)
+    					}
+    				}
+    			}
     			public ArrayList<Node> getTextNodes(Node element) {
     				def children = element.getChildNodes()
     				ArrayList<Node> texts = new ArrayList<Node>()
     				for (int i = 0 ; i < children.getLength() ; i++) {
     					def node = children.item(i);
     					if (node.getNodeType() == Node.TEXT_NODE) {
     						texts.add(node)
     					} else if (node.getNodeType() == Node.ELEMENT_NODE) {
     						if (node.getLocalName().equals("w")) {
     							texts.add(node)
     						} else {
     							texts.addAll(getTextNodes(node));
     					public void writeWords(def nNode, def currentText, def curentTextIndex, def currentTextNode, def currentWords) {
     						//println "WRITING: nNode=$nNode currentText='${currentText.replace("\\n", " ")}' index=$curentTextIndex words=$currentWords"
     						currentTextNode.setTextContent("");
     						for (def w : currentWords) {
     							Node newChild = dom.getOwnerDocument().createElementNS(null, "w");
     							wordcount++
     							newChild.setAttribute("id", "w_"+filename+"_"+wordcount)
     							for (int i = 0 ; i < stringTokenizer.getAdditionalProperties().size() ; i++) {
     								//println "write att: "+stringTokenizer.getAdditionalProperties()[i]+"="+w[i+1]
     								newChild.setAttribute(stringTokenizer.getAdditionalProperties()[i], w[i+1])
+    							}
     							newChild.setTextContent(w[0]);
     							currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)
+    						}
     						currentWords.clear();
+    					}
+    				}
     				return texts;
+    			}
     		};
     					public ArrayList<Node> getTextNodes(Node element) {
     						def children = element.getChildNodes()
     						ArrayList<Node> texts = new ArrayList<Node>()
     						for (int i = 0 ; i < children.getLength() ; i++) {
     							def node = children.item(i);
     							if (node.getNodeType() == Node.TEXT_NODE && node.getTextContent().trim().length() > 0) {
     								texts.add(node)
     							} else if (node.getNodeType() == Node.ELEMENT_NODE) {
     								if (node.getLocalName().equals("w")) {
     									if (retokenize) {
     										//texts.add(node)
+    									}
     								} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(node.getLocalName()).matches()) {
     									//texts.add(node)
     								} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(node.getLocalName()).matches()) { // ignore the tag and its content of the tag
     									//texts.add(node)
     								} else {
     									texts.addAll(getTextNodes(node));
+    								}
+    							}
+    						}
     						return texts;
+    					}
     				};
+    	}
     	/**
-...
+    	}
     	/** The wordcount. */
     	int wordcount = 0;
     	/** The ignorecontent. */
     	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
     	boolean insideword = false;
-...
     		text = regLN.matcher(text).replaceAll(WHITESPACE);
     		text = regCTRL.matcher(text).replaceAll(EMPTY);						// remove ctrl characters
     		println "tokenize text"
     		def sentences = stringTokenizer.processText(text);
     		for (def words : sentences) {
     			for (def word : words) {
     				wordcount++;
     				writer.writeStartElement(word_element_to_create);
     				writeWordAttributes();// id
     				writer.writeCharacters(word);
     				for (int i = 0 ; i < stringTokenizer.getAdditionalProperties() ; i++) {
     					String att = stringTokenizer.getAdditionalProperties().get(i)
     					writer.writeAttribute(att, word[i+1])
+    				}
     				writer.writeCharacters(word[0]);
     				writer.writeEndElement();
     				writer.writeCharacters("\n");
     				println "WRITE WORD: "+word[0]+" in "+writer
+    			}
     			if (stringTokenizer.doSentences())  {
     				writer.writeProcessingInstruction("txm", "</s>")

     				wordcount++;
     				writer.writeStartElement(word_element_to_create);
     				writeWordAttributes();// id
     				writer.writeCharacters(word);
     				for (int i = 0 ; i < stringTokenizer.getAdditionalProperties() ; i++) {
     					String att = stringTokenizer.getAdditionalProperties().get(i)
     					writer.writeAttribute(att, word[i+1])
+    				}
     				writer.writeCharacters(word[0]);
     				writer.writeEndElement();
     				writer.writeCharacters("\n");
+    			}

     package org.txm.scripts.importer.xtz
     import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml
     import org.txm.scripts.filters.Tokeniser.ChunkTokenizerXml
     import java.io.File;
-...
     				cpb.tick()
     				File infile = f;
     				File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName());
     				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
     				if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ?
     				ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(infile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
     				//if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ?
     					String engineName = module.getProject().getImportParameters().node("annotate").get("engine", "TreeTagger")
     					def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine(engineName)
     					def stringTokenizer = engine.getStringTokenizer(lang)
     					if (stringTokenizer != null) {
     						tokenizer.setStringTokenizer(stringTokenizer)
+    					}
+    				}
     				//}
     				tokenizer.setRetokenize(retokenize)
     				if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
     					tokenizer.setOutSideTextTags(outSideTextTagsRegex)
-...
+    				}
     				// tokenize !
     				if (!tokenizer.process()) {
     				if (!tokenizer.process(outfile)) {
     					println("Failed to process "+f)
     					outfile.delete()
+    				}

     import org.txm.utils.logger.Log;
     import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
     import cz.cuni.mff.ufal.udpipe.InputFormat;
     import cz.cuni.mff.ufal.udpipe.Model;
     import cz.cuni.mff.ufal.udpipe.MultiwordToken;
     import cz.cuni.mff.ufal.udpipe.MultiwordTokens;
     import cz.cuni.mff.ufal.udpipe.Sentence;
     import cz.cuni.mff.ufal.udpipe.Sentences;
     import cz.cuni.mff.ufal.udpipe.Version;
-...
     	 */
     	public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
     	/**
     	 * TODO re-think the process. The UDPStringTokenizer do the job
     	 */
     	@Override
     	public boolean processFile(File xmlFile, File binaryCorpusDirectory, HashMap<String, Object> parameters) {
     		if (!isRunning()) return false;
     		String lang = null;
     		Object ps = parameters.get("langs");
     		Object p = parameters.get("lang");
     //		String lang = null;
     //		Object ps = parameters.get("langs");
     //		Object p = parameters.get("lang");
     //
     //		if (p == null && ps == null) {
     //			Log.warning("Warning: can't annotate. No 'lang' (String) or 'langs' (Map<String, String>) parameter specified in " + parameters);
     //			return false;
     //		}
     //
     //		if (ps != null && ps instanceof Map) {
     //			Map<?, ?> map = (Map<?, ?>) ps;
     //			String text_id = xmlFile.getName();
     //			if (map.get(text_id) != null) {
     //				lang = map.get(text_id).toString().toLowerCase();
     //				if (!canAnnotateLang(lang)) {
     //					Log.warning("Warning: can't annotate text_id=${text_id} with $lang, will use the default lang=$p");
     //					return false;
     //				}
     //			}
     //		}
     //
     //		if (lang == null && p == null) {
     //			System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters));
     //			return false;
     //		}
     //		else {
     //			lang = p.toString();
     //		}
     //
     //		if (!canAnnotateLang(lang)) {
     //			return false;
     //		}
     //
     //		File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox
     //		File modelFile = new File(modelsDirectory, lang + ".udpipe");
     //		//System.out.println("model="+modelFile.getAbsolutePath());
     //
     //		try {
     //			long time = System.currentTimeMillis();
     //			// get words
     //			XMLTXMToUDPipeXMLParser wparser = new XMLTXMToUDPipeXMLParser(xmlFile.toURI().toURL());
     //			if (!wparser.process(null)) {
     //				Log.warning("Error while parsing: " + xmlFile);
     //				return false;
     //			}
     //			//System.out.println("words built in: "+(System.currentTimeMillis() - time));
     //			Sentences sentences = wparser.getSentences();
     //
     //			time = System.currentTimeMillis();
     //			// System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences));
     //			// tag
     //			UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences);
     //			//System.out.println("sentences parsed in: "+(System.currentTimeMillis() - time));
     //			// System.out.println("SENTENCES RESULT: " + UDPipeJavaUtils.toString(sentences));
     //
     //			// update the XML-TXM file
     //			time = System.currentTimeMillis();
     //			XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile);
     //			HashMap<String, HashMap<String, String>> rules = new HashMap<>();
     //			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
     //				Sentence sentence = sentences.get(iSentence);
     //				Words words = sentence.getWords();
     //				MultiwordTokens mwt = sentence.getMultiwordTokens();
     //				HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>();
     //				HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>();
     //				for (int i = 0 ; i < mwt.size(); i++) {
     //					MultiwordToken mw = mwt.get(i);
     //					firstWordToToken.put(mw.getIdFirst(), mw);
     //					lastWordToToken.put(mw.getIdLast(), mw);
     //				}
     //
     //				for (int iWord = 0; iWord < words.size(); iWord++) {
     //					Word word = words.get(iWord);
     //					String form = word.getForm();
     //					if (Sentence.getRootForm().equals(form)) continue;
     //
     //					if (firstWordToToken.containsKey(word.getId())) {
     //						MultiwordToken mw = firstWordToToken.get(word.getId());
     //
     //						String misc = mw.getMisc();
     //						int idx = misc.indexOf(XMLIDMISC);
     //						int idx2 = misc.indexOf("|", idx + 6);
     //						if (idx2 < 0) idx2 = misc.length();
     //						String id = misc.substring(idx + 6, idx2);
     //						if (id != null && id.length() > 0) {
     //							HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(sentence, mw, "#ud-");
     //							rules.put(id, properties);
     //						}
     //					} else if (lastWordToToken.containsKey(word.getId())) {
     //						// already written
     //					} else {
     //
     //						String misc = word.getMisc();
     //						int idx = misc.indexOf(XMLIDMISC);
     //						int idx2 = misc.indexOf("|", idx + 6);
     //						if (idx2 < 0) idx2 = misc.length();
     //						String id = misc.substring(idx + 6, idx2);
     //						if (id != null && id.length() > 0) {
     //							HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-");
     //							rules.put(id, properties);
     //						}
     //					}
     //				}
     //			}
     //			File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp");
     //			injector.setProperties(rules);
     //			if (injector.process(outFile) && outFile.exists()) {
     //				xmlFile.delete();
     //				outFile.renameTo(xmlFile);
     //			}
     //			else {
     //				Log.warning("Error while processing: " + xmlFile);
     //			}
     //			System.out.println("ud properties injected in: "+(System.currentTimeMillis() - time));
     //		}
     //		catch (IOException | XMLStreamException e) {
     //			// TODO Auto-generated catch block
     //			e.printStackTrace();
     //		}
     		if (p == null && ps == null) {
     			Log.warning("Warning: can't annotate. No 'lang' (String) or 'langs' (Map<String, String>) parameter specified in " + parameters);
     			return false;
+    		}
     		if (ps != null && ps instanceof Map) {
     			Map<?, ?> map = (Map<?, ?>) ps;
     			String text_id = xmlFile.getName();
     			if (map.get(text_id) != null) {
     				lang = map.get(text_id).toString().toLowerCase();
     				if (!canAnnotateLang(lang)) {
     					Log.warning("Warning: can't annotate text_id=${text_id} with $lang, will use the default lang=$p");
     					return false;
+    				}
+    			}
+    		}
     		if (lang == null && p == null) {
     			System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters));
     			return false;
+    		}
     		else {
     			lang = p.toString();
+    		}
     		if (!canAnnotateLang(lang)) {
     			return false;
+    		}
     		File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox
     		File modelFile = new File(modelsDirectory, lang + ".udpipe");
     		//System.out.println("model="+modelFile.getAbsolutePath());
     		try {
     			// get words
     			XMLTXMToUDPipeXMLParser wparser = new XMLTXMToUDPipeXMLParser(xmlFile.toURI().toURL());
     			if (!wparser.process(null)) {
     				Log.warning("Error while parsing: " + xmlFile);
     				return false;
+    			}
     			Sentences sentences = wparser.getSentences();
     			// System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences));
     			// tag
     			UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences);
     			// System.out.println("SENTENCES RESULT: " + UDPipeJavaUtils.toString(sentences));
     			// update XML-TXM files
     			XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile);
     			HashMap<String, HashMap<String, String>> rules = new HashMap<>();
     			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
     				Sentence sentence = sentences.get(iSentence);
     				Words words = sentence.getWords();
     				for (int iWord = 0; iWord < words.size(); iWord++) {
     					Word word = words.get(iWord);
     					String form = word.getForm();
     					if ("<root>".equals(form)) continue;
     					String misc = word.getMisc();
     					int idx = misc.indexOf(XMLIDMISC);
     					int idx2 = misc.indexOf("|", idx + 6);
     					if (idx2 < 0) idx2 = misc.length();
     					String id = misc.substring(idx + 6, idx2);
     					if (id != null && id.length() > 0) {
     						HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-");
     						rules.put(id, properties);
+    					}
+    				}
+    			}
     			File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp");
     			injector.setProperties(rules);
     			if (injector.process(outFile) && outFile.exists()) {
     				xmlFile.delete();
     				outFile.renameTo(xmlFile);
+    			}
     			else {
     				Log.warning("Error while processing: " + xmlFile);
+    			}
+    		}
     		catch (IOException | XMLStreamException e) {
     			// TODO Auto-generated catch block
     			e.printStackTrace();
+    		}
     		// update xml-txm files
     		return true;
+    	}

TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/XMLTXMToUDPipeXMLParser.java (revision 3283)
173	173	flagform = false;
174	174	form = form.trim();
175	175	form = form.replace("\n", "").replace("<", "<");
176		Word word = new Word();
177		word.setForm(form);
	176
	177	sentence.addWord(form);
	178	Word word = sentence.getWords().get((int) (sentence.getWords().size() - 1));
178	179	word.setMisc(UDPipeEngine.XMLIDMISC + wordId);
179		sentence.getWords().add(word);
180	180	}
181	181	break;
182	182
...	...
195	195	}
196	196	}
197	197
198		if (sentence != null && sentence.getWords().size() > 0) {
	198	if (sentence != null && sentence.getWords().size() > 1) { // a sent contains at least <root>
199	199	sentences.add(sentence);
200	200	}
201	201

     import cz.cuni.mff.ufal.udpipe.InputFormat;
     import cz.cuni.mff.ufal.udpipe.Model;
     import cz.cuni.mff.ufal.udpipe.MultiwordToken;
     import cz.cuni.mff.ufal.udpipe.MultiwordTokens;
     import cz.cuni.mff.ufal.udpipe.OutputFormat;
     import cz.cuni.mff.ufal.udpipe.Sentence;
     import cz.cuni.mff.ufal.udpipe.Sentences;
-...
     			model.parse(sent, "");
     			model.tag(sent, "");
     //			System.out.println(toString(sent));
     			//			System.out.println(toString(sent));
+    		}
     		return sentences;
-...
     	public static String toString(Sentence sent) {
     		StringBuilder buffer = new StringBuilder();
     		Words words = sent.getWords();
     		for (int i = 0; i < words.size(); i++) {
     		MultiwordTokens mwt = sent.getMultiwordTokens();
     		HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>();
     		HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>();
     		for (int i = 0 ; i < mwt.size(); i++) {
     			MultiwordToken mw = mwt.get(i);
     			firstWordToToken.put(mw.getIdFirst(), mw);
     			lastWordToToken.put(mw.getIdLast(), mw);
+    		}
     		for (int i = 0 ; i < words.size(); i++) {
     			//			MultiwordToken mw = mwt.get(i);
     			//			Word w = words.get(mw.getIdFirst());
     			Word w = words.get(i);
     			if (i > 0) buffer.append(" ");
     			buffer.append(w.getForm());
     			buffer.append("/" + w.getLemma());
     			buffer.append("/" + w.getUpostag());
     			buffer.append("/" + w.getXpostag());
     			buffer.append("/" + w.getFeats());
     			buffer.append("/" + w.getDeps());
     			buffer.append("/" + w.getDeprel());
     			buffer.append("/" + w.getHead());
     			buffer.append("/" + w.getMisc());
     			if (firstWordToToken.containsKey(w.getId())) {
     				MultiwordToken mw = firstWordToToken.get(w.getId());
     				Word w2 = words.get(mw.getIdLast());
     				buffer.append(""+w.getId()+"+"+w2.getId());
     				buffer.append("/" + w.getForm()+"+"+w2.getForm());
     				buffer.append("/" + w.getLemma()+"+"+w2.getLemma());
     				buffer.append("/" + w.getUpostag()+"+"+w2.getUpostag());
     				buffer.append("/" + w.getXpostag()+"+"+w2.getXpostag());
     				buffer.append("/" + w.getFeats()+"+"+w2.getFeats());
     				buffer.append("/" + w.getDeps()+"+"+w2.getDeps());
     				buffer.append("/" + w.getDeprel()+"+"+w2.getDeprel());
     				buffer.append("/" + w.getHead()+"+"+w2.getHead());
     				buffer.append("/" + w.getMisc()+"+"+w2.getMisc());
     			} else if (lastWordToToken.containsKey(w.getId())) {
     				// already written
     			} else {
     				buffer.append(w.getId());
     				buffer.append("/" + w.getForm());
     				buffer.append("/" + w.getLemma());
     				buffer.append("/" + w.getUpostag());
     				buffer.append("/" + w.getXpostag());
     				buffer.append("/" + w.getFeats());
     				buffer.append("/" + w.getDeps());
     				buffer.append("/" + w.getDeprel());
     				buffer.append("/" + w.getHead());
     				buffer.append("/" + w.getMisc());
+    			}
+    		}
     		return buffer.toString();
+    	}
-...
+    	}
     	public static void main(String[] args) {
     //		try {
     //			toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"),
     //					"Et un petit test... En deux phrases ? ou trois.");
     		//		try {
     		//			toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"),
     		//					"Et un petit test... En deux phrases ? ou trois.");
     		//		}
     		//		catch (UnsupportedEncodingException | FileNotFoundException e) {
     		//			// TODO Auto-generated catch block
     		//			e.printStackTrace();
     		//		}
     		UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe"));
     //		System.out.println("VERSION: "+Version.current().toString());
     //		// /usr/lib/UDPipe/models/fr.udpipe
     //		// /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe
     //		for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe",
     //				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe",
     //				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe",
     //		"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) {
     //			System.out.println("M="+p);
     //			System.out.println(toString(process(p, "Une plaine, des champs cultivés que traverse une grande route.")));
     //		}
     //		catch (UnsupportedEncodingException | FileNotFoundException e) {
     //			// TODO Auto-generated catch block
     //			e.printStackTrace();
     //		}
     		UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe"));
     		System.out.println("VERSION: "+Version.current().toString());
     		// /usr/lib/UDPipe/models/fr.udpipe
     		// /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe
     		for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe",
     				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe",
     				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe",
     				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) {
     			System.out.println("M="+p);
     		process(p, "Et un petit test... En deux phrases ? ou trois.");
+    		}
     		Sentence sentence = new Sentence();
     		sentence.addWord("je");
     		sentence.addWord("suis");
     		sentence.addWord(".");
     		System.out.println("SENT="+toString(sentence));
+    	}
     	public static void processSentences(String modelPath, Sentences sentences) {
     		processSentences(Model.load(modelPath), sentences);
     		long time = System.currentTimeMillis();
     		Model m = Model.load(modelPath);
     		//System.out.println("Model load in: "+(System.currentTimeMillis()-time));
     		time = System.currentTimeMillis();
     		processSentences(m, sentences);
     		//System.out.println("sent processed in: "+(System.currentTimeMillis()-time));
+    	}
     	public static void processSentences(Model model, Sentences sentences) {
-...
     		for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
     			Sentence sent = sentences.get(iSentence);
     			model.parse(sent, "");
     			model.tag(sent, "");
     			model.parse(sent, "");
     			//System.out.println(toString(sent));
+    		}
+    	}
-...
     		return properties;
+    	}
     	/**
     	 * fill a map with values of "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc"
+    	 *
     	 * @param word
     	 * @param prefix
     	 * @return
     	 */
     	public static HashMap<String, String> wordToHashMap(Sentence sentence, MultiwordToken mword, String prefix) {
     		if (prefix == null) prefix = "";
     		Word word = sentence.getWords().get(mword.getIdFirst());
     		Word word2 = sentence.getWords().get(mword.getIdFirst());
     		HashMap<String, String> properties = new HashMap<>();
     		properties.put(prefix + "id", Integer.toString(word.getId()) + "+" + Integer.toString(word2.getId()));
     		properties.put(prefix + "form", word.getForm() + "+" + word2.getForm());
     		properties.put(prefix + "lemma", word.getLemma() + "+" + word2.getLemma());
     		properties.put(prefix + "upos", word.getUpostag() + "+" + word2.getUpostag());
     		properties.put(prefix + "xpos", word.getXpostag() + "+" + word2.getXpostag());
     		properties.put(prefix + "feats", word.getFeats() + "+" + word2.getFeats());
     		properties.put(prefix + "head", Integer.toString(word.getHead()) + "+" + Integer.toString(word2.getId()));
     		properties.put(prefix + "deprel", word.getDeprel() + "+" + word2.getDeprel());
     		properties.put(prefix + "deps", word.getDeps() + "+" + word2.getDeps());
     		properties.put(prefix + "misc", word.getMisc() + "+" + word2.getMisc());
     		return properties;
+    	}
+    }

     import java.io.File;
     import java.util.ArrayList;
     import java.util.Arrays;
     import java.util.HashMap;
     import java.util.List;
     import org.txm.tokenizer.StringTokenizer;
     import cz.cuni.mff.ufal.udpipe.InputFormat;
     import cz.cuni.mff.ufal.udpipe.Model;
     import cz.cuni.mff.ufal.udpipe.MultiwordToken;
     import cz.cuni.mff.ufal.udpipe.MultiwordTokens;
     import cz.cuni.mff.ufal.udpipe.Sentence;
     import cz.cuni.mff.ufal.udpipe.Word;
     import cz.cuni.mff.ufal.udpipe.Words;
-...
     		tokenizer = model.newTokenizer("");
+    	}
     	public final static List<String> ADDITIONAL_PROPERTIES = Arrays.asList("ud-id", "ud-form", "ud-lemma", "ud-upos", "ud-xpos", "ud-feats", "ud-head", "ud-deprel", "ud-deps", "ud-misc");
     	public List<String> getAdditionalProperties() {
     		return ADDITIONAL_PROPERTIES;
+    	}
     	@Override
     	public List<List<String>> processText(String text) {
     		ArrayList<List<String>> result = new ArrayList<>();
     	public ArrayList<ArrayList<ArrayList<String>>> processText(String text) {
     		ArrayList<ArrayList<ArrayList<String>>> result = new ArrayList<>();
     		tokenizer.setText(text);
     		Sentence sent = new Sentence();
     		while (tokenizer.nextSentence(sent)) {
     			List<String> sresult = new ArrayList<>();
     			model.parse(sent, "");
     			model.tag(sent, "");
     			ArrayList<ArrayList<String>> sresult = new ArrayList<>();
     			Words words = sent.getWords();
     			MultiwordTokens mwt = sent.getMultiwordTokens();
     			HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>();
     			HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>();
     			for (int i = 0 ; i < mwt.size(); i++) {
     				MultiwordToken mw = mwt.get(i);
     				firstWordToToken.put(mw.getIdFirst(), mw);
     				lastWordToToken.put(mw.getIdLast(), mw);
+    			}
     			// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
     			for (int iWord = 0; iWord < words.size(); iWord++) {
     				sresult.add(words.get(iWord).getForm());
     				Word word = words.get(iWord);
     				ArrayList<String> properties = new ArrayList<String>();
     				if (firstWordToToken.containsKey(word.getId())) {
     					MultiwordToken mw = firstWordToToken.get(word.getId());
     					Word word2 = words.get(mw.getIdLast());
     					properties.add(mw.getForm());
     					properties.add(""+word.getId());
     					properties.add(word.getForm() + "+" + word2.getForm());
     					properties.add(word.getLemma() + "+" + word2.getLemma());
     					properties.add(word.getUpostag() + "+" + word2.getUpostag());
     					properties.add(word.getXpostag() + "+" + word2.getXpostag());
     					properties.add(word.getFeats() + "+" + word2.getFeats());
     					properties.add(Integer.toString(word.getHead()) + "+" + word2.getHead());
     					properties.add(word.getDeprel() + "+" + word2.getDeprel());
     					properties.add(word.getDeps() + "+" + word2.getDeps());
     					properties.add(mw.getMisc() + "+" + word2.getMisc());
     					sresult.add(properties);
     				} else if (lastWordToToken.containsKey(word.getId())) {
     					// already written
     				} else {
     					properties.add(word.getForm());
     					properties.add(""+word.getId());
     					properties.add(word.getForm());
     					properties.add(word.getLemma());
     					properties.add(word.getUpostag());
     					properties.add(word.getXpostag());
     					properties.add(word.getFeats());
     					properties.add(Integer.toString(word.getHead()));
     					properties.add(word.getDeprel());
     					properties.add(word.getDeps());
     					properties.add(word.getMisc());
     					sresult.add(properties);
+    				}
+    			}
     			if (sresult.size() > 0) {
     				sresult.remove(0); // remove the <root> element
-...
     		return true;
+    	}
+    }

TXM/branches/eltec/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3283)
270	270	println "Result : "+tokenizer.processText(text).collect{"<"+it+">"}
271	271	}
272	272	}
	273
	274	@Override
	275	public List<String> getAdditionalProperties() {
	276	return Arrays.asList();
	277	}
273	278	}

     package org.txm.tokenizer;
     import java.util.ArrayList;
     import java.util.List;
     /**
      * Interface declaring the text-processing operations {@code processText},
      * {@code doSentences} and {@code getAdditionalProperties}.
      *
      * NOTE(review): two {@code processText(String)} declarations with the same parameter
      * list but different return types appear below. They cannot coexist in one Java
      * interface; this looks like a before/after pair -- confirm which one is current.
      */
     public interface StringTokenizer {
     	/** Processes {@code text} and returns a list of string lists. */
     	List<List<String>> processText(String text);
     	/** Processes {@code text} and returns a three-level nested list of strings. */
     	ArrayList<ArrayList<ArrayList<String>>> processText(String text);
     	/** NOTE(review): presumably tells whether the tokenizer splits text into sentences -- confirm against implementations. */
     	boolean doSentences();
     	/** Returns a list of additional property names; their meaning is implementation-defined. */
     	List<String> getAdditionalProperties();
+    }

     import org.eclipse.jface.viewers.ISelectionChangedListener;
     import org.eclipse.jface.viewers.IStructuredContentProvider;
     import org.eclipse.jface.viewers.SelectionChangedEvent;
     import org.eclipse.jface.viewers.StructuredSelection;
     import org.eclipse.jface.viewers.TableViewer;
     import org.eclipse.jface.viewers.TableViewerColumn;
     import org.eclipse.jface.viewers.Viewer;
-...
     import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
     import org.txm.searchengine.cqp.corpus.CQPCorpus;
     import org.txm.searchengine.cqp.corpus.Property;
     import org.txm.searchengine.cqp.corpus.StructuralUnit;
     import org.txm.searchengine.cqp.corpus.StructuralUnitProperty;
     import org.txm.searchengine.cqp.corpus.WordProperty;
     import org.txm.searchengine.cqp.corpus.query.Match;
-...
     		structComboLabel.setText(InternalViewUIMessages.structure);
     		structComboLabel.setLayoutData(new GridData(GridData.CENTER, GridData.CENTER, false, true));
     		this.structuralUnitsComboViewer = new StructuralUnitsComboViewer(parametersArea, this, true);
     		// Listener
     		this.structuralUnitsComboViewer.addSelectionChangedListener(new ISelectionChangedListener() {
-...
+    			}
     		});
     		// Word properties selector
     		propertiesSelector = new PropertiesSelector<>(parametersArea);
     		propertiesSelector.setLayoutData(new GridData(GridData.CENTER, GridData.CENTER, false, true));

     import java.io.Reader;
     import java.io.StringReader;
     import java.util.Arrays;
     import javax.xml.parsers.ParserConfigurationException;
     import javax.xml.stream.XMLInputFactory;
-...
     					elements++;
     					break;
     				case XMLStreamConstants.CHARACTERS:
     					//char[] tmp = Arrays.copyOfRange(parser.getTextCharacters(), parser.getTextStart(), parser.getTextStart()+parser.getTextLength());
     					//System.out.println("T='"+Arrays.toString(tmp)+"'");
     					Text textNode = doc.createTextNode(parser.getText());
     					currentElement.appendChild(textNode);
     					break;

     			parentParser.writer.writeCharacters("\n");
     			Element e = (Element) node;
     			NodeList children = e.getChildNodes();
     			String ns = e.getNamespaceURI();
     			String ln = e.getLocalName();
     			if (children.getLength() > 0) {
     				String ns = e.getNamespaceURI();
     				String ln = e.getLocalName();
     				if (ns == null) {
     					parentParser.writer.writeStartElement(ln);
     				} else {
-...
+    				}
+    			}
     			else {
     				parentParser.writer.writeEmptyElement(e.getNamespaceURI(), e.getLocalName());
     				if (ns == null) {
     					parentParser.writer.writeEmptyElement(ln);
     				} else {
     					parentParser.writer.writeEmptyElement(ns, ln);
+    				}
+    			}
     			for (int i = 0; i < e.getAttributes().getLength(); i++) {
-...
+    			}
     			if (children.getLength() > 0) {
     				parentParser.writer.writeEndElement();
     				parentParser.writer.writeCharacters("\n");
     				//parentParser.writer.writeCharacters("\n");
+    			}
+    		}
     		else if (node.getNodeType() == Node.TEXT_NODE) {
     			parentParser.writer.writeCharacters(node.getTextContent());
     		} else if (node.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
     			parentParser.writer.writeProcessingInstruction(node.getNodeName(), node.getNodeValue());
     		} else if (node.getNodeType() == Node.CDATA_SECTION_NODE) {
     			parentParser.writer.writeCData(node.getNodeValue());
     		} else if (node.getNodeType() == Node.COMMENT_NODE) {
     			parentParser.writer.writeComment(node.getNodeValue());
+    		}
+    	}

     import java.io.File;
     import java.io.PrintWriter;
     import java.util.ArrayList;
     import java.util.Arrays;
     import java.util.Collection;
     import java.util.HashMap;
     import java.util.LinkedHashMap;
-...
     	public boolean loadParameters() throws Exception {
     		try {
     			String str = this.getStringParameterValue(TXMPreferences.STRUCTURAL_UNIT);
     			this.pStructuralUnit = this.getCorpus().getStructuralUnit(str);
     			if (str.length() == 0) {
     				try {
     					StructuralUnit struct = this.getCorpus().getStructuralUnit("text");
     					if (struct != null) {
     						this.pStructuralUnit = struct;
+    					}
     					struct = this.getCorpus().getStructuralUnit("div");
     					if (struct != null) {
     						this.pStructuralUnit = struct;
+    					}
     					struct = this.getCorpus().getStructuralUnit("p");
     					if (struct != null) {
     						this.pStructuralUnit = struct;
+    					}
+    				}
     				catch (CqiClientException e1) {
     					// TODO Auto-generated catch block
     					e1.printStackTrace();
+    				}
     			} else {
     				this.pStructuralUnit = this.getCorpus().getStructuralUnit(str);
+    			}
+    		}
     		catch (Exception e) {
     			Log.printStackTrace(e);
-...
+    		}
     		try {
     			String str = this.getStringParameterValue(TXMPreferences.STRUCTURAL_UNIT_PROPERTIES);
     			this.pStructuralUnitsProperties = StructuralUnitProperty.stringToProperties(this.getCorpus(), str);
     			if (str.length() == 0) {
     				StructuralUnitProperty structP = pStructuralUnit.getProperty("id");
     				if (structP != null) {
     					this.pStructuralUnitsProperties = Arrays.asList(structP);
+    				}
     				structP = pStructuralUnit.getProperty("n");
     				if (structP != null) {
     					this.pStructuralUnitsProperties = Arrays.asList(structP);
+    				}
     			} else {
     				this.pStructuralUnitsProperties = StructuralUnitProperty.stringToProperties(this.getCorpus(), str);
+    			}
+    		}
     		catch (Exception e3) {
     			Log.printStackTrace(e3);
-...
     		this.setDirty();
+    	}
     	@Override
     	public boolean setParameters(TXMParameters parameters) {
     		System.err.println("InternalView.setParameters(): not yet implemented.");

     		super.initializeDefaultPreferences();
     		Preferences preferences = this.getDefaultPreferencesNode();
     		preferences.put(STRUCTURAL_UNIT, TBXPreferences.DEFAULT_STRUCTURAL_UNIT);
     		preferences.put(STRUCTURAL_UNIT, "");
     		preferences.put(UNIT_PROPERTIES, TBXPreferences.DEFAULT_UNIT_PROPERTY);
     		preferences.put(STRUCTURAL_UNIT_PROPERTIES, TBXPreferences.DEFAULT_STRUCTURAL_UNIT_PROPERTY);
     		preferences.put(STRUCTURAL_UNIT_PROPERTIES, "");
     		preferences.putInt(CURRENT_PAGE, 0);
+    	}
+    }

Laboratoire ICAR » Plateforme TXM

Révision 3283