Révision 3283
| TXM/branches/eltec/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/ChunkTokenizerXml.groovy (revision 3283) | ||
|---|---|---|
| 41 | 41 |
// $LastChangedRevision:$ |
| 42 | 42 |
// $LastChangedBy:$ |
| 43 | 43 |
// |
| 44 |
package org.txm.tokenizer;
|
|
| 44 |
package org.txm.scripts.filters.Tokeniser;
|
|
| 45 | 45 |
|
| 46 | 46 |
import static groovy.transform.TypeCheckingMode.SKIP |
| 47 | 47 |
import groovy.transform.CompileStatic |
| ... | ... | |
| 52 | 52 |
|
| 53 | 53 |
import javax.xml.stream.* |
| 54 | 54 |
|
| 55 |
import org.eclipse.ui.internal.dialogs.NewContentTypeDialog |
|
| 55 | 56 |
import org.txm.importer.PersonalNamespaceContext |
| 56 | 57 |
import org.txm.tokenizer.StringTokenizer |
| 57 | 58 |
import org.txm.tokenizer.SimpleStringTokenizer |
| ... | ... | |
| 63 | 64 |
import org.txm.xml.XPathHookActivator |
| 64 | 65 |
import org.w3c.dom.Node |
| 65 | 66 |
|
| 66 |
@CompileStatic |
|
| 67 | 67 |
public class ChunkTokenizerXml extends XMLProcessor {
|
| 68 | 68 |
|
| 69 | 69 |
XPathHookActivator activator; |
| ... | ... | |
| 119 | 119 |
/** The prefix. */ |
| 120 | 120 |
String prefix; |
| 121 | 121 |
String filename; |
| 122 |
int wordcount = 0; |
|
| 122 | 123 |
|
| 123 | 124 |
Pattern regLN; |
| 124 | 125 |
Pattern regCTRL; |
| ... | ... | |
| 139 | 140 |
*/ |
| 140 | 141 |
public ChunkTokenizerXml(File infile, TokenizerClasses tc) {
|
| 141 | 142 |
super(infile) |
| 143 |
|
|
| 142 | 144 |
this.lang = tc.lang; |
| 143 | 145 |
this.stringTokenizer = new SimpleStringTokenizer(lang); |
| 144 | 146 |
|
| ... | ... | |
| 158 | 160 |
regLN = Pattern.compile("/\n/");
|
| 159 | 161 |
regCTRL = Pattern.compile("/\\p{C}/");
|
| 160 | 162 |
|
| 161 |
activator = new XPathHookActivator<>(hook, "//div|p|ab");
|
|
| 163 |
activator = new XPathHookActivator<>(hook, "//(div|p|ab|note|s|list|head|front|body|back|text)");
|
|
| 162 | 164 |
|
| 163 | 165 |
hook = new DOMIdentityHook("in_text_hook", activator, this) {
|
| 164 |
|
|
| 165 |
String id; |
|
| 166 |
|
|
| 167 |
boolean inAna = false; |
|
| 168 |
|
|
| 169 |
boolean inForm = false; |
|
| 170 |
|
|
| 171 |
boolean inW = false; |
|
| 172 |
|
|
| 173 |
ArrayList<String[]> anaValues = new ArrayList<>(); |
|
| 174 |
|
|
| 175 |
ArrayList<String[]> formValues = new ArrayList<>(); |
|
| 176 |
|
|
| 177 |
StringBuilder value = new StringBuilder(); |
|
| 178 |
|
|
| 179 |
String resp = ""; |
|
| 180 |
|
|
| 181 |
String type = ""; |
|
| 182 |
|
|
| 183 |
/** |
|
| 166 |
|
|
| 167 |
String id;
|
|
| 168 |
|
|
| 169 |
boolean inAna = false;
|
|
| 170 |
|
|
| 171 |
boolean inForm = false;
|
|
| 172 |
|
|
| 173 |
boolean inW = false;
|
|
| 174 |
|
|
| 175 |
ArrayList<String[]> anaValues = new ArrayList<>();
|
|
| 176 |
|
|
| 177 |
ArrayList<String[]> formValues = new ArrayList<>();
|
|
| 178 |
|
|
| 179 |
StringBuilder value = new StringBuilder();
|
|
| 180 |
|
|
| 181 |
String resp = "";
|
|
| 182 |
|
|
| 183 |
String type = "";
|
|
| 184 |
|
|
| 185 |
/**
|
|
| 184 | 186 |
* Extend this method to process the DOM before it is written |
| 185 | 187 |
*/ |
| 186 |
public void processDom() {
|
|
| 187 |
|
|
| 188 |
ArrayList<Node> textNodes = getTextNodes(dom); |
|
| 189 |
if (textNodes.size() == 0) return; // easy |
|
| 190 |
|
|
| 191 |
StringBuilder buffer = new StringBuilder(); // build a string to tokenize |
|
| 192 |
for (Node textNode : textNodes) {
|
|
| 193 |
buffer.append(" "+textNode.getTextContent());
|
|
| 194 |
} |
|
| 195 |
|
|
| 196 |
int nNode = 0; |
|
| 197 |
Node currentTextNode = textNodes.get(0); |
|
| 198 |
String currentText = currentTextNode.getTextContent(); |
|
| 199 |
int curentTextIndex = 0; |
|
| 200 |
StringBuilder currentNewText = new StringBuilder() |
|
| 201 |
ArrayList<String> currentWords = new ArrayList<String>() |
|
| 202 |
List<List<String>> sentences = stringTokenizer.processText(buffer.toString()); |
|
| 203 |
//println "text="+buffer.toString() |
|
| 204 |
println "sentences=$sentences" |
|
| 205 |
for (List<String> sent : sentences) {
|
|
| 206 |
if (nNode >= textNodes.size()) { // all nodes are updated
|
|
| 207 |
break; |
|
| 208 |
} |
|
| 209 |
|
|
| 210 |
for (String word : sent) {
|
|
| 211 |
if (nNode >= textNodes.size()) { // all nodes are updated
|
|
| 212 |
break; |
|
| 188 |
public void processDom() {
|
|
| 189 |
|
|
| 190 |
//println "Processing DOM with $stringTokenizer" |
|
| 191 |
ArrayList<Node> textNodes = getTextNodes(dom); |
|
| 192 |
if (textNodes.size() == 0) return; // easy |
|
| 193 |
|
|
| 194 |
StringBuilder buffer = new StringBuilder(); // build a string to tokenize |
|
| 195 |
for (Node textNode : textNodes) {
|
|
| 196 |
buffer.append(" "+textNode.getTextContent());
|
|
| 213 | 197 |
} |
| 214 | 198 |
|
| 215 |
int idx = currentText.indexOf(word, curentTextIndex); |
|
| 216 |
if (idx >= 0) {
|
|
| 217 |
curentTextIndex = idx + word.length(); |
|
| 218 |
} else {
|
|
| 219 |
println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords" |
|
| 220 |
currentTextNode.setTextContent("");
|
|
| 221 |
for (String w : currentWords) {
|
|
| 222 |
Node newChild = dom.getOwnerDocument().createElementNS(null, "w"); |
|
| 223 |
newChild.setAttribute("id", "W_ID")
|
|
| 224 |
newChild.setTextContent(w); |
|
| 199 |
int nNode = 0; |
|
| 200 |
Node currentTextNode = textNodes.get(0); |
|
| 201 |
String currentText = currentTextNode.getTextContent(); |
|
| 202 |
int curentTextIndex = 0; |
|
| 203 |
StringBuilder currentNewText = new StringBuilder() |
|
| 204 |
ArrayList<String> currentWords = new ArrayList<String>() |
|
| 205 |
//println "text="+buffer.toString() |
|
| 206 |
List<List<List<String>>> sentences = stringTokenizer.processText(buffer.toString()); |
|
| 207 |
//println "sentences=$sentences" |
|
| 208 |
for (List<List<String>> sent : sentences) {
|
|
| 209 |
//println "sent: $sent" |
|
| 210 |
for (int iWord = 0 ; iWord < sent.size() ; iWord++) {
|
|
| 225 | 211 |
|
| 226 |
currentTextNode.getParentNode().insertBefore(newChild, currentTextNode) |
|
| 212 |
List<String> word = sent.get(iWord) |
|
| 213 |
int idx = currentText.indexOf(word[0], curentTextIndex); |
|
| 214 |
//println "TEST: $word at $curentTextIndex in $currentText" |
|
| 215 |
if (idx >= 0) { // the whole word is in the current node text content
|
|
| 216 |
curentTextIndex = idx + word[0].length(); |
|
| 217 |
currentWords.add(word) |
|
| 218 |
continue // NEXT WORD |
|
| 219 |
} |
|
| 220 |
|
|
| 221 |
// see if the word is partially in the current text node |
|
| 222 |
// String partialWord = "" |
|
| 223 |
// for (int c = 0 ; c < word.length() - 1 ; c++) {
|
|
| 224 |
// idx = currentText.indexOf(word.substring(0, word.length() - c), curentTextIndex); |
|
| 225 |
// if (idx >= 0) { // the partial word is in the current node text content
|
|
| 226 |
// partialWord = word.substring(0, word.length() - c) |
|
| 227 |
// break; |
|
| 228 |
// } |
|
| 229 |
// } |
|
| 230 |
// if (partialWord.length() > 0) { // yes, the word is partially in the text node
|
|
| 231 |
// //println "partialword=$partialWord" |
|
| 232 |
// def sword = word.substring(partialWord.length()) |
|
| 233 |
// //println "sword=$sword" |
|
| 234 |
// if (sword.length() > 0) { // should always happen
|
|
| 235 |
// sent.set(iWord, sword) |
|
| 236 |
// //iWord-- // to re-process the word |
|
| 237 |
// } |
|
| 238 |
// curentTextIndex = idx + partialWord.length() |
|
| 239 |
// currentWords.add(partialWord.toString()) |
|
| 240 |
// } |
|
| 241 |
|
|
| 242 |
if (currentWords.size() > 0) {
|
|
| 243 |
writeWords(nNode, currentText, curentTextIndex, currentTextNode, currentWords) |
|
| 244 |
} |
|
| 245 |
|
|
| 246 |
if (nNode < textNodes.size() - 1) {
|
|
| 247 |
currentNewText = new StringBuilder() |
|
| 248 |
curentTextIndex = 0; |
|
| 249 |
nNode++; |
|
| 250 |
|
|
| 251 |
currentTextNode = textNodes.get(nNode); |
|
| 252 |
currentText = currentTextNode.getTextContent(); |
|
| 253 |
|
|
| 254 |
iWord--; // |
|
| 255 |
} else {
|
|
| 256 |
currentWords.add(word) |
|
| 257 |
} |
|
| 227 | 258 |
} |
| 228 |
currentTextNode.getParentNode().removeChild(currentTextNode) |
|
| 229 | 259 |
|
| 230 |
currentNewText = new StringBuilder() |
|
| 231 |
currentWords.clear(); |
|
| 232 |
curentTextIndex = 0; |
|
| 233 |
nNode++; |
|
| 234 |
if (nNode < textNodes.size()) {
|
|
| 235 |
currentTextNode = textNodes.get(nNode); |
|
| 236 |
currentText = currentTextNode.getTextContent(); |
|
| 260 |
if (currentWords.size() > 0) {
|
|
| 261 |
writeWords(nNode, currentText, curentTextIndex, currentTextNode, currentWords) |
|
| 237 | 262 |
} |
| 238 | 263 |
|
| 264 |
if (stringTokenizer.doSentences()) {
|
|
| 265 |
Node newChild = dom.getOwnerDocument().createProcessingInstruction("txm", "</s>")
|
|
| 266 |
currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)//appendChild(newChild)// |
|
| 267 |
} |
|
| 239 | 268 |
} |
| 240 |
|
|
| 241 |
currentWords.add(word) |
|
| 242 | 269 |
} |
| 243 |
} |
|
| 244 |
}
|
|
| 245 |
|
|
| 246 |
public ArrayList<Node> getTextNodes(Node element) {
|
|
| 247 |
def children = element.getChildNodes()
|
|
| 248 |
ArrayList<Node> texts = new ArrayList<Node>()
|
|
| 249 |
for (int i = 0 ; i < children.getLength() ; i++) {
|
|
| 250 |
def node = children.item(i);
|
|
| 251 |
if (node.getNodeType() == Node.TEXT_NODE) {
|
|
| 252 |
texts.add(node)
|
|
| 253 |
} else if (node.getNodeType() == Node.ELEMENT_NODE) {
|
|
| 254 |
if (node.getLocalName().equals("w")) {
|
|
| 255 |
texts.add(node)
|
|
| 256 |
} else {
|
|
| 257 |
texts.addAll(getTextNodes(node));
|
|
| 270 |
|
|
| 271 |
public void writeWords(def nNode, def currentText, def curentTextIndex, def currentTextNode, def currentWords) {
|
|
| 272 |
//println "WRITING: nNode=$nNode currentText='${currentText.replace("\\n", " ")}' index=$curentTextIndex words=$currentWords"
|
|
| 273 |
currentTextNode.setTextContent("");
|
|
| 274 |
for (def w : currentWords) {
|
|
| 275 |
Node newChild = dom.getOwnerDocument().createElementNS(null, "w");
|
|
| 276 |
wordcount++
|
|
| 277 |
newChild.setAttribute("id", "w_"+filename+"_"+wordcount)
|
|
| 278 |
for (int i = 0 ; i < stringTokenizer.getAdditionalProperties().size() ; i++) {
|
|
| 279 |
//println "write att: "+stringTokenizer.getAdditionalProperties()[i]+"="+w[i+1]
|
|
| 280 |
newChild.setAttribute(stringTokenizer.getAdditionalProperties()[i], w[i+1])
|
|
| 281 |
}
|
|
| 282 |
newChild.setTextContent(w[0]);
|
|
| 283 |
|
|
| 284 |
currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)
|
|
| 258 | 285 |
} |
| 286 |
currentWords.clear(); |
|
| 259 | 287 |
} |
| 260 |
} |
|
| 261 |
return texts; |
|
| 262 |
} |
|
| 263 |
}; |
|
| 288 |
|
|
| 289 |
public ArrayList<Node> getTextNodes(Node element) {
|
|
| 290 |
def children = element.getChildNodes() |
|
| 291 |
ArrayList<Node> texts = new ArrayList<Node>() |
|
| 292 |
for (int i = 0 ; i < children.getLength() ; i++) {
|
|
| 293 |
def node = children.item(i); |
|
| 294 |
if (node.getNodeType() == Node.TEXT_NODE && node.getTextContent().trim().length() > 0) {
|
|
| 295 |
texts.add(node) |
|
| 296 |
} else if (node.getNodeType() == Node.ELEMENT_NODE) {
|
|
| 297 |
if (node.getLocalName().equals("w")) {
|
|
| 298 |
if (retokenize) {
|
|
| 299 |
//texts.add(node) |
|
| 300 |
} |
|
| 301 |
} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(node.getLocalName()).matches()) {
|
|
| 302 |
//texts.add(node) |
|
| 303 |
} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(node.getLocalName()).matches()) { // ignore the tag and its content of the tag
|
|
| 304 |
//texts.add(node) |
|
| 305 |
} else {
|
|
| 306 |
texts.addAll(getTextNodes(node)); |
|
| 307 |
} |
|
| 308 |
} |
|
| 309 |
} |
|
| 310 |
return texts; |
|
| 311 |
} |
|
| 312 |
}; |
|
| 264 | 313 |
} |
| 265 | 314 |
|
| 266 | 315 |
/** |
| ... | ... | |
| 411 | 460 |
} |
| 412 | 461 |
|
| 413 | 462 |
|
| 414 |
|
|
| 415 |
/** The wordcount. */ |
|
| 416 |
int wordcount = 0; |
|
| 417 |
|
|
| 418 | 463 |
/** The ignorecontent. */ |
| 419 | 464 |
boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag |
| 420 | 465 |
boolean insideword = false; |
| ... | ... | |
| 628 | 673 |
text = regLN.matcher(text).replaceAll(WHITESPACE); |
| 629 | 674 |
text = regCTRL.matcher(text).replaceAll(EMPTY); // remove ctrl characters |
| 630 | 675 |
|
| 676 |
println "tokenize text" |
|
| 631 | 677 |
def sentences = stringTokenizer.processText(text); |
| 632 | 678 |
for (def words : sentences) {
|
| 633 | 679 |
for (def word : words) {
|
| 634 | 680 |
wordcount++; |
| 635 | 681 |
writer.writeStartElement(word_element_to_create); |
| 636 | 682 |
writeWordAttributes();// id |
| 637 |
writer.writeCharacters(word); |
|
| 683 |
for (int i = 0 ; i < stringTokenizer.getAdditionalProperties() ; i++) {
|
|
| 684 |
String att = stringTokenizer.getAdditionalProperties().get(i) |
|
| 685 |
writer.writeAttribute(att, word[i+1]) |
|
| 686 |
} |
|
| 687 |
writer.writeCharacters(word[0]); |
|
| 638 | 688 |
writer.writeEndElement(); |
| 639 | 689 |
writer.writeCharacters("\n");
|
| 690 |
println "WRITE WORD: "+word[0]+" in "+writer |
|
| 640 | 691 |
} |
| 641 | 692 |
if (stringTokenizer.doSentences()) {
|
| 642 | 693 |
writer.writeProcessingInstruction("txm", "</s>")
|
| TXM/branches/eltec/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3283) | ||
|---|---|---|
| 518 | 518 |
wordcount++; |
| 519 | 519 |
writer.writeStartElement(word_element_to_create); |
| 520 | 520 |
writeWordAttributes();// id |
| 521 |
writer.writeCharacters(word); |
|
| 521 |
for (int i = 0 ; i < stringTokenizer.getAdditionalProperties() ; i++) {
|
|
| 522 |
String att = stringTokenizer.getAdditionalProperties().get(i) |
|
| 523 |
writer.writeAttribute(att, word[i+1]) |
|
| 524 |
} |
|
| 525 |
writer.writeCharacters(word[0]); |
|
| 522 | 526 |
writer.writeEndElement(); |
| 523 | 527 |
writer.writeCharacters("\n");
|
| 524 | 528 |
} |
| TXM/branches/eltec/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3283) | ||
|---|---|---|
| 1 | 1 |
package org.txm.scripts.importer.xtz |
| 2 | 2 |
|
| 3 |
import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml
|
|
| 3 |
import org.txm.scripts.filters.Tokeniser.ChunkTokenizerXml
|
|
| 4 | 4 |
|
| 5 | 5 |
import java.io.File; |
| 6 | 6 |
|
| ... | ... | |
| 390 | 390 |
cpb.tick() |
| 391 | 391 |
File infile = f; |
| 392 | 392 |
File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName()); |
| 393 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
|
|
| 394 |
if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ?
|
|
| 393 |
ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(infile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
|
|
| 394 |
//if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ?
|
|
| 395 | 395 |
String engineName = module.getProject().getImportParameters().node("annotate").get("engine", "TreeTagger")
|
| 396 | 396 |
def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine(engineName) |
| 397 | 397 |
def stringTokenizer = engine.getStringTokenizer(lang) |
| 398 | 398 |
if (stringTokenizer != null) {
|
| 399 | 399 |
tokenizer.setStringTokenizer(stringTokenizer) |
| 400 | 400 |
} |
| 401 |
} |
|
| 401 |
//}
|
|
| 402 | 402 |
tokenizer.setRetokenize(retokenize) |
| 403 | 403 |
if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) {
|
| 404 | 404 |
tokenizer.setOutSideTextTags(outSideTextTagsRegex) |
| ... | ... | |
| 413 | 413 |
} |
| 414 | 414 |
|
| 415 | 415 |
// tokenize ! |
| 416 |
if (!tokenizer.process()) {
|
|
| 416 |
if (!tokenizer.process(outfile)) {
|
|
| 417 | 417 |
println("Failed to process "+f)
|
| 418 | 418 |
outfile.delete() |
| 419 | 419 |
} |
| TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeEngine.java (revision 3283) | ||
|---|---|---|
| 20 | 20 |
import org.txm.utils.logger.Log; |
| 21 | 21 |
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection; |
| 22 | 22 |
|
| 23 |
import cz.cuni.mff.ufal.udpipe.InputFormat;
|
|
| 24 |
import cz.cuni.mff.ufal.udpipe.Model;
|
|
| 23 |
import cz.cuni.mff.ufal.udpipe.MultiwordToken;
|
|
| 24 |
import cz.cuni.mff.ufal.udpipe.MultiwordTokens;
|
|
| 25 | 25 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
| 26 | 26 |
import cz.cuni.mff.ufal.udpipe.Sentences; |
| 27 | 27 |
import cz.cuni.mff.ufal.udpipe.Version; |
| ... | ... | |
| 118 | 118 |
*/ |
| 119 | 119 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
| 120 | 120 |
|
| 121 |
|
|
| 121 |
/** |
|
| 122 |
* TODO re-think the process. The UDPStringTokenizer does the job |
|
| 123 |
*/ |
|
| 122 | 124 |
@Override |
| 123 | 125 |
public boolean processFile(File xmlFile, File binaryCorpusDirectory, HashMap<String, Object> parameters) {
|
| 124 | 126 |
if (!isRunning()) return false; |
| 125 | 127 |
|
| 126 |
String lang = null; |
|
| 127 |
Object ps = parameters.get("langs");
|
|
| 128 |
Object p = parameters.get("lang");
|
|
| 128 |
// String lang = null; |
|
| 129 |
// Object ps = parameters.get("langs");
|
|
| 130 |
// Object p = parameters.get("lang");
|
|
| 131 |
// |
|
| 132 |
// if (p == null && ps == null) {
|
|
| 133 |
// Log.warning("Warning: can't annotate. No 'lang' (String) or 'langs' (Map<String, String>) parameter specified in " + parameters);
|
|
| 134 |
// return false; |
|
| 135 |
// } |
|
| 136 |
// |
|
| 137 |
// if (ps != null && ps instanceof Map) {
|
|
| 138 |
// Map<?, ?> map = (Map<?, ?>) ps; |
|
| 139 |
// String text_id = xmlFile.getName(); |
|
| 140 |
// if (map.get(text_id) != null) {
|
|
| 141 |
// lang = map.get(text_id).toString().toLowerCase(); |
|
| 142 |
// if (!canAnnotateLang(lang)) {
|
|
| 143 |
// Log.warning("Warning: can't annotate text_id=${text_id} with $lang, will use the default lang=$p");
|
|
| 144 |
// return false; |
|
| 145 |
// } |
|
| 146 |
// } |
|
| 147 |
// } |
|
| 148 |
// |
|
| 149 |
// if (lang == null && p == null) {
|
|
| 150 |
// System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters));
|
|
| 151 |
// return false; |
|
| 152 |
// } |
|
| 153 |
// else {
|
|
| 154 |
// lang = p.toString(); |
|
| 155 |
// } |
|
| 156 |
// |
|
| 157 |
// if (!canAnnotateLang(lang)) {
|
|
| 158 |
// return false; |
|
| 159 |
// } |
|
| 160 |
// |
|
| 161 |
// File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
|
| 162 |
// File modelFile = new File(modelsDirectory, lang + ".udpipe"); |
|
| 163 |
// //System.out.println("model="+modelFile.getAbsolutePath());
|
|
| 164 |
// |
|
| 165 |
// try {
|
|
| 166 |
// long time = System.currentTimeMillis(); |
|
| 167 |
// // get words |
|
| 168 |
// XMLTXMToUDPipeXMLParser wparser = new XMLTXMToUDPipeXMLParser(xmlFile.toURI().toURL()); |
|
| 169 |
// if (!wparser.process(null)) {
|
|
| 170 |
// Log.warning("Error while parsing: " + xmlFile);
|
|
| 171 |
// return false; |
|
| 172 |
// } |
|
| 173 |
// //System.out.println("words built in: "+(System.currentTimeMillis() - time));
|
|
| 174 |
// Sentences sentences = wparser.getSentences(); |
|
| 175 |
// |
|
| 176 |
// time = System.currentTimeMillis(); |
|
| 177 |
// // System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences));
|
|
| 178 |
// // tag |
|
| 179 |
// UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences); |
|
| 180 |
// //System.out.println("sentences parsed in: "+(System.currentTimeMillis() - time));
|
|
| 181 |
// // System.out.println("SENTENCES RESULT: " + UDPipeJavaUtils.toString(sentences));
|
|
| 182 |
// |
|
| 183 |
// // update the XML-TXM file |
|
| 184 |
// time = System.currentTimeMillis(); |
|
| 185 |
// XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile); |
|
| 186 |
// HashMap<String, HashMap<String, String>> rules = new HashMap<>(); |
|
| 187 |
// for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
|
| 188 |
// Sentence sentence = sentences.get(iSentence); |
|
| 189 |
// Words words = sentence.getWords(); |
|
| 190 |
// MultiwordTokens mwt = sentence.getMultiwordTokens(); |
|
| 191 |
// HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
| 192 |
// HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
| 193 |
// for (int i = 0 ; i < mwt.size(); i++) {
|
|
| 194 |
// MultiwordToken mw = mwt.get(i); |
|
| 195 |
// firstWordToToken.put(mw.getIdFirst(), mw); |
|
| 196 |
// lastWordToToken.put(mw.getIdLast(), mw); |
|
| 197 |
// } |
|
| 198 |
// |
|
| 199 |
// for (int iWord = 0; iWord < words.size(); iWord++) {
|
|
| 200 |
// Word word = words.get(iWord); |
|
| 201 |
// String form = word.getForm(); |
|
| 202 |
// if (Sentence.getRootForm().equals(form)) continue; |
|
| 203 |
// |
|
| 204 |
// if (firstWordToToken.containsKey(word.getId())) {
|
|
| 205 |
// MultiwordToken mw = firstWordToToken.get(word.getId()); |
|
| 206 |
// |
|
| 207 |
// String misc = mw.getMisc(); |
|
| 208 |
// int idx = misc.indexOf(XMLIDMISC); |
|
| 209 |
// int idx2 = misc.indexOf("|", idx + 6);
|
|
| 210 |
// if (idx2 < 0) idx2 = misc.length(); |
|
| 211 |
// String id = misc.substring(idx + 6, idx2); |
|
| 212 |
// if (id != null && id.length() > 0) {
|
|
| 213 |
// HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(sentence, mw, "#ud-"); |
|
| 214 |
// rules.put(id, properties); |
|
| 215 |
// } |
|
| 216 |
// } else if (lastWordToToken.containsKey(word.getId())) {
|
|
| 217 |
// // already written |
|
| 218 |
// } else {
|
|
| 219 |
// |
|
| 220 |
// String misc = word.getMisc(); |
|
| 221 |
// int idx = misc.indexOf(XMLIDMISC); |
|
| 222 |
// int idx2 = misc.indexOf("|", idx + 6);
|
|
| 223 |
// if (idx2 < 0) idx2 = misc.length(); |
|
| 224 |
// String id = misc.substring(idx + 6, idx2); |
|
| 225 |
// if (id != null && id.length() > 0) {
|
|
| 226 |
// HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-"); |
|
| 227 |
// rules.put(id, properties); |
|
| 228 |
// } |
|
| 229 |
// } |
|
| 230 |
// } |
|
| 231 |
// } |
|
| 232 |
// File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp"); |
|
| 233 |
// injector.setProperties(rules); |
|
| 234 |
// if (injector.process(outFile) && outFile.exists()) {
|
|
| 235 |
// xmlFile.delete(); |
|
| 236 |
// outFile.renameTo(xmlFile); |
|
| 237 |
// } |
|
| 238 |
// else {
|
|
| 239 |
// Log.warning("Error while processing: " + xmlFile);
|
|
| 240 |
// } |
|
| 241 |
// System.out.println("ud properties injected in: "+(System.currentTimeMillis() - time));
|
|
| 242 |
// } |
|
| 243 |
// catch (IOException | XMLStreamException e) {
|
|
| 244 |
// // TODO Auto-generated catch block |
|
| 245 |
// e.printStackTrace(); |
|
| 246 |
// } |
|
| 129 | 247 |
|
| 130 |
if (p == null && ps == null) {
|
|
| 131 |
Log.warning("Warning: can't annotate. No 'lang' (String) or 'langs' (Map<String, String>) parameter specified in " + parameters);
|
|
| 132 |
return false; |
|
| 133 |
} |
|
| 134 |
|
|
| 135 |
if (ps != null && ps instanceof Map) {
|
|
| 136 |
Map<?, ?> map = (Map<?, ?>) ps; |
|
| 137 |
String text_id = xmlFile.getName(); |
|
| 138 |
if (map.get(text_id) != null) {
|
|
| 139 |
lang = map.get(text_id).toString().toLowerCase(); |
|
| 140 |
if (!canAnnotateLang(lang)) {
|
|
| 141 |
Log.warning("Warning: can't annotate text_id=${text_id} with $lang, will use the default lang=$p");
|
|
| 142 |
return false; |
|
| 143 |
} |
|
| 144 |
} |
|
| 145 |
} |
|
| 146 |
|
|
| 147 |
if (lang == null && p == null) {
|
|
| 148 |
System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters));
|
|
| 149 |
return false; |
|
| 150 |
} |
|
| 151 |
else {
|
|
| 152 |
lang = p.toString(); |
|
| 153 |
} |
|
| 154 |
|
|
| 155 |
if (!canAnnotateLang(lang)) {
|
|
| 156 |
return false; |
|
| 157 |
} |
|
| 158 |
|
|
| 159 |
File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
|
| 160 |
File modelFile = new File(modelsDirectory, lang + ".udpipe"); |
|
| 161 |
//System.out.println("model="+modelFile.getAbsolutePath());
|
|
| 162 |
|
|
| 163 |
try {
|
|
| 164 |
// get words |
|
| 165 |
XMLTXMToUDPipeXMLParser wparser = new XMLTXMToUDPipeXMLParser(xmlFile.toURI().toURL()); |
|
| 166 |
if (!wparser.process(null)) {
|
|
| 167 |
Log.warning("Error while parsing: " + xmlFile);
|
|
| 168 |
return false; |
|
| 169 |
} |
|
| 170 |
|
|
| 171 |
Sentences sentences = wparser.getSentences(); |
|
| 172 |
|
|
| 173 |
// System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences));
|
|
| 174 |
// tag |
|
| 175 |
UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences); |
|
| 176 |
|
|
| 177 |
// System.out.println("SENTENCES RESULT: " + UDPipeJavaUtils.toString(sentences));
|
|
| 178 |
|
|
| 179 |
// update XML-TXM files |
|
| 180 |
XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile); |
|
| 181 |
HashMap<String, HashMap<String, String>> rules = new HashMap<>(); |
|
| 182 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
|
| 183 |
Sentence sentence = sentences.get(iSentence); |
|
| 184 |
Words words = sentence.getWords(); |
|
| 185 |
for (int iWord = 0; iWord < words.size(); iWord++) {
|
|
| 186 |
Word word = words.get(iWord); |
|
| 187 |
String form = word.getForm(); |
|
| 188 |
if ("<root>".equals(form)) continue;
|
|
| 189 |
|
|
| 190 |
String misc = word.getMisc(); |
|
| 191 |
int idx = misc.indexOf(XMLIDMISC); |
|
| 192 |
int idx2 = misc.indexOf("|", idx + 6);
|
|
| 193 |
if (idx2 < 0) idx2 = misc.length(); |
|
| 194 |
String id = misc.substring(idx + 6, idx2); |
|
| 195 |
if (id != null && id.length() > 0) {
|
|
| 196 |
HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-"); |
|
| 197 |
rules.put(id, properties); |
|
| 198 |
} |
|
| 199 |
} |
|
| 200 |
} |
|
| 201 |
File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp"); |
|
| 202 |
injector.setProperties(rules); |
|
| 203 |
if (injector.process(outFile) && outFile.exists()) {
|
|
| 204 |
xmlFile.delete(); |
|
| 205 |
outFile.renameTo(xmlFile); |
|
| 206 |
} |
|
| 207 |
else {
|
|
| 208 |
Log.warning("Error while processing: " + xmlFile);
|
|
| 209 |
} |
|
| 210 |
} |
|
| 211 |
catch (IOException | XMLStreamException e) {
|
|
| 212 |
// TODO Auto-generated catch block |
|
| 213 |
e.printStackTrace(); |
|
| 214 |
} |
|
| 215 |
|
|
| 216 | 248 |
// update xml-txm files |
| 217 | 249 |
return true; |
| 218 | 250 |
} |
| TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/XMLTXMToUDPipeXMLParser.java (revision 3283) | ||
|---|---|---|
| 173 | 173 |
flagform = false; |
| 174 | 174 |
form = form.trim(); |
| 175 | 175 |
form = form.replace("\n", "").replace("<", "<");
|
| 176 |
Word word = new Word(); |
|
| 177 |
word.setForm(form); |
|
| 176 |
|
|
| 177 |
sentence.addWord(form); |
|
| 178 |
Word word = sentence.getWords().get((int) (sentence.getWords().size() - 1)); |
|
| 178 | 179 |
word.setMisc(UDPipeEngine.XMLIDMISC + wordId); |
| 179 |
sentence.getWords().add(word); |
|
| 180 | 180 |
} |
| 181 | 181 |
break; |
| 182 | 182 |
|
| ... | ... | |
| 195 | 195 |
} |
| 196 | 196 |
} |
| 197 | 197 |
|
| 198 |
if (sentence != null && sentence.getWords().size() > 0) {
|
|
| 198 |
if (sentence != null && sentence.getWords().size() > 1) { // a sent contains at least <root>
|
|
| 199 | 199 |
sentences.add(sentence); |
| 200 | 200 |
} |
| 201 | 201 |
|
| TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeJavaUtils.java (revision 3283) | ||
|---|---|---|
| 10 | 10 |
|
| 11 | 11 |
import cz.cuni.mff.ufal.udpipe.InputFormat; |
| 12 | 12 |
import cz.cuni.mff.ufal.udpipe.Model; |
| 13 |
import cz.cuni.mff.ufal.udpipe.MultiwordToken; |
|
| 14 |
import cz.cuni.mff.ufal.udpipe.MultiwordTokens; |
|
| 13 | 15 |
import cz.cuni.mff.ufal.udpipe.OutputFormat; |
| 14 | 16 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
| 15 | 17 |
import cz.cuni.mff.ufal.udpipe.Sentences; |
| ... | ... | |
| 42 | 44 |
|
| 43 | 45 |
model.parse(sent, ""); |
| 44 | 46 |
model.tag(sent, ""); |
| 45 |
// System.out.println(toString(sent)); |
|
| 47 |
// System.out.println(toString(sent));
|
|
| 46 | 48 |
} |
| 47 | 49 |
|
| 48 | 50 |
return sentences; |
| ... | ... | |
| 60 | 62 |
public static String toString(Sentence sent) {
|
| 61 | 63 |
StringBuilder buffer = new StringBuilder(); |
| 62 | 64 |
Words words = sent.getWords(); |
| 63 |
for (int i = 0; i < words.size(); i++) {
|
|
| 65 |
|
|
| 66 |
MultiwordTokens mwt = sent.getMultiwordTokens(); |
|
| 67 |
HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
| 68 |
HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
| 69 |
for (int i = 0 ; i < mwt.size(); i++) {
|
|
| 70 |
MultiwordToken mw = mwt.get(i); |
|
| 71 |
firstWordToToken.put(mw.getIdFirst(), mw); |
|
| 72 |
lastWordToToken.put(mw.getIdLast(), mw); |
|
| 73 |
} |
|
| 74 |
|
|
| 75 |
for (int i = 0 ; i < words.size(); i++) {
|
|
| 76 |
// MultiwordToken mw = mwt.get(i); |
|
| 77 |
// Word w = words.get(mw.getIdFirst()); |
|
| 78 |
|
|
| 64 | 79 |
Word w = words.get(i); |
| 80 |
|
|
| 65 | 81 |
if (i > 0) buffer.append(" ");
|
| 66 | 82 |
|
| 67 |
buffer.append(w.getForm()); |
|
| 68 |
buffer.append("/" + w.getLemma());
|
|
| 69 |
buffer.append("/" + w.getUpostag());
|
|
| 70 |
buffer.append("/" + w.getXpostag());
|
|
| 71 |
buffer.append("/" + w.getFeats());
|
|
| 72 |
buffer.append("/" + w.getDeps());
|
|
| 73 |
buffer.append("/" + w.getDeprel());
|
|
| 74 |
buffer.append("/" + w.getHead());
|
|
| 75 |
buffer.append("/" + w.getMisc());
|
|
| 83 |
if (firstWordToToken.containsKey(w.getId())) {
|
|
| 84 |
MultiwordToken mw = firstWordToToken.get(w.getId()); |
|
| 85 |
Word w2 = words.get(mw.getIdLast()); |
|
| 86 |
|
|
| 87 |
buffer.append(""+w.getId()+"+"+w2.getId());
|
|
| 88 |
buffer.append("/" + w.getForm()+"+"+w2.getForm());
|
|
| 89 |
buffer.append("/" + w.getLemma()+"+"+w2.getLemma());
|
|
| 90 |
buffer.append("/" + w.getUpostag()+"+"+w2.getUpostag());
|
|
| 91 |
buffer.append("/" + w.getXpostag()+"+"+w2.getXpostag());
|
|
| 92 |
buffer.append("/" + w.getFeats()+"+"+w2.getFeats());
|
|
| 93 |
buffer.append("/" + w.getDeps()+"+"+w2.getDeps());
|
|
| 94 |
buffer.append("/" + w.getDeprel()+"+"+w2.getDeprel());
|
|
| 95 |
buffer.append("/" + w.getHead()+"+"+w2.getHead());
|
|
| 96 |
buffer.append("/" + w.getMisc()+"+"+w2.getMisc());
|
|
| 97 |
} else if (lastWordToToken.containsKey(w.getId())) {
|
|
| 98 |
// already written |
|
| 99 |
} else {
|
|
| 100 |
|
|
| 101 |
buffer.append(w.getId()); |
|
| 102 |
buffer.append("/" + w.getForm());
|
|
| 103 |
buffer.append("/" + w.getLemma());
|
|
| 104 |
buffer.append("/" + w.getUpostag());
|
|
| 105 |
buffer.append("/" + w.getXpostag());
|
|
| 106 |
buffer.append("/" + w.getFeats());
|
|
| 107 |
buffer.append("/" + w.getDeps());
|
|
| 108 |
buffer.append("/" + w.getDeprel());
|
|
| 109 |
buffer.append("/" + w.getHead());
|
|
| 110 |
buffer.append("/" + w.getMisc());
|
|
| 111 |
} |
|
| 76 | 112 |
} |
| 77 | 113 |
return buffer.toString(); |
| 78 | 114 |
} |
| ... | ... | |
| 126 | 162 |
} |
| 127 | 163 |
|
| 128 | 164 |
public static void main(String[] args) {
|
| 129 |
// try {
|
|
| 130 |
// toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"),
|
|
| 131 |
// "Et un petit test... En deux phrases ? ou trois."); |
|
| 165 |
// try {
|
|
| 166 |
// toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"),
|
|
| 167 |
// "Et un petit test... En deux phrases ? ou trois."); |
|
| 168 |
// } |
|
| 169 |
// catch (UnsupportedEncodingException | FileNotFoundException e) {
|
|
| 170 |
// // TODO Auto-generated catch block |
|
| 171 |
// e.printStackTrace(); |
|
| 172 |
// } |
|
| 173 |
UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe"));
|
|
| 174 |
// System.out.println("VERSION: "+Version.current().toString());
|
|
| 175 |
// // /usr/lib/UDPipe/models/fr.udpipe |
|
| 176 |
// // /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe |
|
| 177 |
// for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe",
|
|
| 178 |
// "/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe", |
|
| 179 |
// "/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe", |
|
| 180 |
// "/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) {
|
|
| 181 |
// System.out.println("M="+p);
|
|
| 182 |
// System.out.println(toString(process(p, "Une plaine, des champs cultivés que traverse une grande route."))); |
|
| 132 | 183 |
// } |
| 133 |
// catch (UnsupportedEncodingException | FileNotFoundException e) {
|
|
| 134 |
// // TODO Auto-generated catch block |
|
| 135 |
// e.printStackTrace(); |
|
| 136 |
// } |
|
| 137 |
UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe"));
|
|
| 138 |
System.out.println("VERSION: "+Version.current().toString());
|
|
| 139 |
// /usr/lib/UDPipe/models/fr.udpipe |
|
| 140 |
// /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe |
|
| 141 |
for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe",
|
|
| 142 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe", |
|
| 143 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe", |
|
| 144 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) {
|
|
| 145 |
System.out.println("M="+p);
|
|
| 146 |
process(p, "Et un petit test... En deux phrases ? ou trois."); |
|
| 147 |
} |
|
| 184 |
|
|
| 185 |
Sentence sentence = new Sentence(); |
|
| 186 |
sentence.addWord("je");
|
|
| 187 |
sentence.addWord("suis");
|
|
| 188 |
sentence.addWord(".");
|
|
| 189 |
System.out.println("SENT="+toString(sentence));
|
|
| 190 |
|
|
| 148 | 191 |
} |
| 149 | 192 |
|
| 150 | 193 |
public static void processSentences(String modelPath, Sentences sentences) {
|
| 151 |
processSentences(Model.load(modelPath), sentences); |
|
| 194 |
long time = System.currentTimeMillis(); |
|
| 195 |
Model m = Model.load(modelPath); |
|
| 196 |
//System.out.println("Model load in: "+(System.currentTimeMillis()-time));
|
|
| 197 |
|
|
| 198 |
time = System.currentTimeMillis(); |
|
| 199 |
processSentences(m, sentences); |
|
| 200 |
//System.out.println("sent processed in: "+(System.currentTimeMillis()-time));
|
|
| 152 | 201 |
} |
| 153 | 202 |
|
| 154 | 203 |
public static void processSentences(Model model, Sentences sentences) {
|
| ... | ... | |
| 156 | 205 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
|
| 157 | 206 |
Sentence sent = sentences.get(iSentence); |
| 158 | 207 |
|
| 208 |
model.parse(sent, ""); |
|
| 159 | 209 |
model.tag(sent, ""); |
| 160 |
model.parse(sent, ""); |
|
| 210 |
|
|
| 161 | 211 |
//System.out.println(toString(sent)); |
| 162 | 212 |
} |
| 163 | 213 |
} |
| ... | ... | |
| 186 | 236 |
|
| 187 | 237 |
return properties; |
| 188 | 238 |
} |
| 239 |
|
|
| 240 |
/** |
|
| 241 |
* fill a map with values of "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" |
|
| 242 |
* |
|
| 243 |
* @param word |
|
| 244 |
* @param prefix |
|
| 245 |
* @return |
|
| 246 |
*/ |
|
| 247 |
public static HashMap<String, String> wordToHashMap(Sentence sentence, MultiwordToken mword, String prefix) {
|
|
| 248 |
if (prefix == null) prefix = ""; |
|
| 249 |
|
|
| 250 |
Word word = sentence.getWords().get(mword.getIdFirst()); |
|
| 251 |
Word word2 = sentence.getWords().get(mword.getIdFirst()); |
|
| 252 |
|
|
| 253 |
HashMap<String, String> properties = new HashMap<>(); |
|
| 254 |
properties.put(prefix + "id", Integer.toString(word.getId()) + "+" + Integer.toString(word2.getId())); |
|
| 255 |
properties.put(prefix + "form", word.getForm() + "+" + word2.getForm()); |
|
| 256 |
properties.put(prefix + "lemma", word.getLemma() + "+" + word2.getLemma()); |
|
| 257 |
properties.put(prefix + "upos", word.getUpostag() + "+" + word2.getUpostag()); |
|
| 258 |
properties.put(prefix + "xpos", word.getXpostag() + "+" + word2.getXpostag()); |
|
| 259 |
properties.put(prefix + "feats", word.getFeats() + "+" + word2.getFeats()); |
|
| 260 |
properties.put(prefix + "head", Integer.toString(word.getHead()) + "+" + Integer.toString(word2.getId())); |
|
| 261 |
properties.put(prefix + "deprel", word.getDeprel() + "+" + word2.getDeprel()); |
|
| 262 |
properties.put(prefix + "deps", word.getDeps() + "+" + word2.getDeps()); |
|
| 263 |
properties.put(prefix + "misc", word.getMisc() + "+" + word2.getMisc()); |
|
| 264 |
|
|
| 265 |
return properties; |
|
| 266 |
} |
|
| 189 | 267 |
} |
| TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/UDStringTokenizer.java (revision 3283) | ||
|---|---|---|
| 2 | 2 |
|
| 3 | 3 |
import java.io.File; |
| 4 | 4 |
import java.util.ArrayList; |
| 5 |
import java.util.Arrays; |
|
| 6 |
import java.util.HashMap; |
|
| 5 | 7 |
import java.util.List; |
| 6 | 8 |
|
| 7 | 9 |
import org.txm.tokenizer.StringTokenizer; |
| 8 | 10 |
|
| 9 | 11 |
import cz.cuni.mff.ufal.udpipe.InputFormat; |
| 10 | 12 |
import cz.cuni.mff.ufal.udpipe.Model; |
| 13 |
import cz.cuni.mff.ufal.udpipe.MultiwordToken; |
|
| 14 |
import cz.cuni.mff.ufal.udpipe.MultiwordTokens; |
|
| 11 | 15 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
| 16 |
import cz.cuni.mff.ufal.udpipe.Word; |
|
| 12 | 17 |
import cz.cuni.mff.ufal.udpipe.Words; |
| 13 | 18 |
|
| 14 | 19 |
|
| ... | ... | |
| 29 | 34 |
tokenizer = model.newTokenizer("");
|
| 30 | 35 |
} |
| 31 | 36 |
|
| 37 |
public final static List<String> ADDITIONAL_PROPERTIES = Arrays.asList("ud-id", "ud-form", "ud-lemma", "ud-upos", "ud-xpos", "ud-feats", "ud-head", "ud-deprel", "ud-deps", "ud-misc");
|
|
| 38 |
public List<String> getAdditionalProperties() {
|
|
| 39 |
return ADDITIONAL_PROPERTIES; |
|
| 40 |
} |
|
| 41 |
|
|
| 32 | 42 |
@Override |
| 33 |
public List<List<String>> processText(String text) {
|
|
| 34 |
ArrayList<List<String>> result = new ArrayList<>();
|
|
| 43 |
public ArrayList<ArrayList<ArrayList<String>>> processText(String text) {
|
|
| 44 |
ArrayList<ArrayList<ArrayList<String>>> result = new ArrayList<>();
|
|
| 35 | 45 |
|
| 36 |
|
|
| 37 | 46 |
tokenizer.setText(text); |
| 38 | 47 |
Sentence sent = new Sentence(); |
| 48 |
|
|
| 49 |
|
|
| 50 |
|
|
| 39 | 51 |
while (tokenizer.nextSentence(sent)) {
|
| 40 |
List<String> sresult = new ArrayList<>(); |
|
| 52 |
|
|
| 53 |
model.parse(sent, ""); |
|
| 54 |
model.tag(sent, ""); |
|
| 55 |
|
|
| 56 |
ArrayList<ArrayList<String>> sresult = new ArrayList<>(); |
|
| 41 | 57 |
Words words = sent.getWords(); |
| 58 |
|
|
| 59 |
MultiwordTokens mwt = sent.getMultiwordTokens(); |
|
| 60 |
HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
| 61 |
HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
| 62 |
for (int i = 0 ; i < mwt.size(); i++) {
|
|
| 63 |
MultiwordToken mw = mwt.get(i); |
|
| 64 |
firstWordToToken.put(mw.getIdFirst(), mw); |
|
| 65 |
lastWordToToken.put(mw.getIdLast(), mw); |
|
| 66 |
} |
|
| 67 |
|
|
| 68 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
|
|
| 42 | 69 |
for (int iWord = 0; iWord < words.size(); iWord++) {
|
| 43 |
sresult.add(words.get(iWord).getForm()); |
|
| 70 |
Word word = words.get(iWord); |
|
| 71 |
ArrayList<String> properties = new ArrayList<String>(); |
|
| 72 |
|
|
| 73 |
if (firstWordToToken.containsKey(word.getId())) {
|
|
| 74 |
MultiwordToken mw = firstWordToToken.get(word.getId()); |
|
| 75 |
Word word2 = words.get(mw.getIdLast()); |
|
| 76 |
|
|
| 77 |
properties.add(mw.getForm()); |
|
| 78 |
properties.add(""+word.getId());
|
|
| 79 |
properties.add(word.getForm() + "+" + word2.getForm()); |
|
| 80 |
properties.add(word.getLemma() + "+" + word2.getLemma()); |
|
| 81 |
properties.add(word.getUpostag() + "+" + word2.getUpostag()); |
|
| 82 |
properties.add(word.getXpostag() + "+" + word2.getXpostag()); |
|
| 83 |
properties.add(word.getFeats() + "+" + word2.getFeats()); |
|
| 84 |
properties.add(Integer.toString(word.getHead()) + "+" + word2.getHead()); |
|
| 85 |
properties.add(word.getDeprel() + "+" + word2.getDeprel()); |
|
| 86 |
properties.add(word.getDeps() + "+" + word2.getDeps()); |
|
| 87 |
properties.add(mw.getMisc() + "+" + word2.getMisc()); |
|
| 88 |
|
|
| 89 |
sresult.add(properties); |
|
| 90 |
} else if (lastWordToToken.containsKey(word.getId())) {
|
|
| 91 |
// already written |
|
| 92 |
} else {
|
|
| 93 |
properties.add(word.getForm()); |
|
| 94 |
properties.add(""+word.getId());
|
|
| 95 |
properties.add(word.getForm()); |
|
| 96 |
properties.add(word.getLemma()); |
|
| 97 |
properties.add(word.getUpostag()); |
|
| 98 |
properties.add(word.getXpostag()); |
|
| 99 |
properties.add(word.getFeats()); |
|
| 100 |
properties.add(Integer.toString(word.getHead())); |
|
| 101 |
properties.add(word.getDeprel()); |
|
| 102 |
properties.add(word.getDeps()); |
|
| 103 |
properties.add(word.getMisc()); |
|
| 104 |
|
|
| 105 |
sresult.add(properties); |
|
| 106 |
} |
|
| 44 | 107 |
} |
| 45 | 108 |
if (sresult.size() > 0) {
|
| 46 | 109 |
sresult.remove(0); // remove the <root> element |
| ... | ... | |
| 59 | 122 |
return true; |
| 60 | 123 |
} |
| 61 | 124 |
} |
| 125 |
|
|
| TXM/branches/eltec/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3283) | ||
|---|---|---|
| 270 | 270 |
println "Result : "+tokenizer.processText(text).collect{"<"+it+">"}
|
| 271 | 271 |
} |
| 272 | 272 |
} |
| 273 |
|
|
| 274 |
/**
 * This tokenizer produces no additional word properties.
 *
 * @return an empty list
 */
@Override |
|
| 275 |
public List<String> getAdditionalProperties() {
|
|
| 276 |
return Arrays.asList(); |
|
| 277 |
} |
|
| 273 | 278 |
} |
| TXM/branches/eltec/org.txm.tokenizer.core/src/org/txm/tokenizer/StringTokenizer.java (revision 3283) | ||
|---|---|---|
| 1 | 1 |
package org.txm.tokenizer; |
| 2 | 2 |
|
| 3 |
import java.util.ArrayList; |
|
| 3 | 4 |
import java.util.List; |
| 4 | 5 |
|
| 5 | 6 |
public interface StringTokenizer {
|
| 6 | 7 |
|
| 7 |
List<List<String>> processText(String text);
|
|
| 8 |
ArrayList<ArrayList<ArrayList<String>>> processText(String text);
|
|
| 8 | 9 |
|
| 9 | 10 |
boolean doSentences(); |
| 11 |
|
|
| 12 |
List<String> getAdditionalProperties(); |
|
| 10 | 13 |
} |
| TXM/branches/eltec/org.txm.internalview.rcp/src/org/txm/internalview/rcp/editors/InternalViewEditor.java (revision 3283) | ||
|---|---|---|
| 9 | 9 |
import org.eclipse.jface.viewers.ISelectionChangedListener; |
| 10 | 10 |
import org.eclipse.jface.viewers.IStructuredContentProvider; |
| 11 | 11 |
import org.eclipse.jface.viewers.SelectionChangedEvent; |
| 12 |
import org.eclipse.jface.viewers.StructuredSelection; |
|
| 12 | 13 |
import org.eclipse.jface.viewers.TableViewer; |
| 13 | 14 |
import org.eclipse.jface.viewers.TableViewerColumn; |
| 14 | 15 |
import org.eclipse.jface.viewers.Viewer; |
| ... | ... | |
| 37 | 38 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
| 38 | 39 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
| 39 | 40 |
import org.txm.searchengine.cqp.corpus.Property; |
| 41 |
import org.txm.searchengine.cqp.corpus.StructuralUnit; |
|
| 40 | 42 |
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty; |
| 41 | 43 |
import org.txm.searchengine.cqp.corpus.WordProperty; |
| 42 | 44 |
import org.txm.searchengine.cqp.corpus.query.Match; |
| ... | ... | |
| 114 | 116 |
structComboLabel.setText(InternalViewUIMessages.structure); |
| 115 | 117 |
structComboLabel.setLayoutData(new GridData(GridData.CENTER, GridData.CENTER, false, true)); |
| 116 | 118 |
this.structuralUnitsComboViewer = new StructuralUnitsComboViewer(parametersArea, this, true); |
| 119 |
|
|
| 117 | 120 |
// Listener |
| 118 | 121 |
this.structuralUnitsComboViewer.addSelectionChangedListener(new ISelectionChangedListener() {
|
| 119 | 122 |
|
| ... | ... | |
| 124 | 127 |
} |
| 125 | 128 |
}); |
| 126 | 129 |
|
| 130 |
|
|
| 127 | 131 |
// Word properties selector |
| 128 | 132 |
propertiesSelector = new PropertiesSelector<>(parametersArea); |
| 129 | 133 |
propertiesSelector.setLayoutData(new GridData(GridData.CENTER, GridData.CENTER, false, true)); |
| TXM/branches/eltec/org.txm.core/src/java/org/txm/xml/StaxDomConstructor.java (revision 3283) | ||
|---|---|---|
| 2 | 2 |
|
| 3 | 3 |
import java.io.Reader; |
| 4 | 4 |
import java.io.StringReader; |
| 5 |
import java.util.Arrays; |
|
| 5 | 6 |
|
| 6 | 7 |
import javax.xml.parsers.ParserConfigurationException; |
| 7 | 8 |
import javax.xml.stream.XMLInputFactory; |
| ... | ... | |
| 77 | 78 |
elements++; |
| 78 | 79 |
break; |
| 79 | 80 |
case XMLStreamConstants.CHARACTERS: |
| 81 |
//char[] tmp = Arrays.copyOfRange(parser.getTextCharacters(), parser.getTextStart(), parser.getTextStart()+parser.getTextLength()); |
|
| 82 |
//System.out.println("T='"+Arrays.toString(tmp)+"'");
|
|
| 80 | 83 |
Text textNode = doc.createTextNode(parser.getText()); |
| 81 | 84 |
currentElement.appendChild(textNode); |
| 82 | 85 |
break; |
| TXM/branches/eltec/org.txm.core/src/java/org/txm/xml/DOMIdentityHook.java (revision 3283) | ||
|---|---|---|
| 73 | 73 |
parentParser.writer.writeCharacters("\n");
|
| 74 | 74 |
Element e = (Element) node; |
| 75 | 75 |
NodeList children = e.getChildNodes(); |
| 76 |
String ns = e.getNamespaceURI(); |
|
| 77 |
String ln = e.getLocalName(); |
|
| 76 | 78 |
if (children.getLength() > 0) {
|
| 77 |
String ns = e.getNamespaceURI(); |
|
| 78 |
String ln = e.getLocalName(); |
|
| 79 | 79 |
if (ns == null) {
|
| 80 | 80 |
parentParser.writer.writeStartElement(ln); |
| 81 | 81 |
} else {
|
| ... | ... | |
| 83 | 83 |
} |
| 84 | 84 |
} |
| 85 | 85 |
else {
|
| 86 |
parentParser.writer.writeEmptyElement(e.getNamespaceURI(), e.getLocalName()); |
|
| 86 |
if (ns == null) {
|
|
| 87 |
parentParser.writer.writeEmptyElement(ln); |
|
| 88 |
} else {
|
|
| 89 |
parentParser.writer.writeEmptyElement(ns, ln); |
|
| 90 |
} |
|
| 87 | 91 |
} |
| 88 | 92 |
|
| 89 | 93 |
for (int i = 0; i < e.getAttributes().getLength(); i++) {
|
| ... | ... | |
| 102 | 106 |
} |
| 103 | 107 |
if (children.getLength() > 0) {
|
| 104 | 108 |
parentParser.writer.writeEndElement(); |
| 105 |
parentParser.writer.writeCharacters("\n");
|
|
| 109 |
//parentParser.writer.writeCharacters("\n");
|
|
| 106 | 110 |
} |
| 107 | 111 |
} |
| 108 | 112 |
else if (node.getNodeType() == Node.TEXT_NODE) {
|
| 109 | 113 |
parentParser.writer.writeCharacters(node.getTextContent()); |
| 114 |
} else if (node.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
|
|
| 115 |
parentParser.writer.writeProcessingInstruction(node.getNodeName(), node.getNodeValue()); |
|
| 116 |
} else if (node.getNodeType() == Node.CDATA_SECTION_NODE) {
|
|
| 117 |
parentParser.writer.writeCData(node.getNodeValue()); |
|
| 118 |
} else if (node.getNodeType() == Node.COMMENT_NODE) {
|
|
| 119 |
parentParser.writer.writeComment(node.getNodeValue()); |
|
| 110 | 120 |
} |
| 111 | 121 |
} |
| 112 | 122 |
|
| TXM/branches/eltec/org.txm.internalview.core/src/org/txm/internalview/core/functions/InternalView.java (revision 3283) | ||
|---|---|---|
| 3 | 3 |
import java.io.File; |
| 4 | 4 |
import java.io.PrintWriter; |
| 5 | 5 |
import java.util.ArrayList; |
| 6 |
import java.util.Arrays; |
|
| 6 | 7 |
import java.util.Collection; |
| 7 | 8 |
import java.util.HashMap; |
| 8 | 9 |
import java.util.LinkedHashMap; |
| ... | ... | |
| 91 | 92 |
public boolean loadParameters() throws Exception {
|
| 92 | 93 |
try {
|
| 93 | 94 |
String str = this.getStringParameterValue(TXMPreferences.STRUCTURAL_UNIT); |
| 94 |
this.pStructuralUnit = this.getCorpus().getStructuralUnit(str); |
|
| 95 |
if (str.length() == 0) {
|
|
| 96 |
try {
|
|
| 97 |
StructuralUnit struct = this.getCorpus().getStructuralUnit("text");
|
|
| 98 |
if (struct != null) {
|
|
| 99 |
this.pStructuralUnit = struct; |
|
| 100 |
} |
|
| 101 |
struct = this.getCorpus().getStructuralUnit("div");
|
|
| 102 |
if (struct != null) {
|
|
| 103 |
this.pStructuralUnit = struct; |
|
| 104 |
} |
|
| 105 |
struct = this.getCorpus().getStructuralUnit("p");
|
|
| 106 |
if (struct != null) {
|
|
| 107 |
this.pStructuralUnit = struct; |
|
| 108 |
} |
|
| 109 |
} |
|
| 110 |
catch (CqiClientException e1) {
|
|
| 111 |
// TODO Auto-generated catch block |
|
| 112 |
e1.printStackTrace(); |
|
| 113 |
} |
|
| 114 |
} else {
|
|
| 115 |
this.pStructuralUnit = this.getCorpus().getStructuralUnit(str); |
|
| 116 |
} |
|
| 117 |
|
|
| 95 | 118 |
} |
| 96 | 119 |
catch (Exception e) {
|
| 97 | 120 |
Log.printStackTrace(e); |
| ... | ... | |
| 105 | 128 |
} |
| 106 | 129 |
try {
|
| 107 | 130 |
String str = this.getStringParameterValue(TXMPreferences.STRUCTURAL_UNIT_PROPERTIES); |
| 108 |
this.pStructuralUnitsProperties = StructuralUnitProperty.stringToProperties(this.getCorpus(), str); |
|
| 131 |
if (str.length() == 0) {
|
|
| 132 |
StructuralUnitProperty structP = pStructuralUnit.getProperty("id");
|
|
| 133 |
if (structP != null) {
|
|
| 134 |
this.pStructuralUnitsProperties = Arrays.asList(structP); |
|
| 135 |
} |
|
| 136 |
structP = pStructuralUnit.getProperty("n");
|
|
| 137 |
if (structP != null) {
|
|
| 138 |
this.pStructuralUnitsProperties = Arrays.asList(structP); |
|
| 139 |
} |
|
| 140 |
} else {
|
|
| 141 |
this.pStructuralUnitsProperties = StructuralUnitProperty.stringToProperties(this.getCorpus(), str); |
|
| 142 |
} |
|
| 109 | 143 |
} |
| 110 | 144 |
catch (Exception e3) {
|
| 111 | 145 |
Log.printStackTrace(e3); |
| ... | ... | |
| 365 | 399 |
this.setDirty(); |
| 366 | 400 |
} |
| 367 | 401 |
|
| 368 |
|
|
| 369 |
|
|
| 370 |
|
|
| 371 | 402 |
@Override |
| 372 | 403 |
public boolean setParameters(TXMParameters parameters) {
|
| 373 | 404 |
System.err.println("InternalView.setParameters(): not yet implemented.");
|
| TXM/branches/eltec/org.txm.internalview.core/src/org/txm/internalview/core/preferences/InternalViewPreferences.java (revision 3283) | ||
|---|---|---|
| 37 | 37 |
super.initializeDefaultPreferences(); |
| 38 | 38 |
Preferences preferences = this.getDefaultPreferencesNode(); |
| 39 | 39 |
|
| 40 |
preferences.put(STRUCTURAL_UNIT, TBXPreferences.DEFAULT_STRUCTURAL_UNIT);
|
|
| 40 |
preferences.put(STRUCTURAL_UNIT, "");
|
|
| 41 | 41 |
preferences.put(UNIT_PROPERTIES, TBXPreferences.DEFAULT_UNIT_PROPERTY); |
| 42 |
preferences.put(STRUCTURAL_UNIT_PROPERTIES, TBXPreferences.DEFAULT_STRUCTURAL_UNIT_PROPERTY);
|
|
| 42 |
preferences.put(STRUCTURAL_UNIT_PROPERTIES, "");
|
|
| 43 | 43 |
preferences.putInt(CURRENT_PAGE, 0); |
| 44 | 44 |
} |
| 45 | 45 |
} |
Formats disponibles : Unified diff