Révision 3283
TXM/branches/eltec/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/ChunkTokenizerXml.groovy (revision 3283) | ||
---|---|---|
41 | 41 |
// $LastChangedRevision:$ |
42 | 42 |
// $LastChangedBy:$ |
43 | 43 |
// |
44 |
package org.txm.tokenizer;
|
|
44 |
package org.txm.scripts.filters.Tokeniser;
|
|
45 | 45 |
|
46 | 46 |
import static groovy.transform.TypeCheckingMode.SKIP |
47 | 47 |
import groovy.transform.CompileStatic |
... | ... | |
52 | 52 |
|
53 | 53 |
import javax.xml.stream.* |
54 | 54 |
|
55 |
import org.eclipse.ui.internal.dialogs.NewContentTypeDialog |
|
55 | 56 |
import org.txm.importer.PersonalNamespaceContext |
56 | 57 |
import org.txm.tokenizer.StringTokenizer |
57 | 58 |
import org.txm.tokenizer.SimpleStringTokenizer |
... | ... | |
63 | 64 |
import org.txm.xml.XPathHookActivator |
64 | 65 |
import org.w3c.dom.Node |
65 | 66 |
|
66 |
@CompileStatic |
|
67 | 67 |
public class ChunkTokenizerXml extends XMLProcessor { |
68 | 68 |
|
69 | 69 |
XPathHookActivator activator; |
... | ... | |
119 | 119 |
/** The prefix. */ |
120 | 120 |
String prefix; |
121 | 121 |
String filename; |
122 |
int wordcount = 0; |
|
122 | 123 |
|
123 | 124 |
Pattern regLN; |
124 | 125 |
Pattern regCTRL; |
... | ... | |
139 | 140 |
*/ |
140 | 141 |
public ChunkTokenizerXml(File infile, TokenizerClasses tc) { |
141 | 142 |
super(infile) |
143 |
|
|
142 | 144 |
this.lang = tc.lang; |
143 | 145 |
this.stringTokenizer = new SimpleStringTokenizer(lang); |
144 | 146 |
|
... | ... | |
158 | 160 |
regLN = Pattern.compile("/\n/"); |
159 | 161 |
regCTRL = Pattern.compile("/\\p{C}/"); |
160 | 162 |
|
161 |
activator = new XPathHookActivator<>(hook, "//div|p|ab");
|
|
163 |
activator = new XPathHookActivator<>(hook, "//(div|p|ab|note|s|list|head|front|body|back|text)");
|
|
162 | 164 |
|
163 | 165 |
hook = new DOMIdentityHook("in_text_hook", activator, this) { |
164 |
|
|
165 |
String id; |
|
166 |
|
|
167 |
boolean inAna = false; |
|
168 |
|
|
169 |
boolean inForm = false; |
|
170 |
|
|
171 |
boolean inW = false; |
|
172 |
|
|
173 |
ArrayList<String[]> anaValues = new ArrayList<>(); |
|
174 |
|
|
175 |
ArrayList<String[]> formValues = new ArrayList<>(); |
|
176 |
|
|
177 |
StringBuilder value = new StringBuilder(); |
|
178 |
|
|
179 |
String resp = ""; |
|
180 |
|
|
181 |
String type = ""; |
|
182 |
|
|
183 |
/** |
|
166 |
|
|
167 |
String id;
|
|
168 |
|
|
169 |
boolean inAna = false;
|
|
170 |
|
|
171 |
boolean inForm = false;
|
|
172 |
|
|
173 |
boolean inW = false;
|
|
174 |
|
|
175 |
ArrayList<String[]> anaValues = new ArrayList<>();
|
|
176 |
|
|
177 |
ArrayList<String[]> formValues = new ArrayList<>();
|
|
178 |
|
|
179 |
StringBuilder value = new StringBuilder();
|
|
180 |
|
|
181 |
String resp = "";
|
|
182 |
|
|
183 |
String type = "";
|
|
184 |
|
|
185 |
/**
|
|
184 | 186 |
* extends this method to process the DOM before it is written |
185 | 187 |
*/ |
186 |
public void processDom() { |
|
187 |
|
|
188 |
ArrayList<Node> textNodes = getTextNodes(dom); |
|
189 |
if (textNodes.size() == 0) return; // easy |
|
190 |
|
|
191 |
StringBuilder buffer = new StringBuilder(); // build a string to tokenize |
|
192 |
for (Node textNode : textNodes) { |
|
193 |
buffer.append(" "+textNode.getTextContent()); |
|
194 |
} |
|
195 |
|
|
196 |
int nNode = 0; |
|
197 |
Node currentTextNode = textNodes.get(0); |
|
198 |
String currentText = currentTextNode.getTextContent(); |
|
199 |
int curentTextIndex = 0; |
|
200 |
StringBuilder currentNewText = new StringBuilder() |
|
201 |
ArrayList<String> currentWords = new ArrayList<String>() |
|
202 |
List<List<String>> sentences = stringTokenizer.processText(buffer.toString()); |
|
203 |
//println "text="+buffer.toString() |
|
204 |
println "sentences=$sentences" |
|
205 |
for (List<String> sent : sentences) { |
|
206 |
if (nNode >= textNodes.size()) { // all nodes are updated |
|
207 |
break; |
|
208 |
} |
|
209 |
|
|
210 |
for (String word : sent) { |
|
211 |
if (nNode >= textNodes.size()) { // all nodes are updated |
|
212 |
break; |
|
188 |
public void processDom() { |
|
189 |
|
|
190 |
//println "Processing DOM with $stringTokenizer" |
|
191 |
ArrayList<Node> textNodes = getTextNodes(dom); |
|
192 |
if (textNodes.size() == 0) return; // easy |
|
193 |
|
|
194 |
StringBuilder buffer = new StringBuilder(); // build a string to tokenize |
|
195 |
for (Node textNode : textNodes) { |
|
196 |
buffer.append(" "+textNode.getTextContent()); |
|
213 | 197 |
} |
214 | 198 |
|
215 |
int idx = currentText.indexOf(word, curentTextIndex); |
|
216 |
if (idx >= 0) { |
|
217 |
curentTextIndex = idx + word.length(); |
|
218 |
} else { |
|
219 |
println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords" |
|
220 |
currentTextNode.setTextContent(""); |
|
221 |
for (String w : currentWords) { |
|
222 |
Node newChild = dom.getOwnerDocument().createElementNS(null, "w"); |
|
223 |
newChild.setAttribute("id", "W_ID") |
|
224 |
newChild.setTextContent(w); |
|
199 |
int nNode = 0; |
|
200 |
Node currentTextNode = textNodes.get(0); |
|
201 |
String currentText = currentTextNode.getTextContent(); |
|
202 |
int curentTextIndex = 0; |
|
203 |
StringBuilder currentNewText = new StringBuilder() |
|
204 |
ArrayList<String> currentWords = new ArrayList<String>() |
|
205 |
//println "text="+buffer.toString() |
|
206 |
List<List<List<String>>> sentences = stringTokenizer.processText(buffer.toString()); |
|
207 |
//println "sentences=$sentences" |
|
208 |
for (List<List<String>> sent : sentences) { |
|
209 |
//println "sent: $sent" |
|
210 |
for (int iWord = 0 ; iWord < sent.size() ; iWord++) { |
|
225 | 211 |
|
226 |
currentTextNode.getParentNode().insertBefore(newChild, currentTextNode) |
|
212 |
List<String> word = sent.get(iWord) |
|
213 |
int idx = currentText.indexOf(word[0], curentTextIndex); |
|
214 |
//println "TEST: $word at $curentTextIndex in $currentText" |
|
215 |
if (idx >= 0) { // the whole word is in the current node text content |
|
216 |
curentTextIndex = idx + word[0].length(); |
|
217 |
currentWords.add(word) |
|
218 |
continue // NEXT WORD |
|
219 |
} |
|
220 |
|
|
221 |
// see if the word is partially in the current text node |
|
222 |
// String partialWord = "" |
|
223 |
// for (int c = 0 ; c < word.length() - 1 ; c++) { |
|
224 |
// idx = currentText.indexOf(word.substring(0, word.length() - c), curentTextIndex); |
|
225 |
// if (idx >= 0) { // the partial word is in the current node text content |
|
226 |
// partialWord = word.substring(0, word.length() - c) |
|
227 |
// break; |
|
228 |
// } |
|
229 |
// } |
|
230 |
// if (partialWord.length() > 0) { // yes, the word is partially in the text node |
|
231 |
// //println "partialword=$partialWord" |
|
232 |
// def sword = word.substring(partialWord.length()) |
|
233 |
// //println "sword=$sword" |
|
234 |
// if (sword.length() > 0) { // should always happen |
|
235 |
// sent.set(iWord, sword) |
|
236 |
// //iWord-- // to re-process the word |
|
237 |
// } |
|
238 |
// curentTextIndex = idx + partialWord.length() |
|
239 |
// currentWords.add(partialWord.toString()) |
|
240 |
// } |
|
241 |
|
|
242 |
if (currentWords.size() > 0) { |
|
243 |
writeWords(nNode, currentText, curentTextIndex, currentTextNode, currentWords) |
|
244 |
} |
|
245 |
|
|
246 |
if (nNode < textNodes.size() - 1) { |
|
247 |
currentNewText = new StringBuilder() |
|
248 |
curentTextIndex = 0; |
|
249 |
nNode++; |
|
250 |
|
|
251 |
currentTextNode = textNodes.get(nNode); |
|
252 |
currentText = currentTextNode.getTextContent(); |
|
253 |
|
|
254 |
iWord--; // |
|
255 |
} else { |
|
256 |
currentWords.add(word) |
|
257 |
} |
|
227 | 258 |
} |
228 |
currentTextNode.getParentNode().removeChild(currentTextNode) |
|
229 | 259 |
|
230 |
currentNewText = new StringBuilder() |
|
231 |
currentWords.clear(); |
|
232 |
curentTextIndex = 0; |
|
233 |
nNode++; |
|
234 |
if (nNode < textNodes.size()) { |
|
235 |
currentTextNode = textNodes.get(nNode); |
|
236 |
currentText = currentTextNode.getTextContent(); |
|
260 |
if (currentWords.size() > 0) { |
|
261 |
writeWords(nNode, currentText, curentTextIndex, currentTextNode, currentWords) |
|
237 | 262 |
} |
238 | 263 |
|
264 |
if (stringTokenizer.doSentences()) { |
|
265 |
Node newChild = dom.getOwnerDocument().createProcessingInstruction("txm", "</s>") |
|
266 |
currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)//appendChild(newChild)// |
|
267 |
} |
|
239 | 268 |
} |
240 |
|
|
241 |
currentWords.add(word) |
|
242 | 269 |
} |
243 |
} |
|
244 |
}
|
|
245 |
|
|
246 |
public ArrayList<Node> getTextNodes(Node element) {
|
|
247 |
def children = element.getChildNodes()
|
|
248 |
ArrayList<Node> texts = new ArrayList<Node>()
|
|
249 |
for (int i = 0 ; i < children.getLength() ; i++) {
|
|
250 |
def node = children.item(i);
|
|
251 |
if (node.getNodeType() == Node.TEXT_NODE) {
|
|
252 |
texts.add(node)
|
|
253 |
} else if (node.getNodeType() == Node.ELEMENT_NODE) {
|
|
254 |
if (node.getLocalName().equals("w")) {
|
|
255 |
texts.add(node)
|
|
256 |
} else { |
|
257 |
texts.addAll(getTextNodes(node));
|
|
270 |
|
|
271 |
public void writeWords(def nNode, def currentText, def curentTextIndex, def currentTextNode, def currentWords) {
|
|
272 |
//println "WRITING: nNode=$nNode currentText='${currentText.replace("\\n", " ")}' index=$curentTextIndex words=$currentWords" |
|
273 |
currentTextNode.setTextContent("");
|
|
274 |
for (def w : currentWords) {
|
|
275 |
Node newChild = dom.getOwnerDocument().createElementNS(null, "w");
|
|
276 |
wordcount++
|
|
277 |
newChild.setAttribute("id", "w_"+filename+"_"+wordcount)
|
|
278 |
for (int i = 0 ; i < stringTokenizer.getAdditionalProperties().size() ; i++) {
|
|
279 |
//println "write att: "+stringTokenizer.getAdditionalProperties()[i]+"="+w[i+1]
|
|
280 |
newChild.setAttribute(stringTokenizer.getAdditionalProperties()[i], w[i+1])
|
|
281 |
}
|
|
282 |
newChild.setTextContent(w[0]);
|
|
283 |
|
|
284 |
currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)
|
|
258 | 285 |
} |
286 |
currentWords.clear(); |
|
259 | 287 |
} |
260 |
} |
|
261 |
return texts; |
|
262 |
} |
|
263 |
}; |
|
288 |
|
|
289 |
public ArrayList<Node> getTextNodes(Node element) { |
|
290 |
def children = element.getChildNodes() |
|
291 |
ArrayList<Node> texts = new ArrayList<Node>() |
|
292 |
for (int i = 0 ; i < children.getLength() ; i++) { |
|
293 |
def node = children.item(i); |
|
294 |
if (node.getNodeType() == Node.TEXT_NODE && node.getTextContent().trim().length() > 0) { |
|
295 |
texts.add(node) |
|
296 |
} else if (node.getNodeType() == Node.ELEMENT_NODE) { |
|
297 |
if (node.getLocalName().equals("w")) { |
|
298 |
if (retokenize) { |
|
299 |
//texts.add(node) |
|
300 |
} |
|
301 |
} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(node.getLocalName()).matches()) { |
|
302 |
//texts.add(node) |
|
303 |
} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(node.getLocalName()).matches()) { // ignore the tag and its content |
|
304 |
//texts.add(node) |
|
305 |
} else { |
|
306 |
texts.addAll(getTextNodes(node)); |
|
307 |
} |
|
308 |
} |
|
309 |
} |
|
310 |
return texts; |
|
311 |
} |
|
312 |
}; |
|
264 | 313 |
} |
265 | 314 |
|
266 | 315 |
/** |
... | ... | |
411 | 460 |
} |
412 | 461 |
|
413 | 462 |
|
414 |
|
|
415 |
/** The wordcount. */ |
|
416 |
int wordcount = 0; |
|
417 |
|
|
418 | 463 |
/** The ignorecontent. */ |
419 | 464 |
boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag |
420 | 465 |
boolean insideword = false; |
... | ... | |
628 | 673 |
text = regLN.matcher(text).replaceAll(WHITESPACE); |
629 | 674 |
text = regCTRL.matcher(text).replaceAll(EMPTY); // remove ctrl characters |
630 | 675 |
|
676 |
println "tokenize text" |
|
631 | 677 |
def sentences = stringTokenizer.processText(text); |
632 | 678 |
for (def words : sentences) { |
633 | 679 |
for (def word : words) { |
634 | 680 |
wordcount++; |
635 | 681 |
writer.writeStartElement(word_element_to_create); |
636 | 682 |
writeWordAttributes();// id |
637 |
writer.writeCharacters(word); |
|
683 |
for (int i = 0 ; i < stringTokenizer.getAdditionalProperties() ; i++) { |
|
684 |
String att = stringTokenizer.getAdditionalProperties().get(i) |
|
685 |
writer.writeAttribute(att, word[i+1]) |
|
686 |
} |
|
687 |
writer.writeCharacters(word[0]); |
|
638 | 688 |
writer.writeEndElement(); |
639 | 689 |
writer.writeCharacters("\n"); |
690 |
println "WRITE WORD: "+word[0]+" in "+writer |
|
640 | 691 |
} |
641 | 692 |
if (stringTokenizer.doSentences()) { |
642 | 693 |
writer.writeProcessingInstruction("txm", "</s>") |
TXM/branches/eltec/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3283) | ||
---|---|---|
518 | 518 |
wordcount++; |
519 | 519 |
writer.writeStartElement(word_element_to_create); |
520 | 520 |
writeWordAttributes();// id |
521 |
writer.writeCharacters(word); |
|
521 |
for (int i = 0 ; i < stringTokenizer.getAdditionalProperties() ; i++) { |
|
522 |
String att = stringTokenizer.getAdditionalProperties().get(i) |
|
523 |
writer.writeAttribute(att, word[i+1]) |
|
524 |
} |
|
525 |
writer.writeCharacters(word[0]); |
|
522 | 526 |
writer.writeEndElement(); |
523 | 527 |
writer.writeCharacters("\n"); |
524 | 528 |
} |
TXM/branches/eltec/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 3283) | ||
---|---|---|
1 | 1 |
package org.txm.scripts.importer.xtz |
2 | 2 |
|
3 |
import org.txm.scripts.filters.Tokeniser.SimpleTokenizerXml
|
|
3 |
import org.txm.scripts.filters.Tokeniser.ChunkTokenizerXml
|
|
4 | 4 |
|
5 | 5 |
import java.io.File; |
6 | 6 |
|
... | ... | |
390 | 390 |
cpb.tick() |
391 | 391 |
File infile = f; |
392 | 392 |
File outfile = new File(module.getBinaryDirectory(),"tokenized/"+f.getName()); |
393 |
SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
|
|
394 |
if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ? |
|
393 |
ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(infile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
|
|
394 |
//if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ?
|
|
395 | 395 |
String engineName = module.getProject().getImportParameters().node("annotate").get("engine", "TreeTagger") |
396 | 396 |
def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine(engineName) |
397 | 397 |
def stringTokenizer = engine.getStringTokenizer(lang) |
398 | 398 |
if (stringTokenizer != null) { |
399 | 399 |
tokenizer.setStringTokenizer(stringTokenizer) |
400 | 400 |
} |
401 |
} |
|
401 |
//}
|
|
402 | 402 |
tokenizer.setRetokenize(retokenize) |
403 | 403 |
if (outSideTextTagsRegex != null && outSideTextTagsRegex.trim().length() > 0) { |
404 | 404 |
tokenizer.setOutSideTextTags(outSideTextTagsRegex) |
... | ... | |
413 | 413 |
} |
414 | 414 |
|
415 | 415 |
// tokenize ! |
416 |
if (!tokenizer.process()) { |
|
416 |
if (!tokenizer.process(outfile)) {
|
|
417 | 417 |
println("Failed to process "+f) |
418 | 418 |
outfile.delete() |
419 | 419 |
} |
TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeEngine.java (revision 3283) | ||
---|---|---|
20 | 20 |
import org.txm.utils.logger.Log; |
21 | 21 |
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection; |
22 | 22 |
|
23 |
import cz.cuni.mff.ufal.udpipe.InputFormat;
|
|
24 |
import cz.cuni.mff.ufal.udpipe.Model;
|
|
23 |
import cz.cuni.mff.ufal.udpipe.MultiwordToken;
|
|
24 |
import cz.cuni.mff.ufal.udpipe.MultiwordTokens;
|
|
25 | 25 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
26 | 26 |
import cz.cuni.mff.ufal.udpipe.Sentences; |
27 | 27 |
import cz.cuni.mff.ufal.udpipe.Version; |
... | ... | |
118 | 118 |
*/ |
119 | 119 |
public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
120 | 120 |
|
121 |
|
|
121 |
/** |
|
122 |
* TODO re-think the process. The UDPStringTokenizer does the job |
|
123 |
*/ |
|
122 | 124 |
@Override |
123 | 125 |
public boolean processFile(File xmlFile, File binaryCorpusDirectory, HashMap<String, Object> parameters) { |
124 | 126 |
if (!isRunning()) return false; |
125 | 127 |
|
126 |
String lang = null; |
|
127 |
Object ps = parameters.get("langs"); |
|
128 |
Object p = parameters.get("lang"); |
|
128 |
// String lang = null; |
|
129 |
// Object ps = parameters.get("langs"); |
|
130 |
// Object p = parameters.get("lang"); |
|
131 |
// |
|
132 |
// if (p == null && ps == null) { |
|
133 |
// Log.warning("Warning: can't annotate. No 'lang' (String) or 'langs' (Map<String, String>) parameter specified in " + parameters); |
|
134 |
// return false; |
|
135 |
// } |
|
136 |
// |
|
137 |
// if (ps != null && ps instanceof Map) { |
|
138 |
// Map<?, ?> map = (Map<?, ?>) ps; |
|
139 |
// String text_id = xmlFile.getName(); |
|
140 |
// if (map.get(text_id) != null) { |
|
141 |
// lang = map.get(text_id).toString().toLowerCase(); |
|
142 |
// if (!canAnnotateLang(lang)) { |
|
143 |
// Log.warning("Warning: can't annotate text_id=${text_id} with $lang, will use the default lang=$p"); |
|
144 |
// return false; |
|
145 |
// } |
|
146 |
// } |
|
147 |
// } |
|
148 |
// |
|
149 |
// if (lang == null && p == null) { |
|
150 |
// System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters)); |
|
151 |
// return false; |
|
152 |
// } |
|
153 |
// else { |
|
154 |
// lang = p.toString(); |
|
155 |
// } |
|
156 |
// |
|
157 |
// if (!canAnnotateLang(lang)) { |
|
158 |
// return false; |
|
159 |
// } |
|
160 |
// |
|
161 |
// File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
|
162 |
// File modelFile = new File(modelsDirectory, lang + ".udpipe"); |
|
163 |
// //System.out.println("model="+modelFile.getAbsolutePath()); |
|
164 |
// |
|
165 |
// try { |
|
166 |
// long time = System.currentTimeMillis(); |
|
167 |
// // get words |
|
168 |
// XMLTXMToUDPipeXMLParser wparser = new XMLTXMToUDPipeXMLParser(xmlFile.toURI().toURL()); |
|
169 |
// if (!wparser.process(null)) { |
|
170 |
// Log.warning("Error while parsing: " + xmlFile); |
|
171 |
// return false; |
|
172 |
// } |
|
173 |
// //System.out.println("words built in: "+(System.currentTimeMillis() - time)); |
|
174 |
// Sentences sentences = wparser.getSentences(); |
|
175 |
// |
|
176 |
// time = System.currentTimeMillis(); |
|
177 |
// // System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences)); |
|
178 |
// // tag |
|
179 |
// UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences); |
|
180 |
// //System.out.println("sentences parsed in: "+(System.currentTimeMillis() - time)); |
|
181 |
// // System.out.println("SENTENCES RESULT: " + UDPipeJavaUtils.toString(sentences)); |
|
182 |
// |
|
183 |
// // update the XML-TXM file |
|
184 |
// time = System.currentTimeMillis(); |
|
185 |
// XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile); |
|
186 |
// HashMap<String, HashMap<String, String>> rules = new HashMap<>(); |
|
187 |
// for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
|
188 |
// Sentence sentence = sentences.get(iSentence); |
|
189 |
// Words words = sentence.getWords(); |
|
190 |
// MultiwordTokens mwt = sentence.getMultiwordTokens(); |
|
191 |
// HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
192 |
// HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
193 |
// for (int i = 0 ; i < mwt.size(); i++) { |
|
194 |
// MultiwordToken mw = mwt.get(i); |
|
195 |
// firstWordToToken.put(mw.getIdFirst(), mw); |
|
196 |
// lastWordToToken.put(mw.getIdLast(), mw); |
|
197 |
// } |
|
198 |
// |
|
199 |
// for (int iWord = 0; iWord < words.size(); iWord++) { |
|
200 |
// Word word = words.get(iWord); |
|
201 |
// String form = word.getForm(); |
|
202 |
// if (Sentence.getRootForm().equals(form)) continue; |
|
203 |
// |
|
204 |
// if (firstWordToToken.containsKey(word.getId())) { |
|
205 |
// MultiwordToken mw = firstWordToToken.get(word.getId()); |
|
206 |
// |
|
207 |
// String misc = mw.getMisc(); |
|
208 |
// int idx = misc.indexOf(XMLIDMISC); |
|
209 |
// int idx2 = misc.indexOf("|", idx + 6); |
|
210 |
// if (idx2 < 0) idx2 = misc.length(); |
|
211 |
// String id = misc.substring(idx + 6, idx2); |
|
212 |
// if (id != null && id.length() > 0) { |
|
213 |
// HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(sentence, mw, "#ud-"); |
|
214 |
// rules.put(id, properties); |
|
215 |
// } |
|
216 |
// } else if (lastWordToToken.containsKey(word.getId())) { |
|
217 |
// // already written |
|
218 |
// } else { |
|
219 |
// |
|
220 |
// String misc = word.getMisc(); |
|
221 |
// int idx = misc.indexOf(XMLIDMISC); |
|
222 |
// int idx2 = misc.indexOf("|", idx + 6); |
|
223 |
// if (idx2 < 0) idx2 = misc.length(); |
|
224 |
// String id = misc.substring(idx + 6, idx2); |
|
225 |
// if (id != null && id.length() > 0) { |
|
226 |
// HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-"); |
|
227 |
// rules.put(id, properties); |
|
228 |
// } |
|
229 |
// } |
|
230 |
// } |
|
231 |
// } |
|
232 |
// File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp"); |
|
233 |
// injector.setProperties(rules); |
|
234 |
// if (injector.process(outFile) && outFile.exists()) { |
|
235 |
// xmlFile.delete(); |
|
236 |
// outFile.renameTo(xmlFile); |
|
237 |
// } |
|
238 |
// else { |
|
239 |
// Log.warning("Error while processing: " + xmlFile); |
|
240 |
// } |
|
241 |
// System.out.println("ud properties injected in: "+(System.currentTimeMillis() - time)); |
|
242 |
// } |
|
243 |
// catch (IOException | XMLStreamException e) { |
|
244 |
// // TODO Auto-generated catch block |
|
245 |
// e.printStackTrace(); |
|
246 |
// } |
|
129 | 247 |
|
130 |
if (p == null && ps == null) { |
|
131 |
Log.warning("Warning: can't annotate. No 'lang' (String) or 'langs' (Map<String, String>) parameter specified in " + parameters); |
|
132 |
return false; |
|
133 |
} |
|
134 |
|
|
135 |
if (ps != null && ps instanceof Map) { |
|
136 |
Map<?, ?> map = (Map<?, ?>) ps; |
|
137 |
String text_id = xmlFile.getName(); |
|
138 |
if (map.get(text_id) != null) { |
|
139 |
lang = map.get(text_id).toString().toLowerCase(); |
|
140 |
if (!canAnnotateLang(lang)) { |
|
141 |
Log.warning("Warning: can't annotate text_id=${text_id} with $lang, will use the default lang=$p"); |
|
142 |
return false; |
|
143 |
} |
|
144 |
} |
|
145 |
} |
|
146 |
|
|
147 |
if (lang == null && p == null) { |
|
148 |
System.out.println(NLS.bind("** Error: no 'lang' parameter given: {0}. Aborting TreeTagger annotation.", parameters)); |
|
149 |
return false; |
|
150 |
} |
|
151 |
else { |
|
152 |
lang = p.toString(); |
|
153 |
} |
|
154 |
|
|
155 |
if (!canAnnotateLang(lang)) { |
|
156 |
return false; |
|
157 |
} |
|
158 |
|
|
159 |
File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
|
160 |
File modelFile = new File(modelsDirectory, lang + ".udpipe"); |
|
161 |
//System.out.println("model="+modelFile.getAbsolutePath()); |
|
162 |
|
|
163 |
try { |
|
164 |
// get words |
|
165 |
XMLTXMToUDPipeXMLParser wparser = new XMLTXMToUDPipeXMLParser(xmlFile.toURI().toURL()); |
|
166 |
if (!wparser.process(null)) { |
|
167 |
Log.warning("Error while parsing: " + xmlFile); |
|
168 |
return false; |
|
169 |
} |
|
170 |
|
|
171 |
Sentences sentences = wparser.getSentences(); |
|
172 |
|
|
173 |
// System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences)); |
|
174 |
// tag |
|
175 |
UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences); |
|
176 |
|
|
177 |
// System.out.println("SENTENCES RESULT: " + UDPipeJavaUtils.toString(sentences)); |
|
178 |
|
|
179 |
// update XML-TXM files |
|
180 |
XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile); |
|
181 |
HashMap<String, HashMap<String, String>> rules = new HashMap<>(); |
|
182 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
|
183 |
Sentence sentence = sentences.get(iSentence); |
|
184 |
Words words = sentence.getWords(); |
|
185 |
for (int iWord = 0; iWord < words.size(); iWord++) { |
|
186 |
Word word = words.get(iWord); |
|
187 |
String form = word.getForm(); |
|
188 |
if ("<root>".equals(form)) continue; |
|
189 |
|
|
190 |
String misc = word.getMisc(); |
|
191 |
int idx = misc.indexOf(XMLIDMISC); |
|
192 |
int idx2 = misc.indexOf("|", idx + 6); |
|
193 |
if (idx2 < 0) idx2 = misc.length(); |
|
194 |
String id = misc.substring(idx + 6, idx2); |
|
195 |
if (id != null && id.length() > 0) { |
|
196 |
HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-"); |
|
197 |
rules.put(id, properties); |
|
198 |
} |
|
199 |
} |
|
200 |
} |
|
201 |
File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp"); |
|
202 |
injector.setProperties(rules); |
|
203 |
if (injector.process(outFile) && outFile.exists()) { |
|
204 |
xmlFile.delete(); |
|
205 |
outFile.renameTo(xmlFile); |
|
206 |
} |
|
207 |
else { |
|
208 |
Log.warning("Error while processing: " + xmlFile); |
|
209 |
} |
|
210 |
} |
|
211 |
catch (IOException | XMLStreamException e) { |
|
212 |
// TODO Auto-generated catch block |
|
213 |
e.printStackTrace(); |
|
214 |
} |
|
215 |
|
|
216 | 248 |
// update xml-txm files |
217 | 249 |
return true; |
218 | 250 |
} |
TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/XMLTXMToUDPipeXMLParser.java (revision 3283) | ||
---|---|---|
173 | 173 |
flagform = false; |
174 | 174 |
form = form.trim(); |
175 | 175 |
form = form.replace("\n", "").replace("&lt;", "<"); |
176 |
Word word = new Word(); |
|
177 |
word.setForm(form); |
|
176 |
|
|
177 |
sentence.addWord(form); |
|
178 |
Word word = sentence.getWords().get((int) (sentence.getWords().size() - 1)); |
|
178 | 179 |
word.setMisc(UDPipeEngine.XMLIDMISC + wordId); |
179 |
sentence.getWords().add(word); |
|
180 | 180 |
} |
181 | 181 |
break; |
182 | 182 |
|
... | ... | |
195 | 195 |
} |
196 | 196 |
} |
197 | 197 |
|
198 |
if (sentence != null && sentence.getWords().size() > 0) {
|
|
198 |
if (sentence != null && sentence.getWords().size() > 1) { // a sentence contains at least <root>
|
|
199 | 199 |
sentences.add(sentence); |
200 | 200 |
} |
201 | 201 |
|
TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeJavaUtils.java (revision 3283) | ||
---|---|---|
10 | 10 |
|
11 | 11 |
import cz.cuni.mff.ufal.udpipe.InputFormat; |
12 | 12 |
import cz.cuni.mff.ufal.udpipe.Model; |
13 |
import cz.cuni.mff.ufal.udpipe.MultiwordToken; |
|
14 |
import cz.cuni.mff.ufal.udpipe.MultiwordTokens; |
|
13 | 15 |
import cz.cuni.mff.ufal.udpipe.OutputFormat; |
14 | 16 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
15 | 17 |
import cz.cuni.mff.ufal.udpipe.Sentences; |
... | ... | |
42 | 44 |
|
43 | 45 |
model.parse(sent, ""); |
44 | 46 |
model.tag(sent, ""); |
45 |
// System.out.println(toString(sent)); |
|
47 |
// System.out.println(toString(sent));
|
|
46 | 48 |
} |
47 | 49 |
|
48 | 50 |
return sentences; |
... | ... | |
60 | 62 |
public static String toString(Sentence sent) { |
61 | 63 |
StringBuilder buffer = new StringBuilder(); |
62 | 64 |
Words words = sent.getWords(); |
63 |
for (int i = 0; i < words.size(); i++) { |
|
65 |
|
|
66 |
MultiwordTokens mwt = sent.getMultiwordTokens(); |
|
67 |
HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
68 |
HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
69 |
for (int i = 0 ; i < mwt.size(); i++) { |
|
70 |
MultiwordToken mw = mwt.get(i); |
|
71 |
firstWordToToken.put(mw.getIdFirst(), mw); |
|
72 |
lastWordToToken.put(mw.getIdLast(), mw); |
|
73 |
} |
|
74 |
|
|
75 |
for (int i = 0 ; i < words.size(); i++) { |
|
76 |
// MultiwordToken mw = mwt.get(i); |
|
77 |
// Word w = words.get(mw.getIdFirst()); |
|
78 |
|
|
64 | 79 |
Word w = words.get(i); |
80 |
|
|
65 | 81 |
if (i > 0) buffer.append(" "); |
66 | 82 |
|
67 |
buffer.append(w.getForm()); |
|
68 |
buffer.append("/" + w.getLemma()); |
|
69 |
buffer.append("/" + w.getUpostag()); |
|
70 |
buffer.append("/" + w.getXpostag()); |
|
71 |
buffer.append("/" + w.getFeats()); |
|
72 |
buffer.append("/" + w.getDeps()); |
|
73 |
buffer.append("/" + w.getDeprel()); |
|
74 |
buffer.append("/" + w.getHead()); |
|
75 |
buffer.append("/" + w.getMisc()); |
|
83 |
if (firstWordToToken.containsKey(w.getId())) { |
|
84 |
MultiwordToken mw = firstWordToToken.get(w.getId()); |
|
85 |
Word w2 = words.get(mw.getIdLast()); |
|
86 |
|
|
87 |
buffer.append(""+w.getId()+"+"+w2.getId()); |
|
88 |
buffer.append("/" + w.getForm()+"+"+w2.getForm()); |
|
89 |
buffer.append("/" + w.getLemma()+"+"+w2.getLemma()); |
|
90 |
buffer.append("/" + w.getUpostag()+"+"+w2.getUpostag()); |
|
91 |
buffer.append("/" + w.getXpostag()+"+"+w2.getXpostag()); |
|
92 |
buffer.append("/" + w.getFeats()+"+"+w2.getFeats()); |
|
93 |
buffer.append("/" + w.getDeps()+"+"+w2.getDeps()); |
|
94 |
buffer.append("/" + w.getDeprel()+"+"+w2.getDeprel()); |
|
95 |
buffer.append("/" + w.getHead()+"+"+w2.getHead()); |
|
96 |
buffer.append("/" + w.getMisc()+"+"+w2.getMisc()); |
|
97 |
} else if (lastWordToToken.containsKey(w.getId())) { |
|
98 |
// already written |
|
99 |
} else { |
|
100 |
|
|
101 |
buffer.append(w.getId()); |
|
102 |
buffer.append("/" + w.getForm()); |
|
103 |
buffer.append("/" + w.getLemma()); |
|
104 |
buffer.append("/" + w.getUpostag()); |
|
105 |
buffer.append("/" + w.getXpostag()); |
|
106 |
buffer.append("/" + w.getFeats()); |
|
107 |
buffer.append("/" + w.getDeps()); |
|
108 |
buffer.append("/" + w.getDeprel()); |
|
109 |
buffer.append("/" + w.getHead()); |
|
110 |
buffer.append("/" + w.getMisc()); |
|
111 |
} |
|
76 | 112 |
} |
77 | 113 |
return buffer.toString(); |
78 | 114 |
} |
... | ... | |
126 | 162 |
} |
127 | 163 |
|
128 | 164 |
public static void main(String[] args) { |
129 |
// try { |
|
130 |
// toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"), |
|
131 |
// "Et un petit test... En deux phrases ? ou trois."); |
|
165 |
// try { |
|
166 |
// toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"), |
|
167 |
// "Et un petit test... En deux phrases ? ou trois."); |
|
168 |
// } |
|
169 |
// catch (UnsupportedEncodingException | FileNotFoundException e) { |
|
170 |
// // TODO Auto-generated catch block |
|
171 |
// e.printStackTrace(); |
|
172 |
// } |
|
173 |
UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe")); |
|
174 |
// System.out.println("VERSION: "+Version.current().toString()); |
|
175 |
// // /usr/lib/UDPipe/models/fr.udpipe |
|
176 |
// // /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe |
|
177 |
// for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe", |
|
178 |
// "/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe", |
|
179 |
// "/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe", |
|
180 |
// "/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) { |
|
181 |
// System.out.println("M="+p); |
|
182 |
// System.out.println(toString(process(p, "Une plaine, des champs cultivés que traverse une grande route."))); |
|
132 | 183 |
// } |
133 |
// catch (UnsupportedEncodingException | FileNotFoundException e) { |
|
134 |
// // TODO Auto-generated catch block |
|
135 |
// e.printStackTrace(); |
|
136 |
// } |
|
137 |
UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe")); |
|
138 |
System.out.println("VERSION: "+Version.current().toString()); |
|
139 |
// /usr/lib/UDPipe/models/fr.udpipe |
|
140 |
// /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe |
|
141 |
for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe", |
|
142 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe", |
|
143 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe", |
|
144 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) { |
|
145 |
System.out.println("M="+p); |
|
146 |
process(p, "Et un petit test... En deux phrases ? ou trois."); |
|
147 |
} |
|
184 |
|
|
185 |
Sentence sentence = new Sentence(); |
|
186 |
sentence.addWord("je"); |
|
187 |
sentence.addWord("suis"); |
|
188 |
sentence.addWord("."); |
|
189 |
System.out.println("SENT="+toString(sentence)); |
|
190 |
|
|
148 | 191 |
} |
149 | 192 |
|
150 | 193 |
public static void processSentences(String modelPath, Sentences sentences) { |
151 |
processSentences(Model.load(modelPath), sentences); |
|
194 |
long time = System.currentTimeMillis(); |
|
195 |
Model m = Model.load(modelPath); |
|
196 |
//System.out.println("Model load in: "+(System.currentTimeMillis()-time)); |
|
197 |
|
|
198 |
time = System.currentTimeMillis(); |
|
199 |
processSentences(m, sentences); |
|
200 |
//System.out.println("sent processed in: "+(System.currentTimeMillis()-time)); |
|
152 | 201 |
} |
153 | 202 |
|
154 | 203 |
public static void processSentences(Model model, Sentences sentences) { |
... | ... | |
156 | 205 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
157 | 206 |
Sentence sent = sentences.get(iSentence); |
158 | 207 |
|
208 |
model.parse(sent, ""); |
|
159 | 209 |
model.tag(sent, ""); |
160 |
model.parse(sent, ""); |
|
210 |
|
|
161 | 211 |
//System.out.println(toString(sent)); |
162 | 212 |
} |
163 | 213 |
} |
... | ... | |
186 | 236 |
|
187 | 237 |
return properties; |
188 | 238 |
} |
239 |
|
|
240 |
/** |
|
241 |
* fill a map with values of "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" |
|
242 |
* |
|
243 |
* @param word |
|
244 |
* @param prefix |
|
245 |
* @return |
|
246 |
*/ |
|
247 |
public static HashMap<String, String> wordToHashMap(Sentence sentence, MultiwordToken mword, String prefix) { |
|
248 |
if (prefix == null) prefix = ""; |
|
249 |
|
|
250 |
Word word = sentence.getWords().get(mword.getIdFirst()); |
|
251 |
Word word2 = sentence.getWords().get(mword.getIdFirst()); |
|
252 |
|
|
253 |
HashMap<String, String> properties = new HashMap<>(); |
|
254 |
properties.put(prefix + "id", Integer.toString(word.getId()) + "+" + Integer.toString(word2.getId())); |
|
255 |
properties.put(prefix + "form", word.getForm() + "+" + word2.getForm()); |
|
256 |
properties.put(prefix + "lemma", word.getLemma() + "+" + word2.getLemma()); |
|
257 |
properties.put(prefix + "upos", word.getUpostag() + "+" + word2.getUpostag()); |
|
258 |
properties.put(prefix + "xpos", word.getXpostag() + "+" + word2.getXpostag()); |
|
259 |
properties.put(prefix + "feats", word.getFeats() + "+" + word2.getFeats()); |
|
260 |
properties.put(prefix + "head", Integer.toString(word.getHead()) + "+" + Integer.toString(word2.getId())); |
|
261 |
properties.put(prefix + "deprel", word.getDeprel() + "+" + word2.getDeprel()); |
|
262 |
properties.put(prefix + "deps", word.getDeps() + "+" + word2.getDeps()); |
|
263 |
properties.put(prefix + "misc", word.getMisc() + "+" + word2.getMisc()); |
|
264 |
|
|
265 |
return properties; |
|
266 |
} |
|
189 | 267 |
} |
TXM/branches/eltec/org.txm.udpipe.core/src/org/txm/udpipe/core/UDStringTokenizer.java (revision 3283) | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.File; |
4 | 4 |
import java.util.ArrayList; |
5 |
import java.util.Arrays; |
|
6 |
import java.util.HashMap; |
|
5 | 7 |
import java.util.List; |
6 | 8 |
|
7 | 9 |
import org.txm.tokenizer.StringTokenizer; |
8 | 10 |
|
9 | 11 |
import cz.cuni.mff.ufal.udpipe.InputFormat; |
10 | 12 |
import cz.cuni.mff.ufal.udpipe.Model; |
13 |
import cz.cuni.mff.ufal.udpipe.MultiwordToken; |
|
14 |
import cz.cuni.mff.ufal.udpipe.MultiwordTokens; |
|
11 | 15 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
16 |
import cz.cuni.mff.ufal.udpipe.Word; |
|
12 | 17 |
import cz.cuni.mff.ufal.udpipe.Words; |
13 | 18 |
|
14 | 19 |
|
... | ... | |
29 | 34 |
tokenizer = model.newTokenizer(""); |
30 | 35 |
} |
31 | 36 |
|
37 |
public final static List<String> ADDITIONAL_PROPERTIES = Arrays.asList("ud-id", "ud-form", "ud-lemma", "ud-upos", "ud-xpos", "ud-feats", "ud-head", "ud-deprel", "ud-deps", "ud-misc"); |
|
38 |
public List<String> getAdditionalProperties() { |
|
39 |
return ADDITIONAL_PROPERTIES; |
|
40 |
} |
|
41 |
|
|
32 | 42 |
@Override |
33 |
public List<List<String>> processText(String text) {
|
|
34 |
ArrayList<List<String>> result = new ArrayList<>();
|
|
43 |
public ArrayList<ArrayList<ArrayList<String>>> processText(String text) {
|
|
44 |
ArrayList<ArrayList<ArrayList<String>>> result = new ArrayList<>();
|
|
35 | 45 |
|
36 |
|
|
37 | 46 |
tokenizer.setText(text); |
38 | 47 |
Sentence sent = new Sentence(); |
48 |
|
|
49 |
|
|
50 |
|
|
39 | 51 |
while (tokenizer.nextSentence(sent)) { |
40 |
List<String> sresult = new ArrayList<>(); |
|
52 |
|
|
53 |
model.parse(sent, ""); |
|
54 |
model.tag(sent, ""); |
|
55 |
|
|
56 |
ArrayList<ArrayList<String>> sresult = new ArrayList<>(); |
|
41 | 57 |
Words words = sent.getWords(); |
58 |
|
|
59 |
MultiwordTokens mwt = sent.getMultiwordTokens(); |
|
60 |
HashMap<Integer, MultiwordToken> firstWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
61 |
HashMap<Integer, MultiwordToken> lastWordToToken = new HashMap<Integer, MultiwordToken>(); |
|
62 |
for (int i = 0 ; i < mwt.size(); i++) { |
|
63 |
MultiwordToken mw = mwt.get(i); |
|
64 |
firstWordToToken.put(mw.getIdFirst(), mw); |
|
65 |
lastWordToToken.put(mw.getIdLast(), mw); |
|
66 |
} |
|
67 |
|
|
68 |
// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" }; |
|
42 | 69 |
for (int iWord = 0; iWord < words.size(); iWord++) { |
43 |
sresult.add(words.get(iWord).getForm()); |
|
70 |
Word word = words.get(iWord); |
|
71 |
ArrayList<String> properties = new ArrayList<String>(); |
|
72 |
|
|
73 |
if (firstWordToToken.containsKey(word.getId())) { |
|
74 |
MultiwordToken mw = firstWordToToken.get(word.getId()); |
|
75 |
Word word2 = words.get(mw.getIdLast()); |
|
76 |
|
|
77 |
properties.add(mw.getForm()); |
|
78 |
properties.add(""+word.getId()); |
|
79 |
properties.add(word.getForm() + "+" + word2.getForm()); |
|
80 |
properties.add(word.getLemma() + "+" + word2.getLemma()); |
|
81 |
properties.add(word.getUpostag() + "+" + word2.getUpostag()); |
|
82 |
properties.add(word.getXpostag() + "+" + word2.getXpostag()); |
|
83 |
properties.add(word.getFeats() + "+" + word2.getFeats()); |
|
84 |
properties.add(Integer.toString(word.getHead()) + "+" + word2.getHead()); |
|
85 |
properties.add(word.getDeprel() + "+" + word2.getDeprel()); |
|
86 |
properties.add(word.getDeps() + "+" + word2.getDeps()); |
|
87 |
properties.add(mw.getMisc() + "+" + word2.getMisc()); |
|
88 |
|
|
89 |
sresult.add(properties); |
|
90 |
} else if (lastWordToToken.containsKey(word.getId())) { |
|
91 |
// already written |
|
92 |
} else { |
|
93 |
properties.add(word.getForm()); |
|
94 |
properties.add(""+word.getId()); |
|
95 |
properties.add(word.getForm()); |
|
96 |
properties.add(word.getLemma()); |
|
97 |
properties.add(word.getUpostag()); |
|
98 |
properties.add(word.getXpostag()); |
|
99 |
properties.add(word.getFeats()); |
|
100 |
properties.add(Integer.toString(word.getHead())); |
|
101 |
properties.add(word.getDeprel()); |
|
102 |
properties.add(word.getDeps()); |
|
103 |
properties.add(word.getMisc()); |
|
104 |
|
|
105 |
sresult.add(properties); |
|
106 |
} |
|
44 | 107 |
} |
45 | 108 |
if (sresult.size() > 0) { |
46 | 109 |
sresult.remove(0); // remove the <root> element |
... | ... | |
59 | 122 |
return true; |
60 | 123 |
} |
61 | 124 |
} |
125 |
|
TXM/branches/eltec/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3283) | ||
---|---|---|
270 | 270 |
println "Result : "+tokenizer.processText(text).collect{"<"+it+">"} |
271 | 271 |
} |
272 | 272 |
} |
273 |
|
|
274 |
/**
 * Declares no additional properties.
 *
 * @return an empty list
 */
@Override
public List<String> getAdditionalProperties() {
    List<String> noProperties = Arrays.asList();
    return noProperties;
}
|
273 | 278 |
} |
TXM/branches/eltec/org.txm.tokenizer.core/src/org/txm/tokenizer/StringTokenizer.java (revision 3283) | ||
---|---|---|
1 | 1 |
package org.txm.tokenizer; |
2 | 2 |
|
3 |
import java.util.ArrayList; |
|
3 | 4 |
import java.util.List; |
4 | 5 |
|
5 | 6 |
public interface StringTokenizer { |
6 | 7 |
|
7 |
List<List<String>> processText(String text);
|
|
8 |
ArrayList<ArrayList<ArrayList<String>>> processText(String text);
|
|
8 | 9 |
|
9 | 10 |
boolean doSentences(); |
11 |
|
|
12 |
List<String> getAdditionalProperties(); |
|
10 | 13 |
} |
TXM/branches/eltec/org.txm.internalview.rcp/src/org/txm/internalview/rcp/editors/InternalViewEditor.java (revision 3283) | ||
---|---|---|
9 | 9 |
import org.eclipse.jface.viewers.ISelectionChangedListener; |
10 | 10 |
import org.eclipse.jface.viewers.IStructuredContentProvider; |
11 | 11 |
import org.eclipse.jface.viewers.SelectionChangedEvent; |
12 |
import org.eclipse.jface.viewers.StructuredSelection; |
|
12 | 13 |
import org.eclipse.jface.viewers.TableViewer; |
13 | 14 |
import org.eclipse.jface.viewers.TableViewerColumn; |
14 | 15 |
import org.eclipse.jface.viewers.Viewer; |
... | ... | |
37 | 38 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
38 | 39 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
39 | 40 |
import org.txm.searchengine.cqp.corpus.Property; |
41 |
import org.txm.searchengine.cqp.corpus.StructuralUnit; |
|
40 | 42 |
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty; |
41 | 43 |
import org.txm.searchengine.cqp.corpus.WordProperty; |
42 | 44 |
import org.txm.searchengine.cqp.corpus.query.Match; |
... | ... | |
114 | 116 |
structComboLabel.setText(InternalViewUIMessages.structure); |
115 | 117 |
structComboLabel.setLayoutData(new GridData(GridData.CENTER, GridData.CENTER, false, true)); |
116 | 118 |
this.structuralUnitsComboViewer = new StructuralUnitsComboViewer(parametersArea, this, true); |
119 |
|
|
117 | 120 |
// Listener |
118 | 121 |
this.structuralUnitsComboViewer.addSelectionChangedListener(new ISelectionChangedListener() { |
119 | 122 |
|
... | ... | |
124 | 127 |
} |
125 | 128 |
}); |
126 | 129 |
|
130 |
|
|
127 | 131 |
// Word properties selector |
128 | 132 |
propertiesSelector = new PropertiesSelector<>(parametersArea); |
129 | 133 |
propertiesSelector.setLayoutData(new GridData(GridData.CENTER, GridData.CENTER, false, true)); |
TXM/branches/eltec/org.txm.core/src/java/org/txm/xml/StaxDomConstructor.java (revision 3283) | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.Reader; |
4 | 4 |
import java.io.StringReader; |
5 |
import java.util.Arrays; |
|
5 | 6 |
|
6 | 7 |
import javax.xml.parsers.ParserConfigurationException; |
7 | 8 |
import javax.xml.stream.XMLInputFactory; |
... | ... | |
77 | 78 |
elements++; |
78 | 79 |
break; |
79 | 80 |
case XMLStreamConstants.CHARACTERS: |
81 |
//char[] tmp = Arrays.copyOfRange(parser.getTextCharacters(), parser.getTextStart(), parser.getTextStart()+parser.getTextLength()); |
|
82 |
//System.out.println("T='"+Arrays.toString(tmp)+"'"); |
|
80 | 83 |
Text textNode = doc.createTextNode(parser.getText()); |
81 | 84 |
currentElement.appendChild(textNode); |
82 | 85 |
break; |
TXM/branches/eltec/org.txm.core/src/java/org/txm/xml/DOMIdentityHook.java (revision 3283) | ||
---|---|---|
73 | 73 |
parentParser.writer.writeCharacters("\n"); |
74 | 74 |
Element e = (Element) node; |
75 | 75 |
NodeList children = e.getChildNodes(); |
76 |
String ns = e.getNamespaceURI(); |
|
77 |
String ln = e.getLocalName(); |
|
76 | 78 |
if (children.getLength() > 0) { |
77 |
String ns = e.getNamespaceURI(); |
|
78 |
String ln = e.getLocalName(); |
|
79 | 79 |
if (ns == null) { |
80 | 80 |
parentParser.writer.writeStartElement(ln); |
81 | 81 |
} else { |
... | ... | |
83 | 83 |
} |
84 | 84 |
} |
85 | 85 |
else { |
86 |
parentParser.writer.writeEmptyElement(e.getNamespaceURI(), e.getLocalName()); |
|
86 |
if (ns == null) { |
|
87 |
parentParser.writer.writeEmptyElement(ln); |
|
88 |
} else { |
|
89 |
parentParser.writer.writeEmptyElement(ns, ln); |
|
90 |
} |
|
87 | 91 |
} |
88 | 92 |
|
89 | 93 |
for (int i = 0; i < e.getAttributes().getLength(); i++) { |
... | ... | |
102 | 106 |
} |
103 | 107 |
if (children.getLength() > 0) { |
104 | 108 |
parentParser.writer.writeEndElement(); |
105 |
parentParser.writer.writeCharacters("\n"); |
|
109 |
//parentParser.writer.writeCharacters("\n");
|
|
106 | 110 |
} |
107 | 111 |
} |
108 | 112 |
else if (node.getNodeType() == Node.TEXT_NODE) { |
109 | 113 |
parentParser.writer.writeCharacters(node.getTextContent()); |
114 |
} else if (node.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) { |
|
115 |
parentParser.writer.writeProcessingInstruction(node.getNodeName(), node.getNodeValue()); |
|
116 |
} else if (node.getNodeType() == Node.CDATA_SECTION_NODE) { |
|
117 |
parentParser.writer.writeCData(node.getNodeValue()); |
|
118 |
} else if (node.getNodeType() == Node.COMMENT_NODE) { |
|
119 |
parentParser.writer.writeComment(node.getNodeValue()); |
|
110 | 120 |
} |
111 | 121 |
} |
112 | 122 |
|
TXM/branches/eltec/org.txm.internalview.core/src/org/txm/internalview/core/functions/InternalView.java (revision 3283) | ||
---|---|---|
3 | 3 |
import java.io.File; |
4 | 4 |
import java.io.PrintWriter; |
5 | 5 |
import java.util.ArrayList; |
6 |
import java.util.Arrays; |
|
6 | 7 |
import java.util.Collection; |
7 | 8 |
import java.util.HashMap; |
8 | 9 |
import java.util.LinkedHashMap; |
... | ... | |
91 | 92 |
public boolean loadParameters() throws Exception { |
92 | 93 |
try { |
93 | 94 |
String str = this.getStringParameterValue(TXMPreferences.STRUCTURAL_UNIT); |
94 |
this.pStructuralUnit = this.getCorpus().getStructuralUnit(str); |
|
95 |
if (str.length() == 0) { |
|
96 |
try { |
|
97 |
StructuralUnit struct = this.getCorpus().getStructuralUnit("text"); |
|
98 |
if (struct != null) { |
|
99 |
this.pStructuralUnit = struct; |
|
100 |
} |
|
101 |
struct = this.getCorpus().getStructuralUnit("div"); |
|
102 |
if (struct != null) { |
|
103 |
this.pStructuralUnit = struct; |
|
104 |
} |
|
105 |
struct = this.getCorpus().getStructuralUnit("p"); |
|
106 |
if (struct != null) { |
|
107 |
this.pStructuralUnit = struct; |
|
108 |
} |
|
109 |
} |
|
110 |
catch (CqiClientException e1) { |
|
111 |
// TODO Auto-generated catch block |
|
112 |
e1.printStackTrace(); |
|
113 |
} |
|
114 |
} else { |
|
115 |
this.pStructuralUnit = this.getCorpus().getStructuralUnit(str); |
|
116 |
} |
|
117 |
|
|
95 | 118 |
} |
96 | 119 |
catch (Exception e) { |
97 | 120 |
Log.printStackTrace(e); |
... | ... | |
105 | 128 |
} |
106 | 129 |
try { |
107 | 130 |
String str = this.getStringParameterValue(TXMPreferences.STRUCTURAL_UNIT_PROPERTIES); |
108 |
this.pStructuralUnitsProperties = StructuralUnitProperty.stringToProperties(this.getCorpus(), str); |
|
131 |
if (str.length() == 0) { |
|
132 |
StructuralUnitProperty structP = pStructuralUnit.getProperty("id"); |
|
133 |
if (structP != null) { |
|
134 |
this.pStructuralUnitsProperties = Arrays.asList(structP); |
|
135 |
} |
|
136 |
structP = pStructuralUnit.getProperty("n"); |
|
137 |
if (structP != null) { |
|
138 |
this.pStructuralUnitsProperties = Arrays.asList(structP); |
|
139 |
} |
|
140 |
} else { |
|
141 |
this.pStructuralUnitsProperties = StructuralUnitProperty.stringToProperties(this.getCorpus(), str); |
|
142 |
} |
|
109 | 143 |
} |
110 | 144 |
catch (Exception e3) { |
111 | 145 |
Log.printStackTrace(e3); |
... | ... | |
365 | 399 |
this.setDirty(); |
366 | 400 |
} |
367 | 401 |
|
368 |
|
|
369 |
|
|
370 |
|
|
371 | 402 |
@Override |
372 | 403 |
public boolean setParameters(TXMParameters parameters) { |
373 | 404 |
System.err.println("InternalView.setParameters(): not yet implemented."); |
TXM/branches/eltec/org.txm.internalview.core/src/org/txm/internalview/core/preferences/InternalViewPreferences.java (revision 3283) | ||
---|---|---|
37 | 37 |
super.initializeDefaultPreferences(); |
38 | 38 |
Preferences preferences = this.getDefaultPreferencesNode(); |
39 | 39 |
|
40 |
preferences.put(STRUCTURAL_UNIT, TBXPreferences.DEFAULT_STRUCTURAL_UNIT);
|
|
40 |
preferences.put(STRUCTURAL_UNIT, "");
|
|
41 | 41 |
preferences.put(UNIT_PROPERTIES, TBXPreferences.DEFAULT_UNIT_PROPERTY); |
42 |
preferences.put(STRUCTURAL_UNIT_PROPERTIES, TBXPreferences.DEFAULT_STRUCTURAL_UNIT_PROPERTY);
|
|
42 |
preferences.put(STRUCTURAL_UNIT_PROPERTIES, "");
|
|
43 | 43 |
preferences.putInt(CURRENT_PAGE, 0); |
44 | 44 |
} |
45 | 45 |
} |
Formats disponibles : Unified diff