Révision 3264
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/StringTokenizer.java (revision 3264) | ||
---|---|---|
4 | 4 |
|
5 | 5 |
public interface StringTokenizer { |
6 | 6 |
|
7 |
List<String> processText(String text); |
|
7 |
List<List<String>> processText(String text); |
|
8 |
|
|
9 |
boolean doSentences(); |
|
8 | 10 |
} |
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3264) | ||
---|---|---|
77 | 77 |
this(new TokenizerClasses(lang)) |
78 | 78 |
} |
79 | 79 |
|
80 |
public boolean doSentences() { |
|
81 |
return false; |
|
82 |
} |
|
83 |
|
|
80 | 84 |
/** |
81 | 85 |
* Instantiates a new simple string tokenizer. |
82 | 86 |
* |
... | ... | |
133 | 137 |
/** |
134 | 138 |
* Process word. |
135 | 139 |
*/ |
136 |
public ArrayList<String> processText(String text) { |
|
137 |
ArrayList<String> result = new ArrayList<String>(); |
|
140 |
public ArrayList<ArrayList<String>> processText(String text) { |
|
141 |
ArrayList<ArrayList<String>> result = new ArrayList<String>() |
|
142 |
ArrayList<String> sresult = new ArrayList<String>() |
|
138 | 143 |
if (regSplitWhiteSpaces != null) { |
139 | 144 |
for (String s : regSplitWhiteSpaces.split(text)) { // separate with unicode white spaces |
140 | 145 |
// if (DEBUG){println "process $s"} |
141 |
result.addAll(iterate(s)); |
|
146 |
sresult.addAll(iterate(s));
|
|
142 | 147 |
} |
143 | 148 |
} |
144 | 149 |
else { |
145 |
result.addAll(iterate(text)); |
|
150 |
sresult.addAll(iterate(text));
|
|
146 | 151 |
} |
152 |
result.add(sresult) |
|
147 | 153 |
return result; |
148 | 154 |
} |
149 | 155 |
|
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/XMLTXMToUDPipeXMLParser.java (revision 3264) | ||
---|---|---|
72 | 72 |
private Sentences sentences; |
73 | 73 |
|
74 | 74 |
private Sentence sentence; |
75 |
|
|
76 |
private int nSentenceTagFound = 0; |
|
75 | 77 |
|
76 | 78 |
/** |
77 | 79 |
* Instantiates a new builds the tt src. |
... | ... | |
112 | 114 |
String form = ""; // the content of the form tag |
113 | 115 |
String lastopenlocalname = ""; |
114 | 116 |
String localname = ""; |
117 |
boolean inS = false; |
|
115 | 118 |
try { |
116 | 119 |
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
117 | 120 |
switch (event) { |
121 |
case XMLStreamConstants.PROCESSING_INSTRUCTION: |
|
122 |
//System.out.println("PI target="+parser.getPITarget()+" data="+parser.getPIData()); |
|
123 |
if (!inS && "txm".equals(parser.getPITarget()) && "</s>".equals(parser.getPIData())) { |
|
124 |
if (sentence != null && sentence.getWords().size() > 0) { |
|
125 |
sentences.add(sentence); |
|
126 |
} |
|
127 |
sentence = new Sentence(); |
|
128 |
nSentenceTagFound ++; |
|
129 |
} |
|
130 |
break; |
|
118 | 131 |
case XMLStreamConstants.START_ELEMENT: |
119 | 132 |
localname = parser.getLocalName(); |
120 | 133 |
|
... | ... | |
144 | 157 |
sentences.add(sentence); |
145 | 158 |
} |
146 | 159 |
sentence = new Sentence(); |
160 |
nSentenceTagFound ++; |
|
161 |
inS = true; |
|
147 | 162 |
break; |
148 | 163 |
} |
149 | 164 |
break; |
... | ... | |
166 | 181 |
break; |
167 | 182 |
|
168 | 183 |
case "s": |
184 |
inS = false; |
|
169 | 185 |
break; |
170 | 186 |
} |
171 | 187 |
break; |
... | ... | |
194 | 210 |
return true; |
195 | 211 |
} |
196 | 212 |
|
213 |
public int getNSentenceTagFound() { |
|
214 |
return nSentenceTagFound; |
|
215 |
} |
|
216 |
|
|
197 | 217 |
/** |
198 | 218 |
* The main method. |
199 | 219 |
* |
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeJavaUtils.java (revision 3264) | ||
---|---|---|
13 | 13 |
import cz.cuni.mff.ufal.udpipe.OutputFormat; |
14 | 14 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
15 | 15 |
import cz.cuni.mff.ufal.udpipe.Sentences; |
16 |
import cz.cuni.mff.ufal.udpipe.Version; |
|
16 | 17 |
import cz.cuni.mff.ufal.udpipe.Word; |
17 | 18 |
import cz.cuni.mff.ufal.udpipe.Words; |
18 | 19 |
|
... | ... | |
38 | 39 |
|
39 | 40 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
40 | 41 |
Sentence sent = sentences.get(iSentence); |
42 |
|
|
43 |
model.parse(sent, ""); |
|
41 | 44 |
model.tag(sent, ""); |
42 |
model.parse(sent, "");
|
|
45 |
// System.out.println(toString(sent));
|
|
43 | 46 |
} |
44 | 47 |
|
45 | 48 |
return sentences; |
... | ... | |
123 | 126 |
} |
124 | 127 |
|
125 | 128 |
public static void main(String[] args) { |
126 |
try { |
|
127 |
toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"), |
|
128 |
"Et un petit test... En deux phrases ? ou trois."); |
|
129 |
// try { |
|
130 |
// toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"), |
|
131 |
// "Et un petit test... En deux phrases ? ou trois."); |
|
132 |
// } |
|
133 |
// catch (UnsupportedEncodingException | FileNotFoundException e) { |
|
134 |
// // TODO Auto-generated catch block |
|
135 |
// e.printStackTrace(); |
|
136 |
// } |
|
137 |
UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe")); |
|
138 |
System.out.println("VERSION: "+Version.current().toString()); |
|
139 |
// /usr/lib/UDPipe/models/fr.udpipe |
|
140 |
// /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe |
|
141 |
for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe", |
|
142 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe", |
|
143 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe", |
|
144 |
"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) { |
|
145 |
System.out.println("M="+p); |
|
146 |
process(p, "Et un petit test... En deux phrases ? ou trois."); |
|
129 | 147 |
} |
130 |
catch (UnsupportedEncodingException | FileNotFoundException e) { |
|
131 |
// TODO Auto-generated catch block |
|
132 |
e.printStackTrace(); |
|
133 |
} |
|
134 | 148 |
} |
135 | 149 |
|
136 | 150 |
public static void processSentences(String modelPath, Sentences sentences) { |
... | ... | |
138 | 152 |
} |
139 | 153 |
|
140 | 154 |
public static void processSentences(Model model, Sentences sentences) { |
141 |
System.out.println("Model: " + model);
|
|
155 |
//System.out.println("Processing sent ("+sentences.size()+") with model: " + model);
|
|
142 | 156 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
143 | 157 |
Sentence sent = sentences.get(iSentence); |
144 |
System.out.println(sent.getWords().size()); |
|
145 |
// model.tag(sent, ""); |
|
158 |
|
|
159 |
model.tag(sent, ""); |
|
160 |
model.parse(sent, ""); |
|
161 |
//System.out.println(toString(sent)); |
|
146 | 162 |
} |
147 | 163 |
} |
148 | 164 |
|
... | ... | |
155 | 171 |
*/ |
156 | 172 |
public static HashMap<String, String> wordToHashMap(Word word, String prefix) { |
157 | 173 |
if (prefix == null) prefix = ""; |
174 |
|
|
158 | 175 |
HashMap<String, String> properties = new HashMap<>(); |
159 | 176 |
properties.put(prefix + "id", Integer.toString(word.getId())); |
160 | 177 |
properties.put(prefix + "form", word.getForm()); |
... | ... | |
166 | 183 |
properties.put(prefix + "deprel", word.getDeprel()); |
167 | 184 |
properties.put(prefix + "deps", word.getDeps()); |
168 | 185 |
properties.put(prefix + "misc", word.getMisc()); |
169 |
return null; |
|
186 |
|
|
187 |
return properties; |
|
170 | 188 |
} |
171 | 189 |
} |
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/UDStringTokenizer.java (revision 3264) | ||
---|---|---|
18 | 18 |
|
19 | 19 |
protected InputFormat tokenizer; |
20 | 20 |
|
21 |
|
|
21 | 22 |
public UDStringTokenizer(String lang) { |
22 | 23 |
File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
23 | 24 |
File modelFile = new File(modelsDirectory, lang + ".udpipe"); |
... | ... | |
29 | 30 |
} |
30 | 31 |
|
31 | 32 |
@Override |
32 |
public List<String> processText(String text) {
|
|
33 |
ArrayList<String> result = new ArrayList<>();
|
|
33 |
public List<List<String>> processText(String text) {
|
|
34 |
ArrayList<List<String>> result = new ArrayList<>();
|
|
34 | 35 |
|
36 |
|
|
35 | 37 |
tokenizer.setText(text); |
36 | 38 |
Sentence sent = new Sentence(); |
37 | 39 |
while (tokenizer.nextSentence(sent)) { |
40 |
List<String> sresult = new ArrayList<>(); |
|
38 | 41 |
Words words = sent.getWords(); |
39 | 42 |
for (int iWord = 0; iWord < words.size(); iWord++) { |
40 |
result.add(words.get(iWord).getForm()); |
|
43 |
sresult.add(words.get(iWord).getForm());
|
|
41 | 44 |
} |
45 |
if (sresult.size() > 0) { |
|
46 |
sresult.remove(0); // remove the <root> element |
|
47 |
} |
|
48 |
if (sresult.size() > 0) { |
|
49 |
result.add(sresult); |
|
50 |
} |
|
42 | 51 |
} |
43 |
if (result.size() > 0) { |
|
44 |
result.remove(0); // remove the <root> element |
|
45 |
} |
|
52 |
|
|
46 | 53 |
return result; |
47 | 54 |
} |
55 |
|
|
56 |
@Override |
|
57 |
public boolean doSentences() { |
|
58 |
|
|
59 |
return true; |
|
60 |
} |
|
48 | 61 |
} |
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeEngine.java (revision 3264) | ||
---|---|---|
20 | 20 |
import org.txm.utils.logger.Log; |
21 | 21 |
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection; |
22 | 22 |
|
23 |
import cz.cuni.mff.ufal.udpipe.InputFormat; |
|
24 |
import cz.cuni.mff.ufal.udpipe.Model; |
|
23 | 25 |
import cz.cuni.mff.ufal.udpipe.Sentence; |
24 | 26 |
import cz.cuni.mff.ufal.udpipe.Sentences; |
25 | 27 |
import cz.cuni.mff.ufal.udpipe.Version; |
... | ... | |
45 | 47 |
Bundle bundle = BundleUtils.getBundle("org.txm.libs.udpipe"); |
46 | 48 |
File bundleDir = BundleUtils.getBundleFile("org.txm.libs.udpipe"); |
47 | 49 |
if (bundleDir.isDirectory()) { |
48 |
if (OSDetector.isFamilyWindows()) { |
|
49 |
libFile = new File(bundleDir, "lib/libudpipe_java.dll"); |
|
50 |
} |
|
51 |
else if (OSDetector.isFamilyMac()) { |
|
52 |
libFile = new File(bundleDir, "lib/libudpipe_java.dylib"); |
|
53 |
} |
|
54 |
else if (OSDetector.isFamilyUnix()) { |
|
55 |
libFile = new File(bundleDir, "lib/libudpipe_java.so"); |
|
56 |
} |
|
57 |
udpipe_java.setLibraryPath(libFile.getAbsolutePath()); |
|
50 |
libFile = initializeUDLib(bundleDir); |
|
58 | 51 |
} |
59 | 52 |
else { |
60 | 53 |
URL entry = bundle.getEntry("lib/libudpipe_java.so"); |
... | ... | |
64 | 57 |
return getDetails() != null; |
65 | 58 |
} |
66 | 59 |
|
60 |
public static File initializeUDLib(File bundleDir) { |
|
61 |
File libFile; |
|
62 |
if (OSDetector.isFamilyWindows()) { |
|
63 |
libFile = new File(bundleDir, "lib/libudpipe_java.dll"); |
|
64 |
} |
|
65 |
else if (OSDetector.isFamilyMac()) { |
|
66 |
libFile = new File(bundleDir, "lib/libudpipe_java.dylib"); |
|
67 |
} |
|
68 |
else { |
|
69 |
libFile = new File(bundleDir, "lib/libudpipe_java.so"); |
|
70 |
} |
|
71 |
udpipe_java.setLibraryPath(libFile.getAbsolutePath()); |
|
72 |
return libFile; |
|
73 |
} |
|
74 |
|
|
67 | 75 |
@Override |
68 | 76 |
public StringTokenizer getStringTokenizer(String lang) throws Exception { |
69 | 77 |
return new UDStringTokenizer(lang); |
... | ... | |
150 | 158 |
|
151 | 159 |
File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox |
152 | 160 |
File modelFile = new File(modelsDirectory, lang + ".udpipe"); |
161 |
//System.out.println("model="+modelFile.getAbsolutePath()); |
|
153 | 162 |
|
154 | 163 |
try { |
155 | 164 |
// get words |
... | ... | |
160 | 169 |
} |
161 | 170 |
|
162 | 171 |
Sentences sentences = wparser.getSentences(); |
172 |
|
|
163 | 173 |
// System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences)); |
164 | 174 |
// tag |
165 | 175 |
UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences); |
... | ... | |
168 | 178 |
|
169 | 179 |
// update XML-TXM files |
170 | 180 |
XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile); |
181 |
HashMap<String, HashMap<String, String>> rules = new HashMap<>(); |
|
171 | 182 |
for (int iSentence = 0; iSentence < sentences.size(); iSentence++) { |
172 | 183 |
Sentence sentence = sentences.get(iSentence); |
173 | 184 |
Words words = sentence.getWords(); |
... | ... | |
181 | 192 |
int idx2 = misc.indexOf("|", idx + 6); |
182 | 193 |
if (idx2 < 0) idx2 = misc.length(); |
183 | 194 |
String id = misc.substring(idx + 6, idx2); |
184 |
HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "ud-"); |
|
185 |
injector.addProperty(id, properties); |
|
195 |
if (id != null && id.length() > 0) { |
|
196 |
HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-"); |
|
197 |
rules.put(id, properties); |
|
198 |
} |
|
186 | 199 |
} |
187 | 200 |
} |
188 | 201 |
File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp"); |
202 |
injector.setProperties(rules); |
|
189 | 203 |
if (injector.process(outFile) && outFile.exists()) { |
190 | 204 |
xmlFile.delete(); |
191 | 205 |
outFile.renameTo(xmlFile); |
Formats disponibles : Unified diff