Révision 460
tmp/org.txm.treetagger.rcp/.project (revision 460) | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<projectDescription> |
|
3 |
<name>TreeTagger</name> |
|
4 |
<comment></comment> |
|
5 |
<projects> |
|
6 |
</projects> |
|
7 |
<buildSpec> |
|
8 |
<buildCommand> |
|
9 |
<name>org.eclipse.jdt.core.javabuilder</name> |
|
10 |
<arguments> |
|
11 |
</arguments> |
|
12 |
</buildCommand> |
|
13 |
<buildCommand> |
|
14 |
<name>org.eclipse.pde.ManifestBuilder</name> |
|
15 |
<arguments> |
|
16 |
</arguments> |
|
17 |
</buildCommand> |
|
18 |
<buildCommand> |
|
19 |
<name>org.eclipse.pde.SchemaBuilder</name> |
|
20 |
<arguments> |
|
21 |
</arguments> |
|
22 |
</buildCommand> |
|
23 |
</buildSpec> |
|
24 |
<natures> |
|
25 |
<nature>org.eclipse.pde.PluginNature</nature> |
|
26 |
<nature>org.eclipse.jdt.core.javanature</nature> |
|
27 |
</natures> |
|
28 |
</projectDescription> |
|
0 | 29 |
tmp/org.txm.treetagger.rcp/src/treetagger/Activator.java (revision 460) | ||
---|---|---|
1 |
package treetagger; |
|
2 |
|
|
3 |
import org.eclipse.jface.resource.ImageDescriptor; |
|
4 |
import org.eclipse.ui.plugin.AbstractUIPlugin; |
|
5 |
import org.osgi.framework.BundleContext; |
|
6 |
|
|
7 |
/** |
|
8 |
* The activator class controls the plug-in life cycle |
|
9 |
*/ |
|
10 |
public class Activator extends AbstractUIPlugin { |
|
11 |
|
|
12 |
// The plug-in ID |
|
13 |
public static final String PLUGIN_ID = "TreeTagger"; //$NON-NLS-1$ |
|
14 |
|
|
15 |
// The shared instance |
|
16 |
private static Activator plugin; |
|
17 |
|
|
18 |
/** |
|
19 |
* The constructor |
|
20 |
*/ |
|
21 |
public Activator() { |
|
22 |
} |
|
23 |
|
|
24 |
/* |
|
25 |
* (non-Javadoc) |
|
26 |
* @see org.eclipse.ui.plugin.AbstractUIPlugin#start(org.osgi.framework.BundleContext) |
|
27 |
*/ |
|
28 |
public void start(BundleContext context) throws Exception { |
|
29 |
super.start(context); |
|
30 |
plugin = this; |
|
31 |
} |
|
32 |
|
|
33 |
/* |
|
34 |
* (non-Javadoc) |
|
35 |
* @see org.eclipse.ui.plugin.AbstractUIPlugin#stop(org.osgi.framework.BundleContext) |
|
36 |
*/ |
|
37 |
public void stop(BundleContext context) throws Exception { |
|
38 |
plugin = null; |
|
39 |
super.stop(context); |
|
40 |
} |
|
41 |
|
|
42 |
/** |
|
43 |
* Returns the shared instance |
|
44 |
* |
|
45 |
* @return the shared instance |
|
46 |
*/ |
|
47 |
public static Activator getDefault() { |
|
48 |
return plugin; |
|
49 |
} |
|
50 |
|
|
51 |
/** |
|
52 |
* Returns an image descriptor for the image file at the given |
|
53 |
* plug-in relative path |
|
54 |
* |
|
55 |
* @param path the path |
|
56 |
* @return the image descriptor |
|
57 |
*/ |
|
58 |
public static ImageDescriptor getImageDescriptor(String path) { |
|
59 |
return imageDescriptorFromPlugin(PLUGIN_ID, path); |
|
60 |
} |
|
61 |
} |
|
0 | 62 |
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/Train.java (revision 460) | ||
---|---|---|
1 |
package org.txm.treetagger.commands; |
|
2 |
|
|
3 |
import java.io.BufferedOutputStream; |
|
4 |
import java.io.BufferedReader; |
|
5 |
import java.io.BufferedWriter; |
|
6 |
import java.io.File; |
|
7 |
import java.io.FileOutputStream; |
|
8 |
import java.io.OutputStreamWriter; |
|
9 |
import java.io.PrintStream; |
|
10 |
import java.io.PrintWriter; |
|
11 |
import java.util.ArrayList; |
|
12 |
import java.util.Arrays; |
|
13 |
import java.util.HashMap; |
|
14 |
import java.util.HashSet; |
|
15 |
import java.util.LinkedHashMap; |
|
16 |
import java.util.LinkedHashSet; |
|
17 |
import java.util.List; |
|
18 |
|
|
19 |
import org.eclipse.core.commands.AbstractHandler; |
|
20 |
import org.eclipse.core.commands.ExecutionEvent; |
|
21 |
import org.eclipse.core.commands.ExecutionException; |
|
22 |
import org.eclipse.core.runtime.IProgressMonitor; |
|
23 |
import org.eclipse.core.runtime.IStatus; |
|
24 |
import org.eclipse.core.runtime.Status; |
|
25 |
import org.eclipse.jface.viewers.ISelection; |
|
26 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
27 |
import org.eclipse.ui.IWorkbenchWindow; |
|
28 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
29 |
import org.kohsuke.args4j.Option; |
|
30 |
import org.txm.core.preferences.TBXPreferences; |
|
31 |
import org.txm.core.preferences.TXMPreferences; |
|
32 |
import org.txm.index.core.functions.Index; |
|
33 |
import org.txm.index.core.functions.Line; |
|
34 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
35 |
import org.txm.rcp.utils.JobHandler; |
|
36 |
import org.txm.searchengine.cqp.AbstractCqiClient; |
|
37 |
import org.txm.searchengine.cqp.CQPEngine; |
|
38 |
import org.txm.searchengine.cqp.corpus.Corpus; |
|
39 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
40 |
import org.txm.searchengine.cqp.corpus.Property; |
|
41 |
import org.txm.searchengine.cqp.corpus.query.Match; |
|
42 |
import org.txm.searchengine.cqp.corpus.query.Query; |
|
43 |
import org.txm.utils.DeleteDir; |
|
44 |
import org.txm.utils.io.IOUtils; |
|
45 |
import org.txm.utils.logger.Log; |
|
46 |
import org.txm.utils.treetagger.TreeTagger; |
|
47 |
|
|
48 |
/** |
|
49 |
* Our sample handler extends AbstractHandler, an IHandler base class. |
|
50 |
* @see org.eclipse.core.commands.IHandler |
|
51 |
* @see org.eclipse.core.commands.AbstractHandler |
|
52 |
*/ |
|
53 |
public class Train extends AbstractHandler { |
|
54 |
|
|
55 |
public Corpus corpus = null; |
|
56 |
|
|
57 |
@Option(name="model", usage="The model file to create", widget="CreateFile", required=true, def="fr.par") |
|
58 |
public File model = null; |
|
59 |
@Option(name="posProperty", usage="The pos property", widget="String", required=true, def="frpos") |
|
60 |
public String posProperty = null; |
|
61 |
@Option(name="sentenceTag", usage="The pos property", widget="String", required=true, def="SENT") |
|
62 |
public String sentenceTag = null; |
|
63 |
@Option(name="lemmaProperty", usage="The lemma property", widget="String", required=true, def="frlemma") |
|
64 |
public String lemmaProperty = null; |
|
65 |
@Option(name="lexique", usage="Lexicon file", widget="File", required=true, def="lexicon.txt") |
|
66 |
public File lexique = null; |
|
67 |
@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="") |
|
68 |
public String options = null; |
|
69 |
|
|
70 |
/** |
|
71 |
* |
|
72 |
*/ |
|
73 |
public Object execute(ExecutionEvent event) throws ExecutionException { |
|
74 |
|
|
75 |
|
|
76 |
IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event); |
|
77 |
|
|
78 |
ISelection isel = window.getActivePage().getSelection(); |
|
79 |
if (isel instanceof IStructuredSelection) { |
|
80 |
IStructuredSelection sel = (IStructuredSelection)isel; |
|
81 |
Object first = sel.getFirstElement(); |
|
82 |
if (first instanceof Corpus) { |
|
83 |
corpus = (Corpus)first; |
|
84 |
if (ParametersDialog.open(this)) { |
|
85 |
|
|
86 |
train(corpus, model, lexique, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split(" ")); |
|
87 |
|
|
88 |
return corpus; |
|
89 |
} |
|
90 |
} |
|
91 |
} |
|
92 |
|
|
93 |
System.out.println("Wrong selection."); |
|
94 |
return null; |
|
95 |
} |
|
96 |
|
|
97 |
public static void train(final Corpus corpus, final File model, final File lexique, final String[] properties, final String sentenceTag, final String[] options) { |
|
98 |
|
|
99 |
JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") { |
|
100 |
@Override |
|
101 |
protected IStatus run(IProgressMonitor monitor) { |
|
102 |
this.runInit(monitor); |
|
103 |
try { |
|
104 |
File lexique2 = lexique; |
|
105 |
MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
106 |
File corpusBinaryDirectory = mainCorpus.getBaseDirectory(); |
|
107 |
|
|
108 |
System.out.println("TRAIN : "+corpus+" with "+lexique2+" to create "+model+ " with properties "+Arrays.toString(properties)); |
|
109 |
|
|
110 |
if (properties == null || properties.length != 2) { |
|
111 |
System.out.println("Error can't continue with selected word properties: "+Arrays.toString(properties)); |
|
112 |
return Status.CANCEL_STATUS; |
|
113 |
} |
|
114 |
|
|
115 |
for (String p : properties) { |
|
116 |
Property prop = corpus.getProperty(p); |
|
117 |
if (prop == null) { |
|
118 |
System.out.println("Missing property in corpus: "+p); |
|
119 |
return Status.CANCEL_STATUS; |
|
120 |
} |
|
121 |
} |
|
122 |
|
|
123 |
Property pos = corpus.getProperty(properties[0]); |
|
124 |
Property lemma = corpus.getProperty(properties[1]); |
|
125 |
|
|
126 |
// Prepare temporary directory |
|
127 |
File treetaggerSrcDirectory = new File(mainCorpus.getBaseDirectory(), "treetagger"); |
|
128 |
DeleteDir.deleteDirectory(treetaggerSrcDirectory); |
|
129 |
treetaggerSrcDirectory.mkdirs(); |
|
130 |
|
|
131 |
HashMap<String, HashSet<String>> simplified_lexicon = null; |
|
132 |
HashMap<String, HashSet<String>> simplified_lexicon_errors = null; |
|
133 |
int error_counter = 0; |
|
134 |
// Create Lexicon file from an Index |
|
135 |
if (lexique2 == null || !lexique2.exists()) { |
|
136 |
System.out.println("Warning: no lexicon file or given lexicon file does not exist ("+lexique2+"). Using corpus Index..."); |
|
137 |
|
|
138 |
File lexiconfile = new File(treetaggerSrcDirectory, "lexicon.txt"); |
|
139 |
List<Property> corpusProperties = new ArrayList<Property>(); |
|
140 |
corpusProperties.add(mainCorpus.getProperty("word")); |
|
141 |
for (String p : properties) { |
|
142 |
Property prop = mainCorpus.getProperty(p); |
|
143 |
if (prop == null) { |
|
144 |
System.out.println("Error, a property is missing: "+p); |
|
145 |
return Status.CANCEL_STATUS; |
|
146 |
} |
|
147 |
corpusProperties.add(prop); |
|
148 |
} |
|
149 |
Index index = new Index(mainCorpus, new Query("[]"), corpusProperties); |
|
150 |
List<Line> lines = index.getAllLines(); |
|
151 |
LinkedHashMap<String, ArrayList<String>> lex = new LinkedHashMap<String, ArrayList<String>>(); |
|
152 |
HashMap<String, HashSet<String>> allPosValues = new HashMap<String, HashSet<String>>(); |
|
153 |
for (Line l : lines) { |
|
154 |
List<List<String>> values = l.getUnitsProperties(); |
|
155 |
String form = values.get(0).get(0); |
|
156 |
if (!lex.containsKey(form)) { |
|
157 |
ArrayList<String> pairs = new ArrayList<String>(); |
|
158 |
HashSet<String> posValues = new HashSet<String>(); |
|
159 |
|
|
160 |
allPosValues.put(form, posValues); |
|
161 |
lex.put(form, pairs); |
|
162 |
} |
|
163 |
ArrayList<String> pairs = lex.get(form); |
|
164 |
HashSet<String> posValues = allPosValues.get(form); |
|
165 |
String posValue = values.get(1).get(0); |
|
166 |
String lemmaValue = values.get(2).get(0); |
|
167 |
if (posValues.contains(posValue)) { |
|
168 |
|
|
169 |
} else { |
|
170 |
posValues.add(posValue); |
|
171 |
pairs.add(posValue); |
|
172 |
pairs.add(lemmaValue); |
|
173 |
} |
|
174 |
} |
|
175 |
|
|
176 |
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(lexiconfile), "UTF-8")); |
|
177 |
for (String form : lex.keySet()) { |
|
178 |
|
|
179 |
writer.write(form); |
|
180 |
boolean tab = true; |
|
181 |
for (String v : lex.get(form)) { |
|
182 |
if (tab) writer.write("\t"+v); |
|
183 |
else writer.write(" "+v); |
|
184 |
|
|
185 |
tab = !tab; |
|
186 |
} |
|
187 |
writer.write("\n"); |
|
188 |
} |
|
189 |
writer.close(); |
|
190 |
lexique2 = lexiconfile; |
|
191 |
} else { // diagnose lexicon content |
|
192 |
simplified_lexicon = new HashMap<String, HashSet<String>>(); |
|
193 |
simplified_lexicon_errors = new HashMap<String, HashSet<String>>(); |
|
194 |
BufferedReader reader = IOUtils.getReader(lexique2); |
|
195 |
String line = reader.readLine(); |
|
196 |
while (line != null) { |
|
197 |
String[] split = line.split("\t", 2); |
|
198 |
HashSet<String> posValues = new HashSet<String>(); |
|
199 |
simplified_lexicon.put(split[0], posValues); |
|
200 |
for (String poslemme : split[1].split("\t")) { |
|
201 |
String[] split2 = poslemme.split(" ", 2); |
|
202 |
posValues.add(split2[0]); |
|
203 |
} |
|
204 |
line = reader.readLine(); |
|
205 |
} |
|
206 |
reader.close(); |
|
207 |
} |
|
208 |
|
|
209 |
|
|
210 |
// create TT SRC file from CWB indexes |
|
211 |
|
|
212 |
File ttSrcFile = new File(treetaggerSrcDirectory, mainCorpus.getName()+".tt"); |
|
213 |
System.out.println("TT SRC file: "+ttSrcFile.getAbsolutePath()); |
|
214 |
BufferedOutputStream fos = new BufferedOutputStream(new FileOutputStream(ttSrcFile)); |
|
215 |
PrintStream ps = new PrintStream(fos); |
|
216 |
LinkedHashSet<Integer> positions = new LinkedHashSet<Integer>(); |
|
217 |
Property word = corpus.getProperty("word"); |
|
218 |
AbstractCqiClient CQI = CQPEngine.getCqiClient(); |
|
219 |
for (Match m : corpus.getMatches()) { |
|
220 |
for (int i = m.getStart() ; i <= m.getEnd() ; i++) { // end match must be included |
|
221 |
positions.add(i); |
|
222 |
|
|
223 |
if (positions.size() >= 1000) { // avoid too big array |
|
224 |
int[] positions_array = new int[positions.size()]; |
|
225 |
int ip = 0; |
|
226 |
for (int p : positions) positions_array[ip++] = p; |
|
227 |
String[] words = CQI.cpos2Str(word.getQualifiedName(), positions_array); |
|
228 |
String[] values = CQI.cpos2Str(pos.getQualifiedName(), positions_array); |
|
229 |
|
|
230 |
for (int iW = 0 ; iW < words.length ; iW++) { |
|
231 |
String w = words[iW]; |
|
232 |
if (w != null) { |
|
233 |
String s = w+"\t"+values[iW]; |
|
234 |
ps.println(s); |
|
235 |
|
|
236 |
if (simplified_lexicon != null) { // check given lexicon |
|
237 |
if (simplified_lexicon.containsKey(w)) { |
|
238 |
if (!simplified_lexicon.get(w).contains(values[iW])) { |
|
239 |
//System.out.println("Lexicon error: cannot find pos="+values[iW]+" for form="+w); |
|
240 |
if (!simplified_lexicon_errors.containsKey(w)) simplified_lexicon_errors.put(w, new HashSet<String>()); |
|
241 |
HashSet<String> error_values = simplified_lexicon_errors.get(w); |
|
242 |
error_values.add(values[iW]); |
|
243 |
error_counter++; |
|
244 |
} |
|
245 |
} else { |
|
246 |
//System.out.println("Lexicon error: cannot find form="+w); |
|
247 |
if (!simplified_lexicon_errors.containsKey(w)) simplified_lexicon_errors.put(w, new HashSet<String>()); |
|
248 |
HashSet<String> error_values = simplified_lexicon_errors.get(w); |
|
249 |
error_values.add("#"+values[iW]); |
|
250 |
error_counter++; |
|
251 |
} |
|
252 |
} |
|
253 |
} |
|
254 |
} |
|
255 |
positions.clear(); |
|
256 |
} |
|
257 |
} |
|
258 |
} |
|
259 |
if (positions.size() > 0) { // write last words |
|
260 |
int[] positions_array = new int[positions.size()]; |
|
261 |
int ip = 0; |
|
262 |
for (int p : positions) positions_array[ip++] = p; |
|
263 |
String[] words = CQI.cpos2Str(word.getQualifiedName(), positions_array); |
|
264 |
String[] values = CQI.cpos2Str(pos.getQualifiedName(), positions_array); |
|
265 |
|
|
266 |
for (int iW = 0 ; iW < words.length ; iW++) { |
|
267 |
String w = words[iW]; |
|
268 |
if (w != null) { |
|
269 |
String s = w+"\t"+values[iW]; |
|
270 |
ps.println(s); |
|
271 |
} |
|
272 |
} |
|
273 |
positions.clear(); |
|
274 |
} |
|
275 |
ps.close(); |
|
276 |
|
|
277 |
if (simplified_lexicon_errors != null && simplified_lexicon_errors.size() > 0) { |
|
278 |
File error_file = new File(treetaggerSrcDirectory, "errors.txt"); |
|
279 |
PrintWriter errorwriter = IOUtils.getWriter(error_file); |
|
280 |
int c = 0; |
|
281 |
System.out.println("Warning, lexicon errors ("+error_counter+") found with words:"); |
|
282 |
for (String w : simplified_lexicon_errors.keySet()) { |
|
283 |
errorwriter.println(w+"="+simplified_lexicon_errors.get(w)); |
|
284 |
if (c < 10) { |
|
285 |
System.out.println(w+"="+simplified_lexicon_errors.get(w)); |
|
286 |
c++; |
|
287 |
if (c == 10) System.out.println("... errors display is trucated, see "+error_file.getAbsolutePath()); |
|
288 |
} |
|
289 |
} |
|
290 |
errorwriter.close(); |
|
291 |
//System.out.println("Cannot apply train-treetagger if lexicon is missing words and pos."); |
|
292 |
//return Status.CANCEL_STATUS; |
|
293 |
File lexique3 = new File(lexique2.getParentFile(), lexique2.getName()+".fix"); |
|
294 |
BufferedReader reader = IOUtils.getReader(lexique2); |
|
295 |
PrintWriter writer = IOUtils.getWriter(lexique3); |
|
296 |
String line = reader.readLine(); |
|
297 |
while (line != null) { |
|
298 |
String w = line.split("\t", 2)[0]; |
|
299 |
|
|
300 |
if (simplified_lexicon_errors.containsKey(w)) { |
|
301 |
for (String p : simplified_lexicon_errors.get(w)) { |
|
302 |
if (!p.startsWith("#")) |
|
303 |
line += ("\t"+p+" <no_lemma>"); // append missing value |
|
304 |
} |
|
305 |
simplified_lexicon_errors.remove(w); |
|
306 |
} |
|
307 |
|
|
308 |
writer.println(line); |
|
309 |
line = reader.readLine(); |
|
310 |
} |
|
311 |
|
|
312 |
// write missing words |
|
313 |
for (String w2 : simplified_lexicon_errors.keySet()) { |
|
314 |
writer.print(w2); |
|
315 |
for (String p : simplified_lexicon_errors.get(w2)) { |
|
316 |
writer.print("\t"+p+" <no_lemma>"); |
|
317 |
} |
|
318 |
writer.println(""); |
|
319 |
} |
|
320 |
|
|
321 |
reader.close(); |
|
322 |
writer.close(); |
|
323 |
System.out.println("Adding words to a temporary lexicon: "+lexique3); |
|
324 |
lexique2 = lexique3; |
|
325 |
} |
|
326 |
|
|
327 |
// Create open class file : contains all pos values |
|
328 |
File openclassfile = new File(treetaggerSrcDirectory, "openclasses.txt"); |
|
329 |
PrintWriter openClassFileWriter = IOUtils.getWriter(openclassfile); |
|
330 |
|
|
331 |
// Lexicon poslexicon = corpus.getLexicon(pos); |
|
332 |
// String[] posValues = poslexicon.getForms(); |
|
333 |
// for (int iV = 0 ; iV < posValues.length ; iV++) { |
|
334 |
// if (iV == 0) openClassFileWriter.print(posValues[iV]); |
|
335 |
// else openClassFileWriter.print(" "+posValues[iV]); |
|
336 |
// } |
|
337 |
openClassFileWriter.close(); |
|
338 |
|
|
339 |
// Call treetagger-train |
|
340 |
if (ttSrcFile.exists() && lexique2.exists() && openclassfile.exists()) { |
|
341 |
System.out.println("Running "); |
|
342 |
String treetaggerBinDirectory = new File(TXMPreferences.getString(TBXPreferences.TREETAGGER_INSTALL_PATH, TBXPreferences.PREFERENCES_NODE), "bin").getAbsolutePath(); |
|
343 |
if (!treetaggerBinDirectory.endsWith("/")) treetaggerBinDirectory += "/"; |
|
344 |
|
|
345 |
TreeTagger tt = new TreeTagger(treetaggerBinDirectory, options); |
|
346 |
tt.settoken(); |
|
347 |
tt.setquiet(); |
|
348 |
tt.setlemma(); |
|
349 |
tt.setsgml(); |
|
350 |
tt.setst(sentenceTag); |
|
351 |
tt.setproto(); |
|
352 |
tt.setutf8(); |
|
353 |
tt.debug(true); |
|
354 |
tt.traintreetagger(lexique2.getAbsolutePath(), openclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath()); |
|
355 |
|
|
356 |
System.out.println("Done: "+model.getAbsolutePath()); |
|
357 |
} else { |
|
358 |
System.out.println("Aborting."); |
|
359 |
} |
|
360 |
|
|
361 |
return Status.OK_STATUS; |
|
362 |
} catch (Exception e) { |
|
363 |
System.out.println("Error while training TT: "+e); |
|
364 |
Log.printStackTrace(e); |
|
365 |
} |
|
366 |
return Status.CANCEL_STATUS; |
|
367 |
} |
|
368 |
}; |
|
369 |
job.schedule(); |
|
370 |
} |
|
371 |
} |
|
0 | 372 |
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/Apply.java (revision 460) | ||
---|---|---|
1 |
package org.txm.treetagger.commands; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.io.FileFilter; |
|
5 |
import java.util.Arrays; |
|
6 |
import java.util.HashMap; |
|
7 |
|
|
8 |
import org.eclipse.core.commands.AbstractHandler; |
|
9 |
import org.eclipse.core.commands.ExecutionEvent; |
|
10 |
import org.eclipse.core.commands.ExecutionException; |
|
11 |
import org.eclipse.core.runtime.IProgressMonitor; |
|
12 |
import org.eclipse.core.runtime.IStatus; |
|
13 |
import org.eclipse.core.runtime.Status; |
|
14 |
import org.eclipse.jface.viewers.ISelection; |
|
15 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
16 |
import org.eclipse.ui.IWorkbenchWindow; |
|
17 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
18 |
import org.kohsuke.args4j.Option; |
|
19 |
import org.txm.rcp.commands.workspace.UpdateCorpus; |
|
20 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
21 |
import org.txm.rcp.utils.JobHandler; |
|
22 |
import org.txm.scripts.teitxm.Annotate; |
|
23 |
import org.txm.searchengine.cqp.corpus.Corpus; |
|
24 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
25 |
|
|
26 |
/** |
|
27 |
* Our sample handler extends AbstractHandler, an IHandler base class. |
|
28 |
* @see org.eclipse.core.commands.IHandler |
|
29 |
* @see org.eclipse.core.commands.AbstractHandler |
|
30 |
*/ |
|
31 |
public class Apply extends AbstractHandler { |
|
32 |
|
|
33 |
@Option(name="model", usage="Model file", widget="File", required=true, def="model.par") |
|
34 |
public File model = null; |
|
35 |
@Option(name="posProperty", usage="The pos property", widget="String", required=true, def="frpos") |
|
36 |
public String posProperty = null; |
|
37 |
@Option(name="lemmaProperty", usage="The lemma property", widget="String", required=true, def="frlemma") |
|
38 |
public String lemmaProperty = null; |
|
39 |
@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="") |
|
40 |
public String options = null; |
|
41 |
|
|
42 |
/** |
|
43 |
* |
|
44 |
*/ |
|
45 |
public Object execute(ExecutionEvent event) throws ExecutionException { |
|
46 |
Corpus corpus = null; |
|
47 |
IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event); |
|
48 |
|
|
49 |
ISelection isel = window.getActivePage().getSelection(); |
|
50 |
if (isel instanceof IStructuredSelection) { |
|
51 |
IStructuredSelection sel = (IStructuredSelection)isel; |
|
52 |
Object first = sel.getFirstElement(); |
|
53 |
if (first instanceof Corpus) { |
|
54 |
corpus = (Corpus)first; |
|
55 |
if (ParametersDialog.open(this)) { |
|
56 |
apply(corpus, model, new String[]{posProperty, lemmaProperty}, options.split(" ")); |
|
57 |
return corpus; |
|
58 |
} |
|
59 |
} |
|
60 |
} |
|
61 |
|
|
62 |
System.out.println("Wrong selection."); |
|
63 |
return null; |
|
64 |
} |
|
65 |
|
|
66 |
public static void apply(Corpus corpus, final File model, final String[] properties, final String[] options) { |
|
67 |
final MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
68 |
final File corpusBinaryDirectory = mainCorpus.getBaseDirectory(); |
|
69 |
final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName()); |
|
70 |
|
|
71 |
if (!txmDirectory.exists()) { |
|
72 |
System.out.println("Can't apply TreeTagger to a corpus with no XML-TXM files."); |
|
73 |
} |
|
74 |
|
|
75 |
final File[] files = txmDirectory.listFiles(new FileFilter() { |
|
76 |
@Override |
|
77 |
public boolean accept(File file) { |
|
78 |
return file.isFile() && file.canWrite() && file.getName().endsWith(".xml"); |
|
79 |
} |
|
80 |
}); |
|
81 |
|
|
82 |
if (files == null || files.length == 0) { |
|
83 |
System.out.println("Can't apply TreeTagger to a corpus with no XML-TXM files in "+txmDirectory); |
|
84 |
} |
|
85 |
|
|
86 |
String lang = model.getName(); |
|
87 |
if (!lang.endsWith(".par")) { |
|
88 |
System.out.println("Model file name must ends with the '.par' extension"); |
|
89 |
return; |
|
90 |
} |
|
91 |
lang = lang.substring(0, lang.indexOf(".par")); |
|
92 |
|
|
93 |
final HashMap<String, String> hash = new HashMap<String, String>(); |
|
94 |
for (File txmFile : files) { |
|
95 |
hash.put(txmFile.getName(), lang); |
|
96 |
} |
|
97 |
|
|
98 |
for (int i = 0 ; i < properties.length ; i++) properties[i] = properties[i].trim(); |
|
99 |
|
|
100 |
System.out.println("APPLY : "+model+" to "+corpus+" updating "+Arrays.toString(properties)+ " with options "+Arrays.toString(options)); |
|
101 |
JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") { |
|
102 |
@Override |
|
103 |
protected IStatus run(IProgressMonitor monitor) { |
|
104 |
this.runInit(monitor); |
|
105 |
Annotate annotator = new Annotate(); |
|
106 |
annotator.setModelsDirectory(model.getParentFile()); |
|
107 |
annotator.setDebug(); |
|
108 |
if (!annotator.run(corpusBinaryDirectory, txmDirectory, hash, true, properties, options)) { |
|
109 |
System.out.println("Fail to apply TreeTagger with "+txmDirectory+" files."); |
|
110 |
return Status.CANCEL_STATUS; |
|
111 |
} |
|
112 |
System.out.println("Done. Updating corpus..."); |
|
113 |
|
|
114 |
if (UpdateCorpus.update(mainCorpus) == null) { |
|
115 |
System.out.println("Fail to update corpus indexes and editions."); |
|
116 |
} |
|
117 |
System.out.println("Done."); |
|
118 |
return Status.OK_STATUS;//frppos |
|
119 |
} |
|
120 |
}; |
|
121 |
job.schedule(); |
|
122 |
} |
|
123 |
} |
|
0 | 124 |
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/LemmaProjection.java (revision 460) | ||
---|---|---|
1 |
package org.txm.treetagger.commands; |
|
2 |
|
|
3 |
import java.io.BufferedReader; |
|
4 |
import java.io.File; |
|
5 |
import java.io.FileFilter; |
|
6 |
import java.io.PrintWriter; |
|
7 |
import java.util.Arrays; |
|
8 |
import java.util.Collections; |
|
9 |
import java.util.HashSet; |
|
10 |
import java.util.LinkedHashMap; |
|
11 |
import java.util.LinkedHashSet; |
|
12 |
|
|
13 |
import org.apache.commons.lang.StringUtils; |
|
14 |
import org.eclipse.core.commands.AbstractHandler; |
|
15 |
import org.eclipse.core.commands.ExecutionEvent; |
|
16 |
import org.eclipse.core.commands.ExecutionException; |
|
17 |
import org.eclipse.core.runtime.IProgressMonitor; |
|
18 |
import org.eclipse.core.runtime.IStatus; |
|
19 |
import org.eclipse.core.runtime.Status; |
|
20 |
import org.eclipse.jface.viewers.ISelection; |
|
21 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
22 |
import org.eclipse.ui.IWorkbenchWindow; |
|
23 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
24 |
import org.kohsuke.args4j.Option; |
|
25 |
import org.txm.Toolbox; |
|
26 |
import org.txm.core.preferences.TBXPreferences; |
|
27 |
import org.txm.core.preferences.TXMPreferences; |
|
28 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
29 |
import org.txm.rcp.utils.JobHandler; |
|
30 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
|
31 |
import org.txm.searchengine.cqp.corpus.Corpus; |
|
32 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
33 |
import org.txm.searchengine.cqp.corpus.Property; |
|
34 |
import org.txm.stat.utils.ConsoleProgressBar; |
|
35 |
import org.txm.utils.DeleteDir; |
|
36 |
import org.txm.utils.io.FileCopy; |
|
37 |
import org.txm.utils.io.IOUtils; |
|
38 |
import org.txm.utils.Tuple; |
|
39 |
|
|
40 |
/** |
|
41 |
* Our sample handler extends AbstractHandler, an IHandler base class. |
|
42 |
* @see org.eclipse.core.commands.IHandler |
|
43 |
* @see org.eclipse.core.commands.AbstractHandler |
|
44 |
*/ |
|
45 |
public class LemmaProjection extends AbstractHandler { |
|
46 |
|
|
47 |
protected static final String EXTRA = "extra"; |
|
48 |
@Option(name="dictionary", usage="TSV Dictionary file with form, msd, lemma, source columns", widget="File", required=true, def="frolex.tsv") |
|
49 |
public File dictionary = null; |
|
50 |
@Option(name="extrarules", usage="form+pos rules files", widget="File", required=false, def="extrarules.tsv") |
|
51 |
public File extrarules = null; |
|
52 |
@Option(name="posproperty", usage="The lexicon property to read", widget="String", required=true, def="frpos") |
|
53 |
public String posproperty = null; |
|
54 |
@Option(name="lemmaproperty", usage="The property to create/update in the corpus", widget="String", required=true, def="plemma") |
|
55 |
public String lemmaproperty = null; |
|
56 |
@Option(name="formAsLemmaPosList", usage="Pos values lemma exceptions", widget="String", required=false, def="NOMPro") |
|
57 |
public String formAsLemmaPosList = null; |
|
58 |
@Option(name="sourcePriorityList", usage="The property to create/update in the corpus", widget="String", required=true, def="TL") |
|
59 |
public String sourcePriorityList = null; |
|
60 |
|
|
61 |
/** |
|
62 |
* |
|
63 |
*/ |
|
64 |
public Object execute(ExecutionEvent event) throws ExecutionException { |
|
65 |
Corpus corpus = null; |
|
66 |
IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event); |
|
67 |
|
|
68 |
ISelection isel = window.getActivePage().getSelection(); |
|
69 |
if (isel instanceof IStructuredSelection) { |
|
70 |
IStructuredSelection sel = (IStructuredSelection)isel; |
|
71 |
Object first = sel.getFirstElement(); |
|
72 |
if (first instanceof Corpus) { |
|
73 |
corpus = (Corpus)first; |
|
74 |
if (ParametersDialog.open(this)) { |
|
75 |
LinkedHashSet<String> formAsLemmaPosSet = new LinkedHashSet<String>(); |
|
76 |
formAsLemmaPosSet.addAll(Arrays.asList(formAsLemmaPosList.split(","))); |
|
77 |
LinkedHashSet<String> sourcePrioritySet = new LinkedHashSet<String>(); |
|
78 |
if (extrarules != null && extrarules.exists()) sourcePrioritySet.add(EXTRA); // extra must be the first source |
|
79 |
sourcePrioritySet.addAll(Arrays.asList(sourcePriorityList.split(","))); |
|
80 |
|
|
81 |
System.out.println("formAsLemmaPosSet="+formAsLemmaPosSet); |
|
82 |
System.out.println("sourcePrioritySet="+sourcePrioritySet); |
|
83 |
apply(corpus, dictionary, extrarules, posproperty, lemmaproperty, formAsLemmaPosSet, sourcePrioritySet); |
|
84 |
return corpus; |
|
85 |
} |
|
86 |
} |
|
87 |
} |
|
88 |
|
|
89 |
System.out.println("Wrong selection."); |
|
90 |
return null; |
|
91 |
} |
|
92 |
|
|
93 |
public static void apply(final Corpus corpus, final File dictionary, final File extrarules, final String posproperty, |
|
94 |
final String targetproperty, final LinkedHashSet<String> formAsLemmaPosList, final LinkedHashSet<String> sourceprioritylist) { |
|
95 |
final MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
96 |
final File corpusBinaryDirectory = mainCorpus.getBaseDirectory(); |
|
97 |
final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName()); |
|
98 |
|
|
99 |
if (!txmDirectory.exists()) { |
|
100 |
System.out.println("Can't process a corpus with no XML-TXM files directory: "+txmDirectory); |
|
101 |
return; |
|
102 |
} |
|
103 |
|
|
104 |
final File[] files = txmDirectory.listFiles(new FileFilter() { |
|
105 |
@Override |
|
106 |
public boolean accept(File file) { |
|
107 |
return file.isFile() && file.canWrite() && file.getName().endsWith(".xml"); |
|
108 |
} |
|
109 |
}); |
|
110 |
|
|
111 |
Property pos = null; |
|
112 |
try { |
|
113 |
pos = mainCorpus.getProperty(posproperty); |
|
114 |
} catch (CqiClientException e1) { |
|
115 |
// TODO Auto-generated catch block |
|
116 |
e1.printStackTrace(); |
|
117 |
} |
|
118 |
if (pos == null) { |
|
119 |
System.out.println("No pos property found with name="+posproperty); |
|
120 |
return; |
|
121 |
} |
|
122 |
|
|
123 |
if (files == null || files.length == 0) { |
|
124 |
System.out.println("Can't process a corpus with no XML-TXM files in "+txmDirectory); |
|
125 |
return; |
|
126 |
} |
|
127 |
|
|
128 |
System.out.println("APPLYING : "+dictionary+" to "+mainCorpus+": creating/updating "+targetproperty+ " property with lexicon "+dictionary); |
|
129 |
JobHandler job = new JobHandler("Creating/Updating "+targetproperty+" property.") { |
|
130 |
@Override |
|
131 |
protected IStatus run(IProgressMonitor monitor) { |
|
132 |
this.runInit(monitor); |
|
133 |
Tuple t; |
|
134 |
LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = new LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>>(); |
|
135 |
HashSet<String> formAsLemmaPosSet = new HashSet<String>(); |
|
136 |
try { |
|
137 |
// load rules |
|
138 |
BufferedReader reader = IOUtils.getReader(dictionary); |
|
139 |
String line = reader.readLine(); |
|
140 |
while (line != null) { |
|
141 |
String[] splitTab = line.split("\t"); |
|
142 |
if (splitTab.length != 5) { |
|
143 |
System.out.println("Error in dictionary files with line='"+line+"': length is not 5. Found: "+Arrays.toString(splitTab)); |
|
144 |
line = reader.readLine(); |
|
145 |
reader.close(); |
|
146 |
return Status.CANCEL_STATUS; |
|
147 |
} |
|
148 |
String form = splitTab[0]; |
|
149 |
String pos = splitTab[1];//.replace("<no_pos>|", "").replace("|<no_pos>|", "").replace("|<no_pos>", ""); |
|
150 |
String lemma = splitTab[2];//.replace("<no_lemma>|", "").replace("|<no_lemma>|", "").replace("|<no_lemma>", ""); |
|
151 |
String source = splitTab[3]; |
|
152 |
|
|
153 |
if (! rules.containsKey(form)) rules.put(form, new LinkedHashMap<String, LinkedHashMap<String, String>>()); |
|
154 |
LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(form); |
|
155 |
|
|
156 |
if (!lemma.equals("<no_lemma>")) { |
|
157 |
if (!posHash.containsKey(pos)) posHash.put(pos, new LinkedHashMap<String, String>()); |
|
158 |
LinkedHashMap<String, String> sourceHash = posHash.get(pos); |
|
159 |
|
|
160 |
sourceHash.put(source, lemma); |
|
161 |
} |
|
162 |
line = reader.readLine(); |
|
163 |
} |
|
164 |
reader.close(); |
|
165 |
System.out.println("Dictionary rules loaded: "+rules.size()); |
|
166 |
|
|
167 |
if (extrarules.exists()) { |
|
168 |
reader = IOUtils.getReader(extrarules); |
|
169 |
line = reader.readLine(); |
|
170 |
while (line != null) { |
|
171 |
String[] splitTab = line.split("\t"); |
|
172 |
if (splitTab.length != 3) { |
|
173 |
System.out.println("Error in extra rule files with line='"+line+"': length is not 3."); |
|
174 |
line = reader.readLine(); |
|
175 |
continue; |
|
176 |
} |
|
177 |
String form = splitTab[0]; |
|
178 |
String pos = splitTab[1]; |
|
179 |
String lemma = splitTab[2]; |
|
180 |
if (! rules.containsKey(form)) rules.put(form, new LinkedHashMap<String, LinkedHashMap<String, String>>()); |
|
181 |
LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(form); |
|
182 |
|
|
183 |
if (!posHash.containsKey(pos)) posHash.put(pos, new LinkedHashMap<String, String>()); |
|
184 |
LinkedHashMap<String, String> sourceHash = posHash.get(pos); |
|
185 |
sourceHash.put(EXTRA, lemma); |
|
186 |
|
|
187 |
} |
|
188 |
reader.close(); |
|
189 |
System.out.println("Dictionary extra rules loaded: "+rules.size()); |
|
190 |
} else { |
|
191 |
System.out.println("No extra rule loaded."); |
|
192 |
} |
|
193 |
|
|
194 |
PrintWriter writer = IOUtils.getWriter("/tmp/rules.txt"); |
|
195 |
for (String k : rules.keySet()) { |
|
196 |
writer.println("FORM="+k); |
|
197 |
LinkedHashMap<String, LinkedHashMap<String, String>> rules2 = rules.get(k); |
|
198 |
for (String k2 : rules2.keySet()) { |
|
199 |
writer.println(" POS="+k2); |
|
200 |
LinkedHashMap<String, String> rules3 = rules2.get(k2); |
|
201 |
for (String k3 : rules3.keySet()) { |
|
202 |
writer.println(" SOURCE="+k3); |
|
203 |
String ls2 = rules3.get(k3); |
|
204 |
writer.println(" LEMMA="+ls2); |
|
205 |
} |
|
206 |
} |
|
207 |
} |
|
208 |
writer.close(); |
|
209 |
System.out.println("RULE DUMP: /tmp/rules.txt"); |
|
210 |
|
|
211 |
// load rules |
|
212 |
for (String s : formAsLemmaPosList) { |
|
213 |
formAsLemmaPosSet.add(s); |
|
214 |
} |
|
215 |
System.out.println("POS exception rules loaded: "+formAsLemmaPosSet.size()); |
|
216 |
|
|
217 |
// save previous version of XML-TXM files |
|
218 |
File previousXMLTXMDirectory = new File(txmDirectory.getAbsolutePath()+"_previous"); |
|
219 |
DeleteDir.deleteDirectory(previousXMLTXMDirectory); |
|
220 |
FileCopy.copyFiles(txmDirectory, previousXMLTXMDirectory); |
|
221 |
|
|
222 |
// work |
|
223 |
File noMatchsFile = new File(TXMPreferences.getString(TBXPreferences.USER_TXM_HOME, TBXPreferences.PREFERENCES_NODE), "results/nomatch.txt"); |
|
224 |
HashSet<String> noMatchsSet = new HashSet<String>(); |
|
225 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.length); |
|
226 |
for (File xmlFile : files) { |
|
227 |
cpb.tick(); |
|
228 |
XMLLemmaProjection p = new XMLLemmaProjection(xmlFile, rules, formAsLemmaPosSet, sourceprioritylist, posproperty, targetproperty); |
|
229 |
File tmpFile = new File(xmlFile.getParentFile(), "tmp_"+xmlFile.getName()); |
|
230 |
if (p.process(tmpFile)) { |
|
231 |
if (xmlFile.delete() && tmpFile.renameTo(xmlFile)) { |
|
232 |
// ok |
|
233 |
} else { |
|
234 |
System.out.println("Error during lemma projection: can't replace XML-TXM file: "+xmlFile); |
|
235 |
return Status.CANCEL_STATUS; |
|
236 |
} |
|
237 |
} else { |
|
238 |
System.out.println("Error during lemma projection. Aborting."); |
|
239 |
return Status.CANCEL_STATUS; |
|
240 |
} |
|
241 |
if (p.getNoMatchValues().size() > 0) { |
|
242 |
System.out.println("No matchs found with file "+xmlFile.getName()+": "+p.getNoMatchValues()); |
|
243 |
noMatchsSet.addAll(p.getNoMatchValues()); |
|
244 |
} |
|
245 |
} |
|
246 |
|
|
247 |
if (noMatchsSet.size() > 0) { |
|
248 |
System.out.println("Missing lemma values report saved in: "+noMatchsFile); |
|
249 |
IOUtils.write(noMatchsFile, StringUtils.join(noMatchsSet, "\n")); |
|
250 |
} |
|
251 |
|
|
252 |
cpb.done(); |
|
253 |
monitor.worked(50); |
|
254 |
|
|
255 |
// update corpus |
|
256 |
// update corpus indexes and edition |
|
257 |
// String txmhome = Toolbox.getParam(Toolbox.USER_TXM_HOME); |
|
258 |
// |
|
259 |
// BaseParameters params = corpus.getBase().getBaseParameters(); |
|
260 |
// params.getKeyValueParameters().put(ImportKeys.MULTITHREAD, "false"); //too soon |
|
261 |
// params.getKeyValueParameters().put(ImportKeys.DEBUG, Log.getLevel().intValue() < Level.WARNING.intValue()); // need debug for experimental stuff |
|
262 |
// params.getKeyValueParameters().put(ImportKeys.UPDATECORPUS, "true"); |
|
263 |
// |
|
264 |
// monitor.setTaskName("Updating corpus"); |
|
265 |
// File scriptDir = new File(txmhome, "scripts/import"); |
|
266 |
// File script = new File(scriptDir, "xtzLoader.groovy"); |
|
267 |
// System.out.println("Updating corpus "+corpus+" using "+params.paramFile); |
|
268 |
// boolean ret = ExecuteImportScript.executeScript(script.getAbsolutePath(), params); |
|
269 |
// if (!ret) { |
|
270 |
// System.out.println("Error during corpus re-import, check the XML-TXM files. Previous version can be restored from "+previousXMLTXMDirectory); |
|
271 |
// return Status.CANCEL_STATUS; |
|
272 |
// } |
|
273 |
// Display.getDefault().syncExec(new Runnable() { |
|
274 |
// @Override |
|
275 |
// public void run() {CloseEditorsUsing.corpus(corpus);} |
|
276 |
// }); |
|
277 |
// monitor.worked(50); |
|
278 |
|
|
279 |
} catch (Exception e) { |
|
280 |
e.printStackTrace(); |
|
281 |
return Status.CANCEL_STATUS; |
|
282 |
} |
|
283 |
System.out.println("Done."); |
|
284 |
return Status.OK_STATUS; |
|
285 |
} |
|
286 |
}; |
|
287 |
job.schedule(); |
|
288 |
} |
|
289 |
} |
|
0 | 290 |
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/XMLLemmaProjection.java (revision 460) | ||
---|---|---|
1 |
package org.txm.treetagger.commands; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.io.IOException; |
|
5 |
import java.util.HashSet; |
|
6 |
import java.util.LinkedHashMap; |
|
7 |
import java.util.LinkedHashSet; |
|
8 |
import java.util.regex.Pattern; |
|
9 |
|
|
10 |
import javax.xml.stream.XMLStreamException; |
|
11 |
|
|
12 |
import org.txm.importer.StaxIdentityParser; |
|
13 |
|
|
14 |
public class XMLLemmaProjection extends StaxIdentityParser { |
|
15 |
|
|
16 |
// form -> pos -> source -> lemma |
|
17 |
protected LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = null; |
|
18 |
protected HashSet<String> formAsLemmaPosList = null; |
|
19 |
protected String lemmaProperty; |
|
20 |
|
|
21 |
protected HashSet<String> noMatchValues = new HashSet<String>(); |
|
22 |
protected String posProperty; |
|
23 |
protected LinkedHashSet<String> lemmaSourcePriorityList; |
|
24 |
|
|
25 |
public XMLLemmaProjection(File infile, LinkedHashMap<String, LinkedHashMap<String, |
|
26 |
LinkedHashMap<String, String>>> rules, |
|
27 |
HashSet<String> formAsLemmaPosList, |
|
28 |
LinkedHashSet<String> lemmaSourcePriorityList, |
|
29 |
String posProperty, String lemmaProperty) throws IOException, XMLStreamException { |
|
30 |
super(infile); |
|
31 |
this.rules = rules; |
|
32 |
this.formAsLemmaPosList = formAsLemmaPosList; |
|
33 |
this.lemmaSourcePriorityList = lemmaSourcePriorityList; |
|
34 |
this.lemmaProperty = lemmaProperty; |
|
35 |
this.posProperty = posProperty; |
|
36 |
|
|
37 |
// the XML-TXM files word properties name starts wit # (they are references) |
|
38 |
if (!this.lemmaProperty.startsWith("#")) this.lemmaProperty = "#"+this.lemmaProperty; |
|
39 |
if (!this.posProperty.startsWith("#")) this.posProperty = "#"+this.posProperty; |
|
40 |
} |
|
41 |
|
|
42 |
boolean inW = false, inAna = false, inForm; |
|
43 |
LinkedHashMap<String, String> anaValues = new LinkedHashMap<String, String>(); |
|
44 |
LinkedHashMap<String, String> anaResps = new LinkedHashMap<String, String>(); |
|
45 |
String typeName = null; |
|
46 |
String respName = null; |
|
47 |
String formValue, typeValue = null; |
|
48 |
|
|
49 |
@Override |
|
50 |
public void processStartElement() throws XMLStreamException, IOException { |
|
51 |
if (!inW) super.processStartElement(); // don't write W content |
|
52 |
|
|
53 |
if (localname.equals("w")) { |
|
54 |
inW = true; |
|
55 |
anaValues.clear(); |
|
56 |
anaResps.clear(); |
|
57 |
|
|
58 |
//initialize the new type to a empty value in case there is transformation rule |
|
59 |
anaValues.put(lemmaProperty, ""); |
|
60 |
anaResps.put(lemmaProperty, "#txm_recode"); |
|
61 |
} else if (localname.equals("ana")) { |
|
62 |
inAna = true; |
|
63 |
typeName = parser.getAttributeValue(null, "type"); |
|
64 |
respName = parser.getAttributeValue(null, "resp"); |
|
65 |
anaResps.put(typeName, respName); |
|
66 |
//if (typeName != null) typeName = typeName.substring(1); // remove # |
|
67 |
typeValue = ""; |
|
68 |
} else if (localname.equals("form")) { |
|
69 |
inForm = true; |
|
70 |
formValue = ""; |
|
71 |
} |
|
72 |
} |
|
73 |
|
|
74 |
@Override |
|
75 |
public void processCharacters() throws XMLStreamException { |
|
76 |
if (inW && inAna) typeValue+=parser.getText(); |
|
77 |
else if (inW && inForm) formValue+=parser.getText(); |
|
78 |
else super.processCharacters(); |
|
79 |
} |
|
80 |
|
|
81 |
@Override |
|
82 |
public void processEndElement() throws XMLStreamException { |
|
83 |
if (localname.equals("w")) { |
|
84 |
inW = false; |
|
85 |
|
|
86 |
// write W content |
|
87 |
try { |
|
88 |
// get the value to test |
|
89 |
String posValue = anaValues.get(posProperty); |
|
90 |
if (posValue == null) { |
|
91 |
posValue = "<no_pos>"; |
|
92 |
// anaValues.put(posProperty, "<no_pos>"); |
|
93 |
// anaResps.put(posProperty, "txm_recode"); |
|
94 |
} |
|
95 |
String value = updateAnaValuesIfMatch(formValue.trim(), posValue.trim()); |
|
96 |
//System.out.println("form="+formValue+" + pos="+posValue+" -> "+value); |
|
97 |
anaValues.put(lemmaProperty, value); |
|
98 |
anaResps.put(lemmaProperty, "#txm_recode"); |
|
99 |
|
|
100 |
// write the word element |
|
101 |
writer.writeStartElement("txm:form"); |
|
102 |
writer.writeCharacters(formValue); |
|
103 |
writer.writeEndElement(); |
|
104 |
|
|
105 |
for (String k : anaValues.keySet()) { |
|
106 |
writer.writeStartElement("txm:ana"); |
|
107 |
writer.writeAttribute("resp", anaResps.get(k)); |
|
108 |
writer.writeAttribute("type", k); |
|
109 |
writer.writeCharacters(anaValues.get(k)); |
|
110 |
writer.writeEndElement(); |
|
111 |
} |
|
112 |
} catch (XMLStreamException e) { |
|
113 |
e.printStackTrace(); |
|
114 |
} |
|
115 |
} else if (localname.equals("ana")) { |
|
116 |
anaValues.put(typeName, typeValue); |
|
117 |
inAna = false; |
|
118 |
} else if (localname.equals("form")) { |
|
119 |
inForm = false; |
|
120 |
} |
|
121 |
|
|
122 |
if (!inW) super.processEndElement(); // don't write W content now |
|
123 |
} |
|
124 |
|
|
125 |
protected String updateAnaValuesIfMatch(String formValue, String posValue) { |
|
126 |
if (posValue == null) return ""; |
|
127 |
|
|
128 |
if (formAsLemmaPosList.contains(posValue)) return formValue; |
|
129 |
|
|
130 |
|
|
131 |
if (formValue.equals("virge")) System.out.println("testing: "+formValue+" "+posValue); |
|
132 |
if (formValue.equals("virge")) System.out.println("form connue? "+rules.containsKey(formValue)); |
|
133 |
if (rules.containsKey(formValue)) { |
|
134 |
LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(formValue); |
|
135 |
// if (posHash.containsKey(posValue)) { |
|
136 |
// LinkedHashMap<String, String> sourceHash = posHash.get(posValue); |
|
137 |
// for (String source : lemmaSourcePriorityList) { |
|
138 |
// if (sourceHash.containsKey(source)) { |
|
139 |
// return sourceHash.get(source); |
|
140 |
// } |
|
141 |
// } |
|
142 |
// } |
|
143 |
if (formValue.equals("virge")) System.out.println(" tests"+posHash.keySet()); |
|
144 |
for (String posRegexp : posHash.keySet()) { |
|
145 |
|
|
146 |
if (posValue.matches(posRegexp)) { |
|
147 |
return posHash.get(posRegexp).toString(); |
|
148 |
} |
|
149 |
} |
|
150 |
} |
|
151 |
|
|
152 |
// try without maj |
|
153 |
String formValueMin = formValue.toLowerCase(); |
|
154 |
if (rules.containsKey(formValueMin)) { |
|
155 |
LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(formValueMin); |
|
156 |
// if (posHash.containsKey(posValue)) { |
|
157 |
// LinkedHashMap<String, String> sourceHash = posHash.get(posValue); |
|
158 |
// for (String source : lemmaSourcePriorityList) { |
|
159 |
// if (sourceHash.containsKey(source)) { |
|
160 |
// return sourceHash.get(source); |
|
161 |
// } |
|
162 |
// } |
|
163 |
// } |
|
164 |
|
|
165 |
for (String posRegexp : posHash.keySet()) { |
|
166 |
if (posValue.matches(posRegexp)) { |
|
167 |
return posHash.get(posRegexp).toString(); |
|
168 |
} |
|
169 |
} |
|
170 |
} |
|
171 |
|
|
172 |
noMatchValues.add(formValue+"|"+posValue); |
|
173 |
return "!"+formValue; |
|
174 |
} |
|
175 |
|
|
176 |
public HashSet<String> getNoMatchValues() { |
|
177 |
return noMatchValues; |
|
178 |
} |
|
179 |
|
|
180 |
public static void main(String args[]) { |
|
181 |
File xmlFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test.xml"); |
|
182 |
File tmpFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test-o.xml"); |
|
183 |
String posProperty = "type"; |
|
184 |
String newType = "lemma"; |
|
185 |
LinkedHashMap<Pattern[], String> rules = new LinkedHashMap<Pattern[], String>(); |
|
186 |
rules.put(new Pattern[]{Pattern.compile("w"), Pattern.compile("w")}, "WORD"); |
|
187 |
rules.put(new Pattern[]{Pattern.compile("x.+"), Pattern.compile("w")}, "XWORD"); |
|
188 |
rules.put(new Pattern[]{Pattern.compile("y"), Pattern.compile("w")}, "YWORD"); |
|
189 |
rules.put(new Pattern[]{Pattern.compile("y.*"), Pattern.compile("w")}, "YMULTIWORD"); |
|
190 |
//XMLPropertyProjection converter = new XMLPropertyProjection(xmlFile, rules, posProperty, newType); |
|
191 |
//System.out.println(converter.process(tmpFile)); |
|
192 |
} |
|
193 |
} |
|
0 | 194 |
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/RemoveProperties.java (revision 460) | ||
---|---|---|
1 |
package org.txm.treetagger.commands; |
|
2 |
|
|
3 |
import java.io.BufferedReader; |
|
4 |
import java.io.File; |
|
5 |
import java.io.FileFilter; |
|
6 |
import java.util.Arrays; |
|
7 |
import java.util.HashSet; |
|
8 |
import java.util.LinkedHashMap; |
|
9 |
import java.util.LinkedHashSet; |
|
10 |
|
|
11 |
import org.apache.commons.lang.StringUtils; |
|
12 |
import org.eclipse.core.commands.AbstractHandler; |
|
13 |
import org.eclipse.core.commands.ExecutionEvent; |
|
14 |
import org.eclipse.core.commands.ExecutionException; |
|
15 |
import org.eclipse.core.runtime.IProgressMonitor; |
|
16 |
import org.eclipse.core.runtime.IStatus; |
|
17 |
import org.eclipse.core.runtime.Status; |
|
18 |
import org.eclipse.jface.viewers.ISelection; |
|
19 |
import org.eclipse.jface.viewers.IStructuredSelection; |
|
20 |
import org.eclipse.ui.IWorkbenchWindow; |
|
21 |
import org.eclipse.ui.handlers.HandlerUtil; |
|
22 |
import org.kohsuke.args4j.Option; |
|
23 |
import org.txm.Toolbox; |
|
24 |
import org.txm.core.preferences.TBXPreferences; |
|
25 |
import org.txm.core.preferences.TXMPreferences; |
|
26 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
|
27 |
import org.txm.rcp.utils.JobHandler; |
|
28 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
|
29 |
import org.txm.searchengine.cqp.corpus.Corpus; |
|
30 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
31 |
import org.txm.searchengine.cqp.corpus.Property; |
|
32 |
import org.txm.stat.utils.ConsoleProgressBar; |
|
33 |
import org.txm.utils.DeleteDir; |
|
34 |
import org.txm.utils.io.FileCopy; |
|
35 |
import org.txm.utils.io.IOUtils; |
|
36 |
import org.txm.utils.Tuple; |
|
37 |
|
|
38 |
/** |
|
39 |
* Our sample handler extends AbstractHandler, an IHandler base class. |
|
40 |
* @see org.eclipse.core.commands.IHandler |
|
41 |
* @see org.eclipse.core.commands.AbstractHandler |
|
42 |
*/ |
|
43 |
public class RemoveProperties extends AbstractHandler { |
|
44 |
|
|
45 |
@Option(name="propertiesList", usage="The properties to remove", widget="String", required=true, def="plemma") |
|
46 |
public String propertiesList = null; |
|
47 |
|
|
48 |
/** |
|
49 |
* |
|
50 |
*/ |
|
51 |
public Object execute(ExecutionEvent event) throws ExecutionException { |
|
52 |
Corpus corpus = null; |
|
53 |
IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event); |
|
54 |
|
|
55 |
ISelection isel = window.getActivePage().getSelection(); |
|
56 |
if (isel instanceof IStructuredSelection) { |
|
57 |
IStructuredSelection sel = (IStructuredSelection)isel; |
|
58 |
Object first = sel.getFirstElement(); |
|
59 |
if (first instanceof Corpus) { |
|
60 |
corpus = (Corpus)first; |
|
61 |
if (ParametersDialog.open(this)) { |
|
62 |
LinkedHashSet<String> propertiesSet = new LinkedHashSet<String>(); |
|
63 |
propertiesSet.addAll(Arrays.asList(propertiesList.split(","))); |
|
64 |
|
|
65 |
apply(corpus, propertiesSet); |
|
66 |
return corpus; |
|
67 |
} |
|
68 |
} |
|
69 |
} |
|
70 |
|
|
71 |
System.out.println("Wrong selection."); |
|
72 |
return null; |
|
73 |
} |
|
74 |
|
|
75 |
public static void apply(final Corpus corpus, final HashSet<String> propertiesSet) { |
|
76 |
final MainCorpus mainCorpus = corpus.getMainCorpus(); |
|
77 |
final File corpusBinaryDirectory = mainCorpus.getBaseDirectory(); |
|
78 |
final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName()); |
|
79 |
|
|
80 |
if (!txmDirectory.exists()) { |
|
81 |
System.out.println("Can't process a corpus with no XML-TXM files directory: "+txmDirectory); |
|
82 |
return; |
|
83 |
} |
|
84 |
|
|
85 |
final File[] files = txmDirectory.listFiles(new FileFilter() { |
|
86 |
@Override |
|
87 |
public boolean accept(File file) { |
|
88 |
return file.isFile() && file.canWrite() && file.getName().endsWith(".xml"); |
|
89 |
} |
|
90 |
}); |
|
91 |
|
|
92 |
if (files == null || files.length == 0) { |
|
93 |
System.out.println("Can't process a corpus with no XML-TXM files in "+txmDirectory); |
|
94 |
return; |
|
95 |
} |
|
96 |
|
|
97 |
System.out.println("Removing "+propertiesSet+" to "+mainCorpus+" XML-TXM files..."); |
|
98 |
JobHandler job = new JobHandler("Removing "+propertiesSet+" to "+mainCorpus+" XML-TXM files.") { |
|
99 |
@Override |
|
100 |
protected IStatus run(IProgressMonitor monitor) { |
|
101 |
this.runInit(monitor); |
|
102 |
LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = new LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>>(); |
|
103 |
HashSet<String> no_change_rules = new HashSet<String>(); |
|
104 |
try { |
|
105 |
// save previous version of XML-TXM files |
|
106 |
File previousXMLTXMDirectory = new File(txmDirectory.getAbsolutePath()+"_previous"); |
|
107 |
DeleteDir.deleteDirectory(previousXMLTXMDirectory); |
|
108 |
FileCopy.copyFiles(txmDirectory, previousXMLTXMDirectory); |
|
109 |
|
|
110 |
// work |
|
111 |
File noMatchsFile = new File(TXMPreferences.getString(TBXPreferences.USER_TXM_HOME, TBXPreferences.PREFERENCES_NODE), "results/nomatch.txt"); |
|
112 |
HashSet<String> noMatchsSet = new HashSet<String>(); |
|
113 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.length); |
|
114 |
for (File xmlFile : files) { |
|
115 |
cpb.tick(); |
|
116 |
XMLRemoveProperties p = new XMLRemoveProperties(xmlFile, propertiesSet); |
|
117 |
File tmpFile = new File(xmlFile.getParentFile(), "tmp_"+xmlFile.getName()); |
|
118 |
if (p.process(tmpFile)) { |
|
119 |
if (xmlFile.delete() && tmpFile.renameTo(xmlFile)) { |
|
120 |
// ok |
|
121 |
} else { |
|
122 |
System.out.println("Error during properties removal: can't replace XML-TXM file: "+xmlFile); |
|
123 |
return Status.CANCEL_STATUS; |
|
124 |
} |
|
125 |
} else { |
|
126 |
System.out.println("Error during properties removal. Aborting."); |
|
127 |
return Status.CANCEL_STATUS; |
|
128 |
} |
|
129 |
} |
|
130 |
|
|
131 |
cpb.done(); |
|
132 |
monitor.worked(50); |
|
133 |
|
|
134 |
} catch (Exception e) { |
|
135 |
e.printStackTrace(); |
|
136 |
return Status.CANCEL_STATUS; |
|
137 |
} |
|
138 |
System.out.println("Done."); |
|
139 |
return Status.OK_STATUS; |
|
140 |
} |
|
141 |
}; |
|
142 |
job.schedule(); |
|
143 |
} |
|
144 |
} |
|
0 | 145 |
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/XMLRemoveProperties.java (revision 460) | ||
---|---|---|
1 |
package org.txm.treetagger.commands; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.io.IOException; |
|
5 |
import java.util.HashSet; |
|
6 |
import java.util.LinkedHashMap; |
|
7 |
import java.util.regex.Pattern; |
|
8 |
|
|
9 |
import javax.xml.stream.XMLStreamException; |
|
10 |
|
|
11 |
import org.txm.importer.StaxIdentityParser; |
|
12 |
|
|
13 |
/** |
|
14 |
* Remove XML-TXM file ana elements which 'type' attribute value is in a set |
|
15 |
* |
|
16 |
* @author mdecorde |
|
17 |
* |
|
18 |
*/ |
|
19 |
public class XMLRemoveProperties extends StaxIdentityParser { |
|
20 |
|
|
21 |
// form -> pos -> source -> lemma |
|
22 |
protected HashSet<String> propertiesSet = null; |
|
23 |
|
|
24 |
/** |
|
25 |
* |
|
26 |
* @param infile the XML-TXM file to process |
|
27 |
* @param propertiesSet the set of ana@type attributes to remove |
|
28 |
* @throws XMLStreamException |
|
29 |
* @throws IOException |
|
30 |
*/ |
|
31 |
public XMLRemoveProperties(File infile, HashSet<String> propertiesSet) throws IOException, XMLStreamException { |
|
32 |
super(infile); |
|
33 |
this.propertiesSet = new HashSet<String>(); |
|
34 |
for (String property : propertiesSet) { |
|
35 |
// the XML-TXM files word properties name starts with # (they are references) |
|
36 |
if (!property.startsWith("#")) property = "#"+property; |
|
37 |
this.propertiesSet.add(property); |
|
38 |
} |
|
39 |
} |
|
40 |
|
|
41 |
boolean inW = false, inAna = false; |
|
42 |
String typeName = null; |
|
43 |
@Override |
|
44 |
public void processStartElement() throws XMLStreamException, IOException { |
|
45 |
|
|
46 |
if (localname.equals("w")) { |
|
47 |
inW = true; |
|
48 |
} else if (inW && localname.equals("ana")) { |
|
49 |
inAna = true; |
|
50 |
typeName = parser.getAttributeValue(null, "type"); |
|
51 |
if (propertiesSet.contains(typeName)) return; // don't write this element start tag |
|
52 |
} |
|
53 |
|
|
54 |
super.processStartElement(); |
|
55 |
} |
|
56 |
|
|
57 |
@Override |
|
58 |
public void processCharacters() throws XMLStreamException { |
|
59 |
|
|
60 |
if (inW && typeName != null && propertiesSet.contains(typeName)) {return;} // don't write the element content |
|
61 |
super.processCharacters(); |
|
62 |
} |
|
63 |
|
|
64 |
@Override |
|
65 |
public void processEndElement() throws XMLStreamException { |
|
66 |
if (localname.equals("w")) { |
|
67 |
inW = false; |
|
68 |
} else if (inW && localname.equals("ana")) { |
|
69 |
inAna = false; |
|
70 |
if (propertiesSet.contains(typeName)) {typeName = null; return;} // don't write the element end tag |
|
71 |
typeName = null; |
|
72 |
} |
|
73 |
|
|
74 |
super.processEndElement(); // don't write W content now |
|
75 |
} |
|
76 |
|
|
77 |
public static void main(String args[]) { |
|
78 |
File xmlFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test.xml"); |
|
79 |
File tmpFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test-o.xml"); |
|
80 |
String posProperty = "type"; |
|
81 |
String newType = "lemma"; |
|
82 |
LinkedHashMap<Pattern[], String> rules = new LinkedHashMap<Pattern[], String>(); |
|
83 |
rules.put(new Pattern[]{Pattern.compile("w"), Pattern.compile("w")}, "WORD"); |
|
84 |
rules.put(new Pattern[]{Pattern.compile("x.+"), Pattern.compile("w")}, "XWORD"); |
|
85 |
rules.put(new Pattern[]{Pattern.compile("y"), Pattern.compile("w")}, "YWORD"); |
|
86 |
rules.put(new Pattern[]{Pattern.compile("y.*"), Pattern.compile("w")}, "YMULTIWORD"); |
|
87 |
//XMLPropertyProjection converter = new XMLPropertyProjection(xmlFile, rules, posProperty, newType); |
|
88 |
//System.out.println(converter.process(tmpFile)); |
|
89 |
} |
|
90 |
} |
|
0 | 91 |
Formats disponibles : Unified diff