Révision 460

tmp/org.txm.treetagger.rcp/.project (revision 460)
1
<?xml version="1.0" encoding="UTF-8"?>
2
<!-- Eclipse project description of the "TreeTagger" plug-in project. -->
<projectDescription>
3
	<name>TreeTagger</name>
4
	<comment></comment>
5
	<!-- No referenced projects are declared. -->
	<projects>
6
	</projects>
7
	<!-- Builders declared for this project: Java builder, then the PDE manifest and schema builders. -->
	<buildSpec>
8
		<buildCommand>
9
			<name>org.eclipse.jdt.core.javabuilder</name>
10
			<arguments>
11
			</arguments>
12
		</buildCommand>
13
		<buildCommand>
14
			<name>org.eclipse.pde.ManifestBuilder</name>
15
			<arguments>
16
			</arguments>
17
		</buildCommand>
18
		<buildCommand>
19
			<name>org.eclipse.pde.SchemaBuilder</name>
20
			<arguments>
21
			</arguments>
22
		</buildCommand>
23
	</buildSpec>
24
	<!-- Project natures: PDE plug-in nature and JDT Java nature. -->
	<natures>
25
		<nature>org.eclipse.pde.PluginNature</nature>
26
		<nature>org.eclipse.jdt.core.javanature</nature>
27
	</natures>
28
</projectDescription>
0 29

  
tmp/org.txm.treetagger.rcp/src/treetagger/Activator.java (revision 460)
1
package treetagger;
2

  
3
import org.eclipse.jface.resource.ImageDescriptor;
4
import org.eclipse.ui.plugin.AbstractUIPlugin;
5
import org.osgi.framework.BundleContext;
6

  
7
/**
8
 * The activator class controls the plug-in life cycle
9
 */
10
public class Activator extends AbstractUIPlugin {
11

  
12
	// The plug-in ID
13
	public static final String PLUGIN_ID = "TreeTagger"; //$NON-NLS-1$
14

  
15
	// The shared instance
16
	private static Activator plugin;
17
	
18
	/**
19
	 * The constructor
20
	 */
21
	public Activator() {
22
	}
23

  
24
	/*
25
	 * (non-Javadoc)
26
	 * @see org.eclipse.ui.plugin.AbstractUIPlugin#start(org.osgi.framework.BundleContext)
27
	 */
28
	public void start(BundleContext context) throws Exception {
29
		super.start(context);
30
		plugin = this;
31
	}
32

  
33
	/*
34
	 * (non-Javadoc)
35
	 * @see org.eclipse.ui.plugin.AbstractUIPlugin#stop(org.osgi.framework.BundleContext)
36
	 */
37
	public void stop(BundleContext context) throws Exception {
38
		plugin = null;
39
		super.stop(context);
40
	}
41

  
42
	/**
43
	 * Returns the shared instance
44
	 *
45
	 * @return the shared instance
46
	 */
47
	public static Activator getDefault() {
48
		return plugin;
49
	}
50

  
51
	/**
52
	 * Returns an image descriptor for the image file at the given
53
	 * plug-in relative path
54
	 *
55
	 * @param path the path
56
	 * @return the image descriptor
57
	 */
58
	public static ImageDescriptor getImageDescriptor(String path) {
59
		return imageDescriptorFromPlugin(PLUGIN_ID, path);
60
	}
61
}
0 62

  
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/Train.java (revision 460)
1
package org.txm.treetagger.commands;
2

  
3
import java.io.BufferedOutputStream;
4
import java.io.BufferedReader;
5
import java.io.BufferedWriter;
6
import java.io.File;
7
import java.io.FileOutputStream;
8
import java.io.OutputStreamWriter;
9
import java.io.PrintStream;
10
import java.io.PrintWriter;
11
import java.util.ArrayList;
12
import java.util.Arrays;
13
import java.util.HashMap;
14
import java.util.HashSet;
15
import java.util.LinkedHashMap;
16
import java.util.LinkedHashSet;
17
import java.util.List;
18

  
19
import org.eclipse.core.commands.AbstractHandler;
20
import org.eclipse.core.commands.ExecutionEvent;
21
import org.eclipse.core.commands.ExecutionException;
22
import org.eclipse.core.runtime.IProgressMonitor;
23
import org.eclipse.core.runtime.IStatus;
24
import org.eclipse.core.runtime.Status;
25
import org.eclipse.jface.viewers.ISelection;
26
import org.eclipse.jface.viewers.IStructuredSelection;
27
import org.eclipse.ui.IWorkbenchWindow;
28
import org.eclipse.ui.handlers.HandlerUtil;
29
import org.kohsuke.args4j.Option;
30
import org.txm.core.preferences.TBXPreferences;
31
import org.txm.core.preferences.TXMPreferences;
32
import org.txm.index.core.functions.Index;
33
import org.txm.index.core.functions.Line;
34
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
35
import org.txm.rcp.utils.JobHandler;
36
import org.txm.searchengine.cqp.AbstractCqiClient;
37
import org.txm.searchengine.cqp.CQPEngine;
38
import org.txm.searchengine.cqp.corpus.Corpus;
39
import org.txm.searchengine.cqp.corpus.MainCorpus;
40
import org.txm.searchengine.cqp.corpus.Property;
41
import org.txm.searchengine.cqp.corpus.query.Match;
42
import org.txm.searchengine.cqp.corpus.query.Query;
43
import org.txm.utils.DeleteDir;
44
import org.txm.utils.io.IOUtils;
45
import org.txm.utils.logger.Log;
46
import org.txm.utils.treetagger.TreeTagger;
47

  
48
/**
49
 * Our sample handler extends AbstractHandler, an IHandler base class.
50
 * @see org.eclipse.core.commands.IHandler
51
 * @see org.eclipse.core.commands.AbstractHandler
52
 */
53
public class Train extends AbstractHandler {
54

  
55
	public Corpus corpus = null;
56

  
57
	@Option(name="model", usage="The model file to create", widget="CreateFile", required=true, def="fr.par")
58
	public File model = null;
59
	@Option(name="posProperty", usage="The pos property", widget="String", required=true, def="frpos")
60
	public String posProperty = null;
61
	@Option(name="sentenceTag", usage="The pos property", widget="String", required=true, def="SENT")
62
	public String sentenceTag = null;
63
	@Option(name="lemmaProperty", usage="The lemma property", widget="String", required=true, def="frlemma")
64
	public String lemmaProperty = null;
65
	@Option(name="lexique", usage="Lexicon file", widget="File", required=true, def="lexicon.txt")
66
	public File lexique = null;
67
	@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="")
68
	public String options = null;
69

  
70
	/**
71
	 * 
72
	 */
73
	public Object execute(ExecutionEvent event) throws ExecutionException {
74

  
75

  
76
		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
77

  
78
		ISelection isel = window.getActivePage().getSelection();
79
		if (isel instanceof IStructuredSelection) {
80
			IStructuredSelection sel = (IStructuredSelection)isel;
81
			Object first = sel.getFirstElement();
82
			if (first instanceof Corpus) {
83
				corpus = (Corpus)first;
84
				if (ParametersDialog.open(this)) {
85

  
86
					train(corpus, model, lexique, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split("  "));
87

  
88
					return corpus;
89
				}
90
			}
91
		}
92

  
93
		System.out.println("Wrong selection.");
94
		return null;
95
	}
96

  
97
	public static void train(final Corpus corpus, final File model, final File lexique, final String[] properties, final String sentenceTag, final String[] options) {
98

  
99
		JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") {
100
			@Override
101
			protected IStatus run(IProgressMonitor monitor) {
102
				this.runInit(monitor);
103
				try {
104
					File lexique2 = lexique;
105
					MainCorpus mainCorpus = corpus.getMainCorpus();
106
					File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
107
					
108
					System.out.println("TRAIN : "+corpus+" with "+lexique2+" to create "+model+ " with properties "+Arrays.toString(properties));
109

  
110
					if (properties == null || properties.length != 2) {
111
						System.out.println("Error can't continue with selected word properties: "+Arrays.toString(properties));
112
						return Status.CANCEL_STATUS;
113
					}
114

  
115
					for (String p : properties) {
116
						Property prop = corpus.getProperty(p);
117
						if (prop == null) {
118
							System.out.println("Missing property in corpus: "+p);
119
							return Status.CANCEL_STATUS;
120
						}
121
					}
122
					
123
					Property pos = corpus.getProperty(properties[0]);
124
					Property lemma = corpus.getProperty(properties[1]);
125
					
126
					// Prepare temporary directory
127
					File treetaggerSrcDirectory = new File(mainCorpus.getBaseDirectory(), "treetagger");
128
					DeleteDir.deleteDirectory(treetaggerSrcDirectory);
129
					treetaggerSrcDirectory.mkdirs();
130
					
131
					HashMap<String, HashSet<String>> simplified_lexicon = null;
132
					HashMap<String, HashSet<String>> simplified_lexicon_errors = null;
133
					int error_counter = 0;
134
					// Create Lexicon file from an Index
135
					if (lexique2 == null || !lexique2.exists()) {
136
						System.out.println("Warning: no lexicon file or given lexicon file does not exist ("+lexique2+"). Using corpus Index...");
137

  
138
						File lexiconfile = new File(treetaggerSrcDirectory, "lexicon.txt");
139
						List<Property> corpusProperties = new ArrayList<Property>();
140
						corpusProperties.add(mainCorpus.getProperty("word"));
141
						for (String p : properties) {
142
							Property prop = mainCorpus.getProperty(p);
143
							if (prop == null) {
144
								System.out.println("Error, a property is missing: "+p);
145
								return Status.CANCEL_STATUS;
146
							}
147
							corpusProperties.add(prop);
148
						}
149
						Index index = new Index(mainCorpus, new Query("[]"), corpusProperties);
150
						List<Line> lines = index.getAllLines();
151
						LinkedHashMap<String, ArrayList<String>> lex = new LinkedHashMap<String, ArrayList<String>>();
152
						HashMap<String, HashSet<String>> allPosValues = new HashMap<String, HashSet<String>>();
153
						for (Line l : lines) {
154
							List<List<String>> values = l.getUnitsProperties();
155
							String form = values.get(0).get(0);
156
							if (!lex.containsKey(form)) {
157
								ArrayList<String> pairs = new ArrayList<String>();
158
								HashSet<String> posValues = new HashSet<String>();
159
								
160
								allPosValues.put(form, posValues);
161
								lex.put(form, pairs);
162
							}
163
							ArrayList<String> pairs = lex.get(form);
164
							HashSet<String> posValues = allPosValues.get(form);
165
							String posValue = values.get(1).get(0);
166
							String lemmaValue = values.get(2).get(0);
167
							if (posValues.contains(posValue)) {
168
								
169
							} else {
170
								posValues.add(posValue);
171
								pairs.add(posValue);
172
								pairs.add(lemmaValue);
173
							}
174
						}
175

  
176
						BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(lexiconfile), "UTF-8"));
177
						for (String form : lex.keySet()) {
178

  
179
							writer.write(form);
180
							boolean tab = true;
181
							for (String v : lex.get(form)) {
182
								if (tab) writer.write("\t"+v);
183
								else writer.write(" "+v);
184
								
185
								tab = !tab;
186
							}
187
							writer.write("\n");
188
						}
189
						writer.close();
190
						lexique2 = lexiconfile;
191
					} else { // diagnose lexicon content
192
						simplified_lexicon = new HashMap<String, HashSet<String>>();
193
						simplified_lexicon_errors = new HashMap<String, HashSet<String>>();
194
						BufferedReader reader = IOUtils.getReader(lexique2);
195
						String line = reader.readLine();
196
						while (line != null) {
197
							String[] split = line.split("\t", 2);
198
							HashSet<String> posValues = new HashSet<String>();
199
							simplified_lexicon.put(split[0], posValues);
200
							for (String poslemme : split[1].split("\t")) {
201
								String[] split2 = poslemme.split(" ", 2);
202
								posValues.add(split2[0]);
203
							}
204
							line = reader.readLine();
205
						}
206
						reader.close();
207
					}
208
					
209
					
210
					// create TT SRC file from CWB indexes
211
					
212
					File ttSrcFile = new File(treetaggerSrcDirectory, mainCorpus.getName()+".tt");
213
					System.out.println("TT SRC file: "+ttSrcFile.getAbsolutePath());
214
					BufferedOutputStream fos = new BufferedOutputStream(new FileOutputStream(ttSrcFile));
215
					PrintStream ps = new PrintStream(fos);
216
					LinkedHashSet<Integer> positions = new LinkedHashSet<Integer>();
217
					Property word = corpus.getProperty("word");
218
					AbstractCqiClient CQI = CQPEngine.getCqiClient();
219
					for (Match m : corpus.getMatches()) {
220
						for (int i = m.getStart() ; i <= m.getEnd() ; i++) { // end match must be included
221
							positions.add(i);
222
							
223
							if (positions.size() >= 1000) { // avoid too big array
224
								int[] positions_array = new int[positions.size()];
225
								int ip = 0;
226
								for (int p : positions) positions_array[ip++] = p;
227
								String[] words = CQI.cpos2Str(word.getQualifiedName(), positions_array);
228
								String[] values = CQI.cpos2Str(pos.getQualifiedName(), positions_array);
229

  
230
								for (int iW = 0 ; iW < words.length ; iW++) {
231
									String w = words[iW];
232
									if (w != null) {
233
										String s = w+"\t"+values[iW];
234
										ps.println(s);
235
										
236
										if (simplified_lexicon != null) { // check given lexicon
237
											if (simplified_lexicon.containsKey(w)) {
238
												if (!simplified_lexicon.get(w).contains(values[iW])) {
239
													//System.out.println("Lexicon error: cannot find pos="+values[iW]+" for form="+w);
240
													if (!simplified_lexicon_errors.containsKey(w)) simplified_lexicon_errors.put(w, new HashSet<String>());
241
													HashSet<String> error_values = simplified_lexicon_errors.get(w);
242
													error_values.add(values[iW]);
243
													error_counter++;
244
												}
245
											} else {
246
												//System.out.println("Lexicon error: cannot find form="+w);
247
												if (!simplified_lexicon_errors.containsKey(w)) simplified_lexicon_errors.put(w, new HashSet<String>());
248
												HashSet<String> error_values = simplified_lexicon_errors.get(w);
249
												error_values.add("#"+values[iW]);
250
												error_counter++;
251
											}
252
										}
253
									}
254
								}
255
								positions.clear();
256
							}
257
						}
258
					}
259
					if (positions.size() > 0) { // write last words
260
						int[] positions_array = new int[positions.size()];
261
						int ip = 0;
262
						for (int p : positions) positions_array[ip++] = p;
263
						String[] words = CQI.cpos2Str(word.getQualifiedName(), positions_array);
264
						String[] values = CQI.cpos2Str(pos.getQualifiedName(), positions_array);
265

  
266
						for (int iW = 0 ; iW < words.length ; iW++) {
267
							String w = words[iW];
268
							if (w != null) {
269
								String s = w+"\t"+values[iW];
270
								ps.println(s);
271
							}
272
						}
273
						positions.clear();
274
					}
275
					ps.close();
276
					
277
					if (simplified_lexicon_errors != null && simplified_lexicon_errors.size() > 0) {
278
						File error_file = new File(treetaggerSrcDirectory, "errors.txt");
279
						PrintWriter errorwriter = IOUtils.getWriter(error_file);
280
						int c = 0;
281
						System.out.println("Warning, lexicon errors ("+error_counter+") found with words:");
282
						for (String w : simplified_lexicon_errors.keySet()) {
283
							errorwriter.println(w+"="+simplified_lexicon_errors.get(w));
284
							if (c < 10) {
285
								System.out.println(w+"="+simplified_lexicon_errors.get(w));
286
								c++;
287
								if (c == 10) System.out.println("... errors display is trucated, see "+error_file.getAbsolutePath());
288
							}
289
						}
290
						errorwriter.close();
291
						//System.out.println("Cannot apply train-treetagger if lexicon is missing words and pos.");
292
						//return Status.CANCEL_STATUS;
293
						File lexique3 = new File(lexique2.getParentFile(), lexique2.getName()+".fix");
294
						BufferedReader reader = IOUtils.getReader(lexique2);
295
						PrintWriter writer = IOUtils.getWriter(lexique3);
296
						String line = reader.readLine();
297
						while (line != null) {
298
							String w = line.split("\t", 2)[0];
299
							
300
							if (simplified_lexicon_errors.containsKey(w)) {
301
								for (String p : simplified_lexicon_errors.get(w)) {
302
									if (!p.startsWith("#"))
303
										line += ("\t"+p+" <no_lemma>"); // append missing value
304
								}
305
								simplified_lexicon_errors.remove(w);
306
							}
307
							
308
							writer.println(line);
309
							line = reader.readLine();
310
						}
311
						
312
						// write missing words
313
						for (String w2 : simplified_lexicon_errors.keySet()) {
314
							writer.print(w2);
315
							for (String p : simplified_lexicon_errors.get(w2)) {
316
								writer.print("\t"+p+" <no_lemma>");
317
							}
318
							writer.println("");
319
						}
320
						
321
						reader.close();
322
						writer.close();
323
						System.out.println("Adding words to a temporary lexicon: "+lexique3);
324
						lexique2 = lexique3;
325
					}
326
					
327
					// Create open class file : contains all pos values
328
					File openclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
329
					PrintWriter openClassFileWriter = IOUtils.getWriter(openclassfile);
330
					
331
//					Lexicon poslexicon = corpus.getLexicon(pos);
332
//					String[] posValues = poslexicon.getForms();
333
//					for (int iV = 0 ; iV < posValues.length ; iV++) {
334
//						if (iV == 0) openClassFileWriter.print(posValues[iV]);
335
//						else openClassFileWriter.print(" "+posValues[iV]);
336
//					}
337
					openClassFileWriter.close();
338

  
339
					// Call treetagger-train
340
					if (ttSrcFile.exists() && lexique2.exists() && openclassfile.exists()) {
341
						System.out.println("Running ");
342
						String treetaggerBinDirectory = new File(TXMPreferences.getString(TBXPreferences.TREETAGGER_INSTALL_PATH, TBXPreferences.PREFERENCES_NODE), "bin").getAbsolutePath();
343
						if (!treetaggerBinDirectory.endsWith("/")) treetaggerBinDirectory += "/";
344

  
345
						TreeTagger tt = new TreeTagger(treetaggerBinDirectory, options);
346
						tt.settoken();
347
						tt.setquiet();
348
						tt.setlemma();
349
						tt.setsgml();
350
						tt.setst(sentenceTag);
351
						tt.setproto();
352
						tt.setutf8();
353
						tt.debug(true);
354
						tt.traintreetagger(lexique2.getAbsolutePath(), openclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
355
						
356
						System.out.println("Done: "+model.getAbsolutePath());
357
					} else {
358
						System.out.println("Aborting.");
359
					}
360
					
361
					return Status.OK_STATUS;
362
				} catch (Exception e) {
363
					System.out.println("Error while training TT: "+e);
364
					Log.printStackTrace(e);
365
				}
366
				return Status.CANCEL_STATUS;
367
			}
368
		};
369
		job.schedule();
370
	}
371
}
0 372

  
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/Apply.java (revision 460)
1
package org.txm.treetagger.commands;
2

  
3
import java.io.File;
4
import java.io.FileFilter;
5
import java.util.Arrays;
6
import java.util.HashMap;
7

  
8
import org.eclipse.core.commands.AbstractHandler;
9
import org.eclipse.core.commands.ExecutionEvent;
10
import org.eclipse.core.commands.ExecutionException;
11
import org.eclipse.core.runtime.IProgressMonitor;
12
import org.eclipse.core.runtime.IStatus;
13
import org.eclipse.core.runtime.Status;
14
import org.eclipse.jface.viewers.ISelection;
15
import org.eclipse.jface.viewers.IStructuredSelection;
16
import org.eclipse.ui.IWorkbenchWindow;
17
import org.eclipse.ui.handlers.HandlerUtil;
18
import org.kohsuke.args4j.Option;
19
import org.txm.rcp.commands.workspace.UpdateCorpus;
20
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
21
import org.txm.rcp.utils.JobHandler;
22
import org.txm.scripts.teitxm.Annotate;
23
import org.txm.searchengine.cqp.corpus.Corpus;
24
import org.txm.searchengine.cqp.corpus.MainCorpus;
25

  
26
/**
27
 * Our sample handler extends AbstractHandler, an IHandler base class.
28
 * @see org.eclipse.core.commands.IHandler
29
 * @see org.eclipse.core.commands.AbstractHandler
30
 */
31
public class Apply extends AbstractHandler {
32
	
33
	@Option(name="model", usage="Model file", widget="File", required=true, def="model.par")
34
	public File model = null;
35
	@Option(name="posProperty", usage="The pos property", widget="String", required=true, def="frpos")
36
	public String posProperty = null;
37
	@Option(name="lemmaProperty", usage="The lemma property", widget="String", required=true, def="frlemma")
38
	public String lemmaProperty = null;
39
	@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="")
40
	public String options = null;
41

  
42
	/**
43
	 * 
44
	 */
45
	public Object execute(ExecutionEvent event) throws ExecutionException {
46
		Corpus corpus = null;
47
		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
48
		
49
		ISelection isel = window.getActivePage().getSelection();
50
		if (isel instanceof IStructuredSelection) {
51
			IStructuredSelection sel = (IStructuredSelection)isel;
52
			Object first = sel.getFirstElement();
53
			if (first instanceof Corpus) {
54
				corpus = (Corpus)first;
55
				if (ParametersDialog.open(this)) {
56
					apply(corpus, model, new String[]{posProperty, lemmaProperty}, options.split("  "));
57
					return corpus;
58
				}
59
			}
60
		}
61
		
62
		System.out.println("Wrong selection.");
63
		return null;
64
	}
65
	
66
	public static void apply(Corpus corpus, final File model, final String[] properties, final String[] options) {
67
		final MainCorpus mainCorpus = corpus.getMainCorpus();
68
		final File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
69
		final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName());
70
		
71
		if (!txmDirectory.exists()) {
72
			System.out.println("Can't apply TreeTagger to a corpus with no XML-TXM files.");
73
		}
74
		
75
		final File[] files = txmDirectory.listFiles(new FileFilter() {
76
			@Override
77
			public boolean accept(File file) {
78
				return file.isFile() && file.canWrite() && file.getName().endsWith(".xml");
79
			}
80
		});
81
		
82
		if (files == null || files.length == 0) {
83
			System.out.println("Can't apply TreeTagger to a corpus with no XML-TXM files in "+txmDirectory);
84
		}
85
		
86
		String lang = model.getName();
87
		if (!lang.endsWith(".par")) {
88
			System.out.println("Model file name must ends with the '.par' extension");
89
			return;
90
		}
91
		lang = lang.substring(0, lang.indexOf(".par"));
92
		
93
		final HashMap<String, String> hash = new HashMap<String, String>();
94
		for (File txmFile : files) {
95
			hash.put(txmFile.getName(), lang);
96
		}
97
			
98
		for (int i = 0 ; i < properties.length ; i++) properties[i] = properties[i].trim(); 
99
		
100
		System.out.println("APPLY : "+model+" to "+corpus+" updating "+Arrays.toString(properties)+ " with options "+Arrays.toString(options));
101
		JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") {
102
			@Override
103
			protected IStatus run(IProgressMonitor monitor) {
104
				this.runInit(monitor);
105
				Annotate annotator = new Annotate();
106
				annotator.setModelsDirectory(model.getParentFile());
107
				annotator.setDebug();
108
				if (!annotator.run(corpusBinaryDirectory, txmDirectory, hash, true, properties, options)) {
109
					System.out.println("Fail to apply TreeTagger with "+txmDirectory+" files.");
110
					return Status.CANCEL_STATUS;
111
				}
112
				System.out.println("Done. Updating corpus...");
113
				
114
				if (UpdateCorpus.update(mainCorpus) == null) {
115
					System.out.println("Fail to update corpus indexes and editions.");
116
				}
117
				System.out.println("Done.");
118
				return Status.OK_STATUS;//frppos
119
			}
120
		};
121
		job.schedule();
122
	}
123
}
0 124

  
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/LemmaProjection.java (revision 460)
1
package org.txm.treetagger.commands;
2

  
3
import java.io.BufferedReader;
4
import java.io.File;
5
import java.io.FileFilter;
6
import java.io.PrintWriter;
7
import java.util.Arrays;
8
import java.util.Collections;
9
import java.util.HashSet;
10
import java.util.LinkedHashMap;
11
import java.util.LinkedHashSet;
12

  
13
import org.apache.commons.lang.StringUtils;
14
import org.eclipse.core.commands.AbstractHandler;
15
import org.eclipse.core.commands.ExecutionEvent;
16
import org.eclipse.core.commands.ExecutionException;
17
import org.eclipse.core.runtime.IProgressMonitor;
18
import org.eclipse.core.runtime.IStatus;
19
import org.eclipse.core.runtime.Status;
20
import org.eclipse.jface.viewers.ISelection;
21
import org.eclipse.jface.viewers.IStructuredSelection;
22
import org.eclipse.ui.IWorkbenchWindow;
23
import org.eclipse.ui.handlers.HandlerUtil;
24
import org.kohsuke.args4j.Option;
25
import org.txm.Toolbox;
26
import org.txm.core.preferences.TBXPreferences;
27
import org.txm.core.preferences.TXMPreferences;
28
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
29
import org.txm.rcp.utils.JobHandler;
30
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
31
import org.txm.searchengine.cqp.corpus.Corpus;
32
import org.txm.searchengine.cqp.corpus.MainCorpus;
33
import org.txm.searchengine.cqp.corpus.Property;
34
import org.txm.stat.utils.ConsoleProgressBar;
35
import org.txm.utils.DeleteDir;
36
import org.txm.utils.io.FileCopy;
37
import org.txm.utils.io.IOUtils;
38
import org.txm.utils.Tuple;
39

  
40
/**
41
 * Our sample handler extends AbstractHandler, an IHandler base class.
42
 * @see org.eclipse.core.commands.IHandler
43
 * @see org.eclipse.core.commands.AbstractHandler
44
 */
45
public class LemmaProjection extends AbstractHandler {
46

  
47
	protected static final String EXTRA = "extra";
48
	@Option(name="dictionary", usage="TSV Dictionary file with form, msd, lemma, source columns", widget="File", required=true, def="frolex.tsv")
49
	public File dictionary = null;
50
	@Option(name="extrarules", usage="form+pos rules files", widget="File", required=false, def="extrarules.tsv")
51
	public File extrarules = null;
52
	@Option(name="posproperty", usage="The lexicon property to read", widget="String", required=true, def="frpos")
53
	public String posproperty = null;
54
	@Option(name="lemmaproperty", usage="The property to create/update in the corpus", widget="String", required=true, def="plemma")
55
	public String lemmaproperty = null;
56
	@Option(name="formAsLemmaPosList", usage="Pos values lemma exceptions", widget="String", required=false, def="NOMPro")
57
	public String formAsLemmaPosList = null;
58
	@Option(name="sourcePriorityList", usage="The property to create/update in the corpus", widget="String", required=true, def="TL")
59
	public String sourcePriorityList = null;
60

  
61
	/**
62
	 * 
63
	 */
64
	public Object execute(ExecutionEvent event) throws ExecutionException {
65
		Corpus corpus = null;
66
		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
67

  
68
		ISelection isel = window.getActivePage().getSelection();
69
		if (isel instanceof IStructuredSelection) {
70
			IStructuredSelection sel = (IStructuredSelection)isel;
71
			Object first = sel.getFirstElement();
72
			if (first instanceof Corpus) {
73
				corpus = (Corpus)first;
74
				if (ParametersDialog.open(this)) {
75
					LinkedHashSet<String> formAsLemmaPosSet = new LinkedHashSet<String>();
76
					formAsLemmaPosSet.addAll(Arrays.asList(formAsLemmaPosList.split(",")));
77
					LinkedHashSet<String> sourcePrioritySet = new LinkedHashSet<String>();
78
					if (extrarules != null && extrarules.exists()) sourcePrioritySet.add(EXTRA); // extra must be the first source
79
					sourcePrioritySet.addAll(Arrays.asList(sourcePriorityList.split(",")));
80

  
81
					System.out.println("formAsLemmaPosSet="+formAsLemmaPosSet);
82
					System.out.println("sourcePrioritySet="+sourcePrioritySet);
83
					apply(corpus, dictionary, extrarules, posproperty, lemmaproperty, formAsLemmaPosSet, sourcePrioritySet);
84
					return corpus;
85
				}
86
			}
87
		}
88

  
89
		System.out.println("Wrong selection.");
90
		return null;
91
	}
92

  
93
	public static void apply(final Corpus corpus, final File dictionary, final File extrarules, final String posproperty, 
94
			final String targetproperty, final LinkedHashSet<String> formAsLemmaPosList, final LinkedHashSet<String> sourceprioritylist) {
95
		final MainCorpus mainCorpus = corpus.getMainCorpus();
96
		final File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
97
		final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName());
98

  
99
		if (!txmDirectory.exists()) {
100
			System.out.println("Can't process a corpus with no XML-TXM files directory: "+txmDirectory);
101
			return;
102
		}
103

  
104
		final File[] files = txmDirectory.listFiles(new FileFilter() {
105
			@Override
106
			public boolean accept(File file) {
107
				return file.isFile() && file.canWrite() && file.getName().endsWith(".xml");
108
			}
109
		});
110

  
111
		Property pos = null;
112
		try {
113
			pos = mainCorpus.getProperty(posproperty);
114
		} catch (CqiClientException e1) {
115
			// TODO Auto-generated catch block
116
			e1.printStackTrace();
117
		}
118
		if (pos == null) {
119
			System.out.println("No pos property found with name="+posproperty);
120
			return;
121
		}
122

  
123
		if (files == null || files.length == 0) {
124
			System.out.println("Can't process a corpus with no XML-TXM files in "+txmDirectory);
125
			return;
126
		}
127

  
128
		System.out.println("APPLYING : "+dictionary+" to "+mainCorpus+": creating/updating "+targetproperty+ " property with lexicon "+dictionary);
129
		JobHandler job = new JobHandler("Creating/Updating "+targetproperty+" property.") {
130
			@Override
131
			protected IStatus run(IProgressMonitor monitor) {
132
				this.runInit(monitor);
133
				Tuple t;
134
				LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = new LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>>();
135
				HashSet<String> formAsLemmaPosSet = new HashSet<String>();
136
				try {
137
					// load rules
138
					BufferedReader reader = IOUtils.getReader(dictionary);
139
					String line = reader.readLine();
140
					while (line != null) {
141
						String[] splitTab = line.split("\t");
142
						if (splitTab.length != 5) {
143
							System.out.println("Error in dictionary files with line='"+line+"': length is not 5. Found: "+Arrays.toString(splitTab));
144
							line = reader.readLine();
145
							reader.close();
146
							return Status.CANCEL_STATUS;
147
						}
148
						String form = splitTab[0];
149
						String pos = splitTab[1];//.replace("<no_pos>|", "").replace("|<no_pos>|", "").replace("|<no_pos>", "");
150
						String lemma = splitTab[2];//.replace("<no_lemma>|", "").replace("|<no_lemma>|", "").replace("|<no_lemma>", "");
151
						String source = splitTab[3];
152
						
153
						if (! rules.containsKey(form)) rules.put(form, new LinkedHashMap<String, LinkedHashMap<String, String>>());
154
						LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(form);
155
						
156
						if (!lemma.equals("<no_lemma>")) {
157
							if (!posHash.containsKey(pos)) posHash.put(pos, new LinkedHashMap<String, String>());
158
							LinkedHashMap<String, String> sourceHash = posHash.get(pos);
159
							
160
							sourceHash.put(source,  lemma);
161
						}
162
						line = reader.readLine();
163
					}
164
					reader.close();
165
					System.out.println("Dictionary rules loaded: "+rules.size());
166

  
167
					if (extrarules.exists()) {
168
						reader = IOUtils.getReader(extrarules);
169
						line = reader.readLine();
170
						while (line != null) {
171
							String[] splitTab = line.split("\t");
172
							if (splitTab.length != 3) {
173
								System.out.println("Error in extra rule files with line='"+line+"': length is not 3.");
174
								line = reader.readLine();
175
								continue;
176
							}
177
							String form = splitTab[0];
178
							String pos = splitTab[1];
179
							String lemma = splitTab[2];
180
							if (! rules.containsKey(form)) rules.put(form, new LinkedHashMap<String, LinkedHashMap<String, String>>());
181
							LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(form);
182

  
183
							if (!posHash.containsKey(pos)) posHash.put(pos, new LinkedHashMap<String, String>());
184
							LinkedHashMap<String, String> sourceHash = posHash.get(pos);
185
							sourceHash.put(EXTRA,  lemma);
186

  
187
						}
188
						reader.close();
189
						System.out.println("Dictionary extra rules loaded: "+rules.size());
190
					} else {
191
						System.out.println("No extra rule loaded.");
192
					}
193
					
194
					PrintWriter writer = IOUtils.getWriter("/tmp/rules.txt");
195
					for (String k : rules.keySet()) {
196
						writer.println("FORM="+k);
197
						LinkedHashMap<String, LinkedHashMap<String, String>> rules2 = rules.get(k);
198
						for (String k2 : rules2.keySet()) {
199
							writer.println(" POS="+k2);
200
							LinkedHashMap<String, String> rules3 = rules2.get(k2);
201
							for (String k3 : rules3.keySet()) {
202
								writer.println("  SOURCE="+k3);
203
								String ls2 = rules3.get(k3);
204
								writer.println("   LEMMA="+ls2);
205
							}
206
						}
207
					}
208
					writer.close();
209
					System.out.println("RULE DUMP: /tmp/rules.txt");
210
					
211
					// load rules
212
					for (String s : formAsLemmaPosList) {
213
						formAsLemmaPosSet.add(s);
214
					}
215
					System.out.println("POS exception rules loaded: "+formAsLemmaPosSet.size());
216

  
217
					// save previous version of XML-TXM files
218
					File previousXMLTXMDirectory = new File(txmDirectory.getAbsolutePath()+"_previous");
219
					DeleteDir.deleteDirectory(previousXMLTXMDirectory);
220
					FileCopy.copyFiles(txmDirectory, previousXMLTXMDirectory);
221

  
222
					// work
223
					File noMatchsFile = new File(TXMPreferences.getString(TBXPreferences.USER_TXM_HOME, TBXPreferences.PREFERENCES_NODE), "results/nomatch.txt");
224
					HashSet<String> noMatchsSet = new HashSet<String>();
225
					ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
226
					for (File xmlFile : files) {
227
						cpb.tick();
228
						XMLLemmaProjection p = new XMLLemmaProjection(xmlFile, rules, formAsLemmaPosSet, sourceprioritylist, posproperty, targetproperty);
229
						File tmpFile = new File(xmlFile.getParentFile(), "tmp_"+xmlFile.getName());
230
						if (p.process(tmpFile)) {
231
							if (xmlFile.delete() && tmpFile.renameTo(xmlFile)) {
232
								// ok
233
							} else {
234
								System.out.println("Error during lemma projection: can't replace XML-TXM file: "+xmlFile);
235
								return Status.CANCEL_STATUS;
236
							}
237
						} else {
238
							System.out.println("Error during lemma projection. Aborting.");
239
							return Status.CANCEL_STATUS;
240
						}
241
						if (p.getNoMatchValues().size() > 0) {
242
							System.out.println("No matchs found with file "+xmlFile.getName()+": "+p.getNoMatchValues());
243
							noMatchsSet.addAll(p.getNoMatchValues());
244
						}
245
					}
246

  
247
					if (noMatchsSet.size() > 0) {
248
						System.out.println("Missing lemma values report saved in: "+noMatchsFile);
249
						IOUtils.write(noMatchsFile, StringUtils.join(noMatchsSet, "\n"));
250
					}
251

  
252
					cpb.done();
253
					monitor.worked(50);
254

  
255
					// update corpus
256
					// update corpus indexes and edition
257
					//					String txmhome = Toolbox.getParam(Toolbox.USER_TXM_HOME);
258
					//
259
					//					BaseParameters params = corpus.getBase().getBaseParameters();
260
					//					params.getKeyValueParameters().put(ImportKeys.MULTITHREAD, "false"); //too soon
261
					//					params.getKeyValueParameters().put(ImportKeys.DEBUG, Log.getLevel().intValue() < Level.WARNING.intValue()); // need debug for experimental stuff
262
					//					params.getKeyValueParameters().put(ImportKeys.UPDATECORPUS, "true");
263
					//
264
					//					monitor.setTaskName("Updating corpus");
265
					//					File scriptDir = new File(txmhome, "scripts/import");
266
					//					File script = new File(scriptDir, "xtzLoader.groovy");
267
					//					System.out.println("Updating corpus "+corpus+" using "+params.paramFile);
268
					//					boolean ret = ExecuteImportScript.executeScript(script.getAbsolutePath(), params);
269
					//					if (!ret) {
270
					//						System.out.println("Error during corpus re-import, check the XML-TXM files. Previous version can be restored from "+previousXMLTXMDirectory);
271
					//						return Status.CANCEL_STATUS;
272
					//					}
273
					//					Display.getDefault().syncExec(new Runnable() {
274
					//						@Override
275
					//						public void run() {CloseEditorsUsing.corpus(corpus);}
276
					//					});
277
					//					monitor.worked(50);
278

  
279
				} catch (Exception e) {
280
					e.printStackTrace();
281
					return Status.CANCEL_STATUS;
282
				}
283
				System.out.println("Done.");
284
				return Status.OK_STATUS;
285
			}
286
		};
287
		job.schedule();
288
	}
289
}
0 290

  
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/XMLLemmaProjection.java (revision 460)
1
package org.txm.treetagger.commands;
2

  
3
import java.io.File;
4
import java.io.IOException;
5
import java.util.HashSet;
6
import java.util.LinkedHashMap;
7
import java.util.LinkedHashSet;
8
import java.util.regex.Pattern;
9

  
10
import javax.xml.stream.XMLStreamException;
11

  
12
import org.txm.importer.StaxIdentityParser;
13

  
14
public class XMLLemmaProjection extends StaxIdentityParser {
15

  
16
	// form -> pos -> source -> lemma
17
	protected LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = null;
18
	protected HashSet<String> formAsLemmaPosList = null;
19
	protected String lemmaProperty;
20

  
21
	protected HashSet<String> noMatchValues = new HashSet<String>();
22
	protected String posProperty;
23
	protected LinkedHashSet<String> lemmaSourcePriorityList;
24

  
25
	public XMLLemmaProjection(File infile, LinkedHashMap<String, LinkedHashMap<String, 
26
			LinkedHashMap<String, String>>> rules, 
27
			HashSet<String> formAsLemmaPosList,
28
			LinkedHashSet<String> lemmaSourcePriorityList, 
29
			String posProperty, String lemmaProperty) throws IOException, XMLStreamException {
30
		super(infile);
31
		this.rules = rules;
32
		this.formAsLemmaPosList = formAsLemmaPosList;
33
		this.lemmaSourcePriorityList = lemmaSourcePriorityList;
34
		this.lemmaProperty = lemmaProperty;
35
		this.posProperty = posProperty;
36
		
37
		// the XML-TXM files word properties name starts wit # (they are references)
38
		if (!this.lemmaProperty.startsWith("#")) this.lemmaProperty = "#"+this.lemmaProperty;
39
		if (!this.posProperty.startsWith("#")) this.posProperty = "#"+this.posProperty;
40
	}
41

  
42
	boolean inW = false, inAna = false, inForm;
43
	LinkedHashMap<String, String> anaValues = new LinkedHashMap<String, String>();
44
	LinkedHashMap<String, String> anaResps = new LinkedHashMap<String, String>();
45
	String typeName = null;
46
	String respName = null;
47
	String formValue, typeValue = null;
48

  
49
	@Override
50
	public void processStartElement() throws XMLStreamException, IOException {
51
		if (!inW) super.processStartElement(); // don't write W content
52

  
53
		if (localname.equals("w")) {
54
			inW = true;
55
			anaValues.clear();
56
			anaResps.clear();
57

  
58
			//initialize the new type to a empty value in case there is transformation rule
59
			anaValues.put(lemmaProperty, "");
60
			anaResps.put(lemmaProperty, "#txm_recode");
61
		} else if (localname.equals("ana")) {
62
			inAna = true;
63
			typeName = parser.getAttributeValue(null, "type");
64
			respName = parser.getAttributeValue(null, "resp");
65
			anaResps.put(typeName, respName);
66
			//if (typeName != null) typeName = typeName.substring(1); // remove #
67
			typeValue = "";
68
		} else if (localname.equals("form")) {
69
			inForm = true;
70
			formValue = "";
71
		} 
72
	}
73

  
74
	@Override
75
	public void processCharacters() throws XMLStreamException {
76
		if (inW && inAna) typeValue+=parser.getText();
77
		else if (inW && inForm) formValue+=parser.getText();
78
		else super.processCharacters();
79
	}
80

  
81
	@Override
82
	public void processEndElement() throws XMLStreamException {
83
		if (localname.equals("w")) {
84
			inW = false;
85

  
86
			// write W content
87
			try {
88
				// get the value to test
89
				String posValue = anaValues.get(posProperty);
90
				if (posValue == null) {
91
					posValue = "<no_pos>";
92
//					anaValues.put(posProperty, "<no_pos>");
93
//					anaResps.put(posProperty, "txm_recode");
94
				}
95
				String value = updateAnaValuesIfMatch(formValue.trim(), posValue.trim());
96
				//System.out.println("form="+formValue+" + pos="+posValue+" -> "+value);
97
				anaValues.put(lemmaProperty, value);
98
				anaResps.put(lemmaProperty, "#txm_recode");
99
				
100
				// write the word element
101
				writer.writeStartElement("txm:form");
102
				writer.writeCharacters(formValue);
103
				writer.writeEndElement();
104

  
105
				for (String k : anaValues.keySet()) {
106
					writer.writeStartElement("txm:ana");
107
					writer.writeAttribute("resp", anaResps.get(k));
108
					writer.writeAttribute("type", k);
109
					writer.writeCharacters(anaValues.get(k));
110
					writer.writeEndElement();
111
				}
112
			} catch (XMLStreamException e) {
113
				e.printStackTrace();
114
			}
115
		} else if (localname.equals("ana")) {
116
			anaValues.put(typeName, typeValue);
117
			inAna = false;
118
		} else if (localname.equals("form")) {
119
			inForm = false;
120
		} 
121

  
122
		if (!inW) super.processEndElement(); // don't write W content now
123
	}
124

  
125
	protected String updateAnaValuesIfMatch(String formValue, String posValue) {
126
		if (posValue == null) return "";
127
		
128
		if (formAsLemmaPosList.contains(posValue)) return formValue;
129
		
130
		
131
		if (formValue.equals("virge")) System.out.println("testing: "+formValue+" "+posValue);
132
		if (formValue.equals("virge")) System.out.println("form connue? "+rules.containsKey(formValue));
133
		if (rules.containsKey(formValue)) {
134
			LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(formValue);
135
//			if (posHash.containsKey(posValue)) {
136
//				LinkedHashMap<String, String> sourceHash = posHash.get(posValue);
137
//				for (String source : lemmaSourcePriorityList) {
138
//					if (sourceHash.containsKey(source)) {
139
//						return sourceHash.get(source);
140
//					}
141
//				}
142
//			}
143
			if (formValue.equals("virge")) System.out.println(" tests"+posHash.keySet());
144
			for (String posRegexp : posHash.keySet()) {
145
				
146
				if (posValue.matches(posRegexp)) {
147
					return posHash.get(posRegexp).toString();
148
				}
149
			}
150
		}
151
		
152
		// try without maj
153
		String formValueMin = formValue.toLowerCase();
154
		if (rules.containsKey(formValueMin)) {
155
			LinkedHashMap<String, LinkedHashMap<String, String>> posHash = rules.get(formValueMin);
156
//			if (posHash.containsKey(posValue)) {
157
//				LinkedHashMap<String, String> sourceHash = posHash.get(posValue);
158
//				for (String source : lemmaSourcePriorityList) {
159
//					if (sourceHash.containsKey(source)) {
160
//						return sourceHash.get(source);
161
//					}
162
//				}
163
//			}
164
			
165
			for (String posRegexp : posHash.keySet()) {
166
				if (posValue.matches(posRegexp)) {
167
					return posHash.get(posRegexp).toString();
168
				}
169
			}
170
		}
171
		
172
		noMatchValues.add(formValue+"|"+posValue);
173
		return "!"+formValue;
174
	}
175

  
176
	public HashSet<String> getNoMatchValues() {
177
		return noMatchValues;
178
	}
179

  
180
	public static void main(String args[]) {
181
		File xmlFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test.xml");
182
		File tmpFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test-o.xml");
183
		String posProperty = "type";
184
		String newType = "lemma";
185
		LinkedHashMap<Pattern[], String> rules = new LinkedHashMap<Pattern[], String>();
186
		rules.put(new Pattern[]{Pattern.compile("w"), Pattern.compile("w")}, "WORD");
187
		rules.put(new Pattern[]{Pattern.compile("x.+"), Pattern.compile("w")}, "XWORD");
188
		rules.put(new Pattern[]{Pattern.compile("y"), Pattern.compile("w")}, "YWORD");
189
		rules.put(new Pattern[]{Pattern.compile("y.*"), Pattern.compile("w")}, "YMULTIWORD");
190
		//XMLPropertyProjection converter = new XMLPropertyProjection(xmlFile, rules, posProperty, newType);
191
		//System.out.println(converter.process(tmpFile));
192
	}
193
}
0 194

  
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/RemoveProperties.java (revision 460)
1
package org.txm.treetagger.commands;
2

  
3
import java.io.BufferedReader;
4
import java.io.File;
5
import java.io.FileFilter;
6
import java.util.Arrays;
7
import java.util.HashSet;
8
import java.util.LinkedHashMap;
9
import java.util.LinkedHashSet;
10

  
11
import org.apache.commons.lang.StringUtils;
12
import org.eclipse.core.commands.AbstractHandler;
13
import org.eclipse.core.commands.ExecutionEvent;
14
import org.eclipse.core.commands.ExecutionException;
15
import org.eclipse.core.runtime.IProgressMonitor;
16
import org.eclipse.core.runtime.IStatus;
17
import org.eclipse.core.runtime.Status;
18
import org.eclipse.jface.viewers.ISelection;
19
import org.eclipse.jface.viewers.IStructuredSelection;
20
import org.eclipse.ui.IWorkbenchWindow;
21
import org.eclipse.ui.handlers.HandlerUtil;
22
import org.kohsuke.args4j.Option;
23
import org.txm.Toolbox;
24
import org.txm.core.preferences.TBXPreferences;
25
import org.txm.core.preferences.TXMPreferences;
26
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
27
import org.txm.rcp.utils.JobHandler;
28
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
29
import org.txm.searchengine.cqp.corpus.Corpus;
30
import org.txm.searchengine.cqp.corpus.MainCorpus;
31
import org.txm.searchengine.cqp.corpus.Property;
32
import org.txm.stat.utils.ConsoleProgressBar;
33
import org.txm.utils.DeleteDir;
34
import org.txm.utils.io.FileCopy;
35
import org.txm.utils.io.IOUtils;
36
import org.txm.utils.Tuple;
37

  
38
/**
39
 * Our sample handler extends AbstractHandler, an IHandler base class.
40
 * @see org.eclipse.core.commands.IHandler
41
 * @see org.eclipse.core.commands.AbstractHandler
42
 */
43
public class RemoveProperties extends AbstractHandler {
44

  
45
	@Option(name="propertiesList", usage="The properties to remove", widget="String", required=true, def="plemma")
46
	public String propertiesList = null;
47

  
48
	/**
49
	 * 
50
	 */
51
	public Object execute(ExecutionEvent event) throws ExecutionException {
52
		Corpus corpus = null;
53
		IWorkbenchWindow window = HandlerUtil.getActiveWorkbenchWindowChecked(event);
54

  
55
		ISelection isel = window.getActivePage().getSelection();
56
		if (isel instanceof IStructuredSelection) {
57
			IStructuredSelection sel = (IStructuredSelection)isel;
58
			Object first = sel.getFirstElement();
59
			if (first instanceof Corpus) {
60
				corpus = (Corpus)first;
61
				if (ParametersDialog.open(this)) {
62
					LinkedHashSet<String> propertiesSet = new LinkedHashSet<String>();
63
					propertiesSet.addAll(Arrays.asList(propertiesList.split(",")));
64
					
65
					apply(corpus, propertiesSet);
66
					return corpus;
67
				}
68
			}
69
		}
70

  
71
		System.out.println("Wrong selection.");
72
		return null;
73
	}
74

  
75
	public static void apply(final Corpus corpus, final HashSet<String> propertiesSet) {
76
		final MainCorpus mainCorpus = corpus.getMainCorpus();
77
		final File corpusBinaryDirectory = mainCorpus.getBaseDirectory();
78
		final File txmDirectory = new File(corpusBinaryDirectory, "txm/"+mainCorpus.getName());
79

  
80
		if (!txmDirectory.exists()) {
81
			System.out.println("Can't process a corpus with no XML-TXM files directory: "+txmDirectory);
82
			return;
83
		}
84

  
85
		final File[] files = txmDirectory.listFiles(new FileFilter() {
86
			@Override
87
			public boolean accept(File file) {
88
				return file.isFile() && file.canWrite() && file.getName().endsWith(".xml");
89
			}
90
		});
91

  
92
		if (files == null || files.length == 0) {
93
			System.out.println("Can't process a corpus with no XML-TXM files in "+txmDirectory);
94
			return;
95
		}
96

  
97
		System.out.println("Removing "+propertiesSet+" to "+mainCorpus+" XML-TXM files...");
98
		JobHandler job = new JobHandler("Removing "+propertiesSet+" to "+mainCorpus+" XML-TXM files.") {
99
			@Override
100
			protected IStatus run(IProgressMonitor monitor) {
101
				this.runInit(monitor);
102
				LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>> rules = new LinkedHashMap<String, LinkedHashMap<String, LinkedHashMap<String, String>>>();
103
				HashSet<String> no_change_rules = new HashSet<String>();
104
				try {
105
					// save previous version of XML-TXM files
106
					File previousXMLTXMDirectory = new File(txmDirectory.getAbsolutePath()+"_previous");
107
					DeleteDir.deleteDirectory(previousXMLTXMDirectory);
108
					FileCopy.copyFiles(txmDirectory, previousXMLTXMDirectory);
109

  
110
					// work
111
					File noMatchsFile = new File(TXMPreferences.getString(TBXPreferences.USER_TXM_HOME, TBXPreferences.PREFERENCES_NODE), "results/nomatch.txt");
112
					HashSet<String> noMatchsSet = new HashSet<String>();
113
					ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
114
					for (File xmlFile : files) {
115
						cpb.tick();
116
						XMLRemoveProperties p = new XMLRemoveProperties(xmlFile, propertiesSet);
117
						File tmpFile = new File(xmlFile.getParentFile(), "tmp_"+xmlFile.getName());
118
						if (p.process(tmpFile)) {
119
							if (xmlFile.delete() && tmpFile.renameTo(xmlFile)) {
120
								// ok
121
							} else {
122
								System.out.println("Error during properties removal: can't replace XML-TXM file: "+xmlFile);
123
								return Status.CANCEL_STATUS;
124
							}
125
						} else {
126
							System.out.println("Error during properties removal. Aborting.");
127
							return Status.CANCEL_STATUS;
128
						}
129
					}
130

  
131
					cpb.done();
132
					monitor.worked(50);
133

  
134
				} catch (Exception e) {
135
					e.printStackTrace();
136
					return Status.CANCEL_STATUS;
137
				}
138
				System.out.println("Done.");
139
				return Status.OK_STATUS;
140
			}
141
		};
142
		job.schedule();
143
	}
144
}
0 145

  
tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/commands/XMLRemoveProperties.java (revision 460)
1
package org.txm.treetagger.commands;
2

  
3
import java.io.File;
4
import java.io.IOException;
5
import java.util.HashSet;
6
import java.util.LinkedHashMap;
7
import java.util.regex.Pattern;
8

  
9
import javax.xml.stream.XMLStreamException;
10

  
11
import org.txm.importer.StaxIdentityParser;
12

  
13
/**
14
 * Remove XML-TXM file ana elements which 'type' attribute value is in a set
15
 * 
16
 * @author mdecorde
17
 *
18
 */
19
public class XMLRemoveProperties extends StaxIdentityParser {
20

  
21
	// form -> pos -> source -> lemma
22
	protected HashSet<String> propertiesSet = null;
23

  
24
	/**
25
	 * 
26
	 * @param infile the XML-TXM file to process
27
	 * @param propertiesSet the set of ana@type attributes to remove
28
	 * @throws XMLStreamException 
29
	 * @throws IOException 
30
	 */
31
	public XMLRemoveProperties(File infile, HashSet<String> propertiesSet) throws IOException, XMLStreamException {
32
		super(infile);
33
		this.propertiesSet = new HashSet<String>();
34
		for (String property : propertiesSet) {
35
			// the XML-TXM files word properties name starts with # (they are references)
36
			if (!property.startsWith("#")) property = "#"+property;
37
			this.propertiesSet.add(property);
38
		}
39
	}
40

  
41
	boolean inW = false, inAna = false;
42
	String typeName = null;
43
	@Override
44
	public void processStartElement() throws XMLStreamException, IOException {
45
		
46
		if (localname.equals("w")) {
47
			inW = true;
48
		} else if (inW && localname.equals("ana")) {
49
			inAna = true;
50
			typeName = parser.getAttributeValue(null, "type");
51
			if (propertiesSet.contains(typeName)) return; // don't write this element start tag
52
		}
53
		
54
		super.processStartElement();
55
	}
56

  
57
	@Override
58
	public void processCharacters() throws XMLStreamException {
59
		
60
		if (inW && typeName != null && propertiesSet.contains(typeName)) {return;} // don't write the element content
61
		super.processCharacters();
62
	}
63

  
64
	@Override
65
	public void processEndElement() throws XMLStreamException {
66
		if (localname.equals("w")) {
67
			inW = false;
68
		} else if (inW && localname.equals("ana")) {
69
			inAna = false;
70
			if (propertiesSet.contains(typeName)) {typeName = null; return;} // don't write the element end tag
71
			typeName = null;
72
		}
73

  
74
		super.processEndElement(); // don't write W content now
75
	}
76

  
77
	public static void main(String args[]) {
78
		File xmlFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test.xml");
79
		File tmpFile = new File("/home/mdecorde/TXM/corpora/XTZTEXTUALPLANS/txm/XTZTEXTUALPLANS/test-o.xml");
80
		String posProperty = "type";
81
		String newType = "lemma";
82
		LinkedHashMap<Pattern[], String> rules = new LinkedHashMap<Pattern[], String>();
83
		rules.put(new Pattern[]{Pattern.compile("w"), Pattern.compile("w")}, "WORD");
84
		rules.put(new Pattern[]{Pattern.compile("x.+"), Pattern.compile("w")}, "XWORD");
85
		rules.put(new Pattern[]{Pattern.compile("y"), Pattern.compile("w")}, "YWORD");
86
		rules.put(new Pattern[]{Pattern.compile("y.*"), Pattern.compile("w")}, "YMULTIWORD");
87
		//XMLPropertyProjection converter = new XMLPropertyProjection(xmlFile, rules, posProperty, newType);
88
		//System.out.println(converter.process(tmpFile));
89
	}
90
}
0 91

  
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff