Révision 2075

tmp/org.txm.treetagger.rcp/src/org/txm/treetagger/rcp/handlers/Train.java (revision 2075)
66 66
	public String lemmaProperty = null;
67 67
	@Option(name="lexique", usage="Lexicon file", widget="File", required=true, def="lexicon.txt")
68 68
	public File lexique = null;
69
	@Option(name="openclassfile", usage="openclassfile file", widget="File", required=true, def="")
70
	public File openclassfile = null;
69 71
	@Option(name="options", usage="TreeTagger supplementary options", widget="String", required=true, def="")
70 72
	public String options = null;
71 73

  
......
85 87
				corpus = (CQPCorpus)first;
86 88
				if (ParametersDialog.open(this)) {
87 89

  
88
					train(corpus, model, lexique, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split("  "));
90
					train(corpus, model, lexique, openclassfile, new String[]{posProperty, lemmaProperty}, sentenceTag, options.split("  "));
89 91

  
90 92
					return corpus;
91 93
				}
......
96 98
		return null;
97 99
	}
98 100

  
99
	public static void train(final CQPCorpus corpus, final File model, final File lexique, final String[] properties, final String sentenceTag, final String[] options) {
101
	public static void train(final CQPCorpus corpus, final File model, final File lexique, final File openclassfile, final String[] properties, final String sentenceTag, final String[] options) {
100 102

  
101 103
		JobHandler job = new JobHandler("Applying TreeTagger to "+corpus+" corpus.") {
102 104
			@Override
......
106 108
					File lexique2 = lexique;
107 109
					MainCorpus mainCorpus = corpus.getMainCorpus();
108 110
					File corpusBinaryDirectory = mainCorpus.getProjectDirectory();
109
					
111

  
110 112
					System.out.println("TRAIN : "+corpus+" with "+lexique2+" to create "+model+ " with properties "+Arrays.toString(properties));
111 113

  
112 114
					if (properties == null || properties.length != 2) {
......
121 123
							return Status.CANCEL_STATUS;
122 124
						}
123 125
					}
124
					
126

  
125 127
					Property pos = corpus.getProperty(properties[0]);
126 128
					Property lemma = corpus.getProperty(properties[1]);
127
					
129

  
128 130
					// Prepare temporary directory
129 131
					File treetaggerSrcDirectory = new File(mainCorpus.getProjectDirectory(), "treetagger");
130 132
					DeleteDir.deleteDirectory(treetaggerSrcDirectory);
131 133
					treetaggerSrcDirectory.mkdirs();
132
					
134

  
133 135
					HashMap<String, HashSet<String>> simplified_lexicon = null;
134 136
					HashMap<String, HashSet<String>> simplified_lexicon_errors = null;
135 137
					int error_counter = 0;
......
160 162
							if (!lex.containsKey(form)) {
161 163
								ArrayList<String> pairs = new ArrayList<String>();
162 164
								HashSet<String> posValues = new HashSet<String>();
163
								
165

  
164 166
								allPosValues.put(form, posValues);
165 167
								lex.put(form, pairs);
166 168
							}
......
169 171
							String posValue = values.get(1).get(0);
170 172
							String lemmaValue = values.get(2).get(0);
171 173
							if (posValues.contains(posValue)) {
172
								
174

  
173 175
							} else {
174 176
								posValues.add(posValue);
175 177
								pairs.add(posValue);
......
185 187
							for (String v : lex.get(form)) {
186 188
								if (tab) writer.write("\t"+v);
187 189
								else writer.write(" "+v);
188
								
190

  
189 191
								tab = !tab;
190 192
							}
191 193
							writer.write("\n");
......
209 211
						}
210 212
						reader.close();
211 213
					}
212
					
213
					
214

  
215

  
214 216
					// create TT SRC file from CWB indexes
215
					
217

  
216 218
					File ttSrcFile = new File(treetaggerSrcDirectory, mainCorpus.getID()+".tt");
217 219
					System.out.println("TT SRC file: "+ttSrcFile.getAbsolutePath());
218 220
					BufferedOutputStream fos = new BufferedOutputStream(new FileOutputStream(ttSrcFile));
......
223 225
					for (Match m : corpus.getMatches()) {
224 226
						for (int i = m.getStart() ; i <= m.getEnd() ; i++) { // end match must be included
225 227
							positions.add(i);
226
							
228

  
227 229
							if (positions.size() >= 1000) { // avoid too big array
228 230
								int[] positions_array = new int[positions.size()];
229 231
								int ip = 0;
......
236 238
									if (w != null) {
237 239
										String s = w+"\t"+values[iW];
238 240
										ps.println(s);
239
										
241

  
240 242
										if (simplified_lexicon != null) { // check given lexicon
241 243
											if (simplified_lexicon.containsKey(w)) {
242 244
												if (!simplified_lexicon.get(w).contains(values[iW])) {
......
277 279
						positions.clear();
278 280
					}
279 281
					ps.close();
280
					
282

  
281 283
					if (simplified_lexicon_errors != null && simplified_lexicon_errors.size() > 0) {
282 284
						File error_file = new File(treetaggerSrcDirectory, "errors.txt");
283 285
						PrintWriter errorwriter = IOUtils.getWriter(error_file);
......
300 302
						String line = reader.readLine();
301 303
						while (line != null) {
302 304
							String w = line.split("\t", 2)[0];
303
							
305

  
304 306
							if (simplified_lexicon_errors.containsKey(w)) {
305 307
								for (String p : simplified_lexicon_errors.get(w)) {
306 308
									if (!p.startsWith("#"))
......
308 310
								}
309 311
								simplified_lexicon_errors.remove(w);
310 312
							}
311
							
313

  
312 314
							writer.println(line);
313 315
							line = reader.readLine();
314 316
						}
315
						
317

  
316 318
						// write missing words
317 319
						for (String w2 : simplified_lexicon_errors.keySet()) {
318 320
							writer.print(w2);
......
321 323
							}
322 324
							writer.println("");
323 325
						}
324
						
326

  
325 327
						reader.close();
326 328
						writer.close();
327 329
						System.out.println("Adding words to a temporary lexicon: "+lexique3);
328 330
						lexique2 = lexique3;
329 331
					}
330
					
332

  
331 333
					// Create open class file : contains all pos values
332
					File openclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
333
					PrintWriter openClassFileWriter = IOUtils.getWriter(openclassfile);
334
					
335
//					Lexicon poslexicon = corpus.getLexicon(pos);
336
//					String[] posValues = poslexicon.getForms();
337
//					for (int iV = 0 ; iV < posValues.length ; iV++) {
338
//						if (iV == 0) openClassFileWriter.print(posValues[iV]);
339
//						else openClassFileWriter.print(" "+posValues[iV]);
340
//					}
341
					openClassFileWriter.close();
334
					File tmpopenclassfile = openclassfile;
335
					if (tmpopenclassfile == null || tmpopenclassfile.getName().length() ==0) {
336
						tmpopenclassfile = new File(treetaggerSrcDirectory, "openclasses.txt");
337
						PrintWriter openClassFileWriter = IOUtils.getWriter(openclassfile);
342 338

  
339
						//					Lexicon poslexicon = corpus.getLexicon(pos);
340
						//					String[] posValues = poslexicon.getForms();
341
						//					for (int iV = 0 ; iV < posValues.length ; iV++) {
342
						//						if (iV == 0) openClassFileWriter.print(posValues[iV]);
343
						//						else openClassFileWriter.print(" "+posValues[iV]);
344
						//					}
345
						openClassFileWriter.close();
346
					}
347

  
343 348
					// Call treetagger-train
344
					if (ttSrcFile.exists() && lexique2.exists() && openclassfile.exists()) {
349
					if (ttSrcFile.exists() && lexique2.exists() && tmpopenclassfile.exists()) {
345 350
						System.out.println("Running treetagger-train...");
346 351
						String treetaggerBinDirectory = new File(TreeTaggerPreferences.getInstance().getString(TreeTaggerPreferences.INSTALL_PATH), "bin").getAbsolutePath();
347 352
						if (!treetaggerBinDirectory.endsWith("/")) treetaggerBinDirectory += "/";
348 353

  
349 354
						TreeTagger tt = new TreeTagger(treetaggerBinDirectory, options);
350 355
						tt.settoken();
351
						
356

  
352 357
						//tt.setlemma();
353 358
						tt.setsgml();
354 359
						tt.setst(sentenceTag);
......
358 363
						if (cl > 0) {
359 364
							tt.setcl(cl);
360 365
						}
361
						
366

  
362 367
						float dtg = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_DTG);
363 368
						if (dtg > 0) {
364 369
							tt.setdtg(dtg);
365 370
						}
366
						
371

  
367 372
						float sw = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_SW);
368 373
						if (sw > 0) {
369 374
							tt.setsw(sw);
370 375
						}
371
						
376

  
372 377
						float atg = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_ATG);
373 378
						if (atg > 0) {
374 379
							tt.setatg(atg);
375 380
						}
376
						
381

  
377 382
						float ecw = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_ECW);
378 383
						if (ecw > 0) {
379 384
							tt.setecw(ecw);
380 385
						}
381
						
386

  
382 387
						float lt = TreeTaggerPreferences.getInstance().getFloat(TreeTaggerPreferences.OPTIONS_LT);
383 388
						if (lt > 0) {
384 389
							tt.setlt(lt);
385 390
						}
386
						
391

  
387 392
						if (TreeTaggerPreferences.getInstance().getBoolean(TreeTaggerPreferences.OPTIONS_DEBUG)) {
388 393
							tt.debug(true);
389 394
						} else {
390 395
							tt.setquiet();
391 396
						}
392
						
393
						tt.traintreetagger(lexique2.getAbsolutePath(), openclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
394
						
397

  
398
						tt.traintreetagger(lexique2.getAbsolutePath(), tmpopenclassfile.getAbsolutePath(), ttSrcFile.getAbsolutePath(), model.getAbsolutePath());
399

  
395 400
						System.out.println("Done: "+model.getAbsolutePath());
396 401
					} else {
397 402
						System.out.println("Aborting.");
398 403
					}
399
					
404

  
400 405
					return Status.OK_STATUS;
401 406
				} catch (Exception e) {
402 407
					System.out.println("Error while training TT: "+e);
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZPager.groovy (revision 2075)
163 163
		boolean mustBuildFacsEdition = project.getEditionDefinition("facs").getBuildEdition()
164 164
		if (!mustBuildFacsEdition) return true;
165 165

  
166
		String imageDirectoryPath = project.getEditionDefinition("facs").getImagesDirectory().trim();
167
		File imageDirectory = new File(imageDirectoryPath);
168
		if (!imageDirectoryPath.startsWith("http") && imageDirectoryPath.length()== 0 && !imageDirectory.exists() && !imageDirectory.isDirectory()) {
169
			imageDirectory = null;
166
		String imageDirectoryPath = project.getEditionDefinition("facs").getImagesDirectory();
167
		if (imageDirectoryPath != null) {
168
			imageDirectoryPath = imageDirectoryPath.trim()
169
			File imageDirectory = new File(imageDirectoryPath);
170
			if (!imageDirectoryPath.startsWith("http") && imageDirectoryPath.length()== 0 && !imageDirectory.exists() && !imageDirectory.isDirectory()) {
171
				imageDirectory = null;
172
			}
170 173
		}
171 174

  
172 175
		def second = 0
tmp/org.txm.core/src/java/org/txm/core/preferences/TBXPreferences.java (revision 2075)
148 148
		
149 149
		preferences.putBoolean(TBXPreferences.EXPERT_USER, false);
150 150
		preferences.putBoolean(TBXPreferences.VISIBLE, true);
151
		preferences.putBoolean(TBXPreferences.CLEAN, true);
152
		
151 153

  
152 154
		// FIXME
153 155
		preferences.putBoolean(TBXPreferences.ADD_TECH_LOGS, true);

Formats disponibles : Unified diff