Révision 3705

TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3705)
17 17
import org.txm.scripts.importer.xtz.*
18 18
import org.txm.scripts.importer.tigersearch.TSImport
19 19
import org.txm.utils.*
20
import org.txm.searchengine.cqp.corpus.MainCorpus
20 21
import org.txm.searchengine.ts.TIGERSearchEngine
21 22
import org.txm.conllu.core.preferences.UDPreferences
22 23
import org.txm.tigersearch.preferences.TigerSearchPreferences
......
116 117
			if (isSuccessful) {
117 118
				// read from the 'tiger-xml' and write to the 'tiger' directory
118 119
				TIGERSearchEngine.buildTIGERCorpus(tigerXMLDirectory, this.binaryDirectory, corpusName);
120
				
121
				// re-align TIGER word indexes with the CQP word indexes using the TS@editionId and CQP@id properties
122
				File tigerDirectory = new File(this.binaryDirectory, "tiger");
123
				File tigerCorpusExistingDirectory = new File(this.binaryDirectory, "tiger/"+corpusName);
124
				MainCorpus corpus = this.project.getFirstChild(MainCorpus.class);
125
				corpus.compute(false); // load  the corpus in CQP
126
				TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory)
119 127
			}
120 128
		} else {
121 129
			println "Can not do the TIGER indexes step."
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 3705)
5 5
import java.io.FileNotFoundException;
6 6
import java.io.IOException;
7 7
import java.io.PrintWriter;
8
import java.io.RandomAccessFile;
8 9
import java.io.UnsupportedEncodingException;
9 10
import java.nio.MappedByteBuffer;
11
import java.nio.channels.FileChannel;
10 12
import java.util.ArrayList;
11 13
import java.util.HashMap;
12 14
import java.util.LinkedHashSet;
......
29 31
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
30 32
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException;
31 33
import org.txm.searchengine.cqp.corpus.CQPCorpus;
34
import org.txm.searchengine.cqp.corpus.MainCorpus;
32 35
import org.txm.searchengine.cqp.serverException.CqiServerError;
33 36
import org.txm.tigersearch.preferences.TigerSearchPreferences;
34 37
import org.txm.tigersearch.preferences.TigerSearchTreePreferences;
38
import org.txm.utils.ConsoleProgressBar;
35 39
import org.txm.utils.DeleteDir;
36 40
import org.txm.utils.io.IOUtils;
37 41
import org.txm.utils.logger.Log;
38 42

  
43
import ims.tiger.corpus.Sentence;
44
import ims.tiger.corpus.T_Node;
39 45
import ims.tiger.index.reader.Index;
40 46
import ims.tiger.index.reader.IndexException;
41 47
import ims.tiger.index.writer.IndexBuilderErrorHandler;
42 48
import ims.tiger.index.writer.SimpleErrorHandler;
43 49
import ims.tiger.index.writer.XMLIndexing;
44 50
import ims.tiger.query.api.MatchResult;
51
import ims.tiger.query.api.QueryIndexException;
52
import ims.tiger.query.processor.CorpusQueryProcessor;
45 53

  
46 54
public class TIGERSearchEngine extends SearchEngine {
47 55
	
......
392 400
		}
393 401
		return true;
394 402
	}
403
	
404

  
405
	public static int writeOffsetDataFiles(MainCorpus corpus, String wordIdAttribute, File tigerCorpusDirectory, File tigerDirectory, File tigerCorpusExistingDirectory) throws IndexException, QueryIndexException, UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
406
		
407
		// TXM corpus files
408
		File configfile = new File(tigerDirectory, "tigersearch.logprop");
409
		
410
		TSCorpusManager manager = new TSCorpusManager(tigerCorpusDirectory.getParentFile(), configfile);
411
		TSCorpus tcorpus = manager.getCorpus(tigerCorpusDirectory.getName());
412
		InternalCorpusQueryManagerLocal2 tigermanager = tcorpus.manager;
413
		CorpusQueryProcessor processor = tigermanager.getQueryProcessor();
414
		AbstractCqiClient CQI = CQPSearchEngine.getCqiClient();
415
		
416
		Index index = processor.getIndex();
417
		int size = 0;
418
		for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
419
			size += index.getNumberOfTNodes(nr);
420
		}
421
		
422
		if (size == 0) {
423
			Log.warning("No word found in the TIGERSearch corpus: " + tigerCorpusDirectory + ". Aborting.");
424
			return 0;
425
		}
426
		
427
		Log.info("Importing " + size + " word annotations...");
428
		
429
		// compute start position of sentences
430
		int[] starts = new int[index.getNumberOfGraphs()];
431
		for (int i = 0; i < index.getNumberOfGraphs(); i++) {
432
			starts[i] = 0;
433
			if (i > 0) {
434
				starts[i] += index.getNumberOfTNodes(i - 1) + starts[i - 1];
435
			}
436
		}
437
		
438
		File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data");
439
		RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw");
440
		FileChannel offsetsFileChannel = offsetsRAFile.getChannel();
441
		MappedByteBuffer offsetsMapped = offsetsFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size * Integer.BYTES);
442
		// out.putInt(positions[i])
443
		
444
		File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data");
445
		RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw");
446
		FileChannel presencesFileChannel = presencesRAFile.getChannel();
447
		MappedByteBuffer presencesMapped = presencesFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size);
448
		
449
		int numberOfWordsAnnotated = 0;
450
		
451
		// for each sentence
452
		ConsoleProgressBar cpb = new ConsoleProgressBar(index.getNumberOfGraphs());
453
		for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
454
			cpb.tick();
455
			int sent_size = index.getNumberOfTNodes(nr);
456
			Sentence sent = tcorpus.manager.getSentence(nr);
457
			
458
			String[] ids = new String[sent_size];
459
			int[] tigerPositions = new int[sent_size];
460
			for (int t = 0; t < sent_size; t++) {
461
				T_Node terminal = (T_Node) sent.getTerminalAt(t);
462
				ids[t] = terminal.getFeature(wordIdAttribute);
463
				
464
				// try fixing ID
465
				if (ids[t].startsWith("w")) {
466
					if (!ids[t].startsWith("w_")) {
467
						ids[t] = "w_" + ids[t].substring(1);
468
					}
469
				}
470
				else {
471
					ids[t] = "w_" + ids[t];
472
				}
473
				tigerPositions[t] = starts[nr] + t;
474
				// System.out.println("T id="+terminal.getID());
475
			}
476
			
477
			int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids);
478
			Integer[] cqpPositions = new Integer[sent_size];
479
			Integer[] offsets = new Integer[sent_size];
480
			for (int t = 0; t < sent_size; t++) {
481
				if (ids_idx[t] >= 0) {
482
					int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]);
483
					if (positions.length > 1) {
484
						Log.warning("Warning: multiple CQP positions for word_id=" + ids[t]);
485
					}
486
					cqpPositions[t] = positions[0]; // take the first position
487
				}
488
				else { // word not in the CQP corpus
489
					Log.warning("Could not find word for id=" + ids[t]);
490
					cqpPositions[t] = null;
491
				}
492
				
493
				if (cqpPositions[t] != null) {
494
					offsets[t] = cqpPositions[t] - tigerPositions[t];
495
				}
496
				else {
497
					offsets[t] = null;
498
				}
499
			}
500
			// System.out.println("ids="+Arrays.toString(ids));
501
			// System.out.println("cqp indexes="+Arrays.toString(ids_idx));
502
			// System.out.println("tiger positions="+Arrays.toString(tigerPositions));
503
			// System.out.println("cqp positions="+Arrays.toString(cqpPositions));
504
			// System.out.println("offsets="+Arrays.toString(offsets));
505
			
506
			// writing data to offset and presences files
507
			for (int t = 0; t < sent_size; t++) {
508
				
509
				if (offsets[t] != null) {
510
					numberOfWordsAnnotated++;
511
					presencesMapped.put((byte) 1);
512
					offsetsMapped.putInt(offsets[t]);
513
				}
514
				else {
515
					presencesMapped.put((byte) 0);
516
					offsetsMapped.putInt(0);
517
				}
518
			}
519
		}
520
		cpb.done();
521
		
522
		offsetsFileChannel.close();
523
		offsetsRAFile.close();
524
		presencesFileChannel.close();
525
		presencesRAFile.close();
526
		
527
		return numberOfWordsAnnotated;
528
	}
395 529
}
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportTIGERAnnotations.java (revision 3705)
28 28
package org.txm.tigersearch.commands;
29 29

  
30 30
import java.io.File;
31
import java.io.FileNotFoundException;
31 32
import java.io.IOException;
32 33
import java.io.RandomAccessFile;
33 34
import java.nio.MappedByteBuffer;
......
50 51
import org.txm.searchengine.cqp.corpus.MainCorpus;
51 52
import org.txm.searchengine.cqp.serverException.CqiServerError;
52 53
import org.txm.searchengine.ts.InternalCorpusQueryManagerLocal2;
54
import org.txm.searchengine.ts.TIGERSearchEngine;
53 55
import org.txm.searchengine.ts.TSCorpus;
54 56
import org.txm.searchengine.ts.TSCorpusManager;
55 57
import org.txm.utils.ConsoleProgressBar;
......
213 215
			CqiServerError,
214 216
			CqiClientException {
215 217
		
216
		// TXM corpus files
218

  
219
		
217 220
		File tigerDirectory = new File(corpus.getProjectDirectory(), "tiger");
218 221
		File tigerCorpusExistingDirectory = new File(tigerDirectory, corpus.getName());
219 222
		DeleteDir.deleteDirectory(tigerCorpusExistingDirectory);
......
223 226
		if (!configfile.exists()) {
224 227
			TSCorpus.createLogPropFile(tigerDirectory);
225 228
		}
229
	
230
		int numberOfWordsAnnotated = TIGERSearchEngine.writeOffsetDataFiles(corpus, wordIdAttribute, tigerCorpusDirectory, tigerDirectory, tigerCorpusExistingDirectory);
226 231
		
227
		AbstractCqiClient CQI = CQPSearchEngine.getCqiClient();
228
		
229
		TSCorpusManager manager = new TSCorpusManager(tigerCorpusDirectory.getParentFile(), configfile);
230
		
231
		TSCorpus tcorpus = manager.getCorpus(tigerCorpusDirectory.getName());
232
		InternalCorpusQueryManagerLocal2 tigermanager = tcorpus.manager;
233
		CorpusQueryProcessor processor = tigermanager.getQueryProcessor();
234
		
235
		Index index = processor.getIndex();
236
		int size = 0;
237
		for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
238
			size += index.getNumberOfTNodes(nr);
239
		}
240
		
241
		if (size == 0) {
242
			Log.warning("No word found in the TIGERSearch corpus: " + tigerCorpusDirectory + ". Aborting.");
243
			return 0;
244
		}
245
		
246
		Log.info("Importing " + size + " word annotations...");
247
		
248
		// compute start position of sentences
249
		int[] starts = new int[index.getNumberOfGraphs()];
250
		for (int i = 0; i < index.getNumberOfGraphs(); i++) {
251
			starts[i] = 0;
252
			if (i > 0) {
253
				starts[i] += index.getNumberOfTNodes(i - 1) + starts[i - 1];
254
			}
255
		}
256
		
257
		File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data");
258
		RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw");
259
		FileChannel offsetsFileChannel = offsetsRAFile.getChannel();
260
		MappedByteBuffer offsetsMapped = offsetsFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size * Integer.BYTES);
261
		// out.putInt(positions[i])
262
		
263
		File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data");
264
		RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw");
265
		FileChannel presencesFileChannel = presencesRAFile.getChannel();
266
		MappedByteBuffer presencesMapped = presencesFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size);
267
		
268
		int numberOfWordsAnnotated = 0;
269
		
270
		// for each sentence
271
		ConsoleProgressBar cpb = new ConsoleProgressBar(index.getNumberOfGraphs());
272
		for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) {
273
			cpb.tick();
274
			int sent_size = index.getNumberOfTNodes(nr);
275
			Sentence sent = tcorpus.manager.getSentence(nr);
276
			
277
			String[] ids = new String[sent_size];
278
			int[] tigerPositions = new int[sent_size];
279
			for (int t = 0; t < sent_size; t++) {
280
				T_Node terminal = (T_Node) sent.getTerminalAt(t);
281
				ids[t] = terminal.getFeature(wordIdAttribute);
282
				
283
				// try fixing ID
284
				if (ids[t].startsWith("w")) {
285
					if (!ids[t].startsWith("w_")) {
286
						ids[t] = "w_" + ids[t].substring(1);
287
					}
288
				}
289
				else {
290
					ids[t] = "w_" + ids[t];
291
				}
292
				tigerPositions[t] = starts[nr] + t;
293
				// System.out.println("T id="+terminal.getID());
294
			}
295
			
296
			int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids);
297
			Integer[] cqpPositions = new Integer[sent_size];
298
			Integer[] offsets = new Integer[sent_size];
299
			for (int t = 0; t < sent_size; t++) {
300
				if (ids_idx[t] >= 0) {
301
					int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]);
302
					if (positions.length > 1) {
303
						Log.warning("Warning: multiple CQP positions for word_id=" + ids[t]);
304
					}
305
					cqpPositions[t] = positions[0]; // take the first position
306
				}
307
				else { // word not in the CQP corpus
308
					Log.warning("Could not find word for id=" + ids[t]);
309
					cqpPositions[t] = null;
310
				}
311
				
312
				if (cqpPositions[t] != null) {
313
					offsets[t] = cqpPositions[t] - tigerPositions[t];
314
				}
315
				else {
316
					offsets[t] = null;
317
				}
318
			}
319
			// System.out.println("ids="+Arrays.toString(ids));
320
			// System.out.println("cqp indexes="+Arrays.toString(ids_idx));
321
			// System.out.println("tiger positions="+Arrays.toString(tigerPositions));
322
			// System.out.println("cqp positions="+Arrays.toString(cqpPositions));
323
			// System.out.println("offsets="+Arrays.toString(offsets));
324
			
325
			// writing data to offset and presences files
326
			for (int t = 0; t < sent_size; t++) {
327
				
328
				if (offsets[t] != null) {
329
					numberOfWordsAnnotated++;
330
					presencesMapped.put((byte) 1);
331
					offsetsMapped.putInt(offsets[t]);
332
				}
333
				else {
334
					presencesMapped.put((byte) 0);
335
					offsetsMapped.putInt(0);
336
				}
337
			}
338
		}
339
		cpb.done();
340
		
341
		offsetsFileChannel.close();
342
		offsetsRAFile.close();
343
		presencesFileChannel.close();
344
		presencesRAFile.close();
345
		
346 232
		Log.info("Finalizing TIGERSearch corpus");
347
		if (numberOfWordsAnnotated > 0) {
233
		if (numberOfWordsAnnotated > 0) { // copy the TIGERcorpus to import
348 234
			FileCopy.copyFiles(tigerCorpusDirectory, tigerCorpusExistingDirectory);
349 235
			Log.info("Done. " + numberOfWordsAnnotated + " words annotated.");
350 236
		}
......
354 240
		
355 241
		return numberOfWordsAnnotated;
356 242
	}
243

  
357 244
}

Formats disponibles : Unified diff