Révision 3705
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImport.groovy (revision 3705) | ||
---|---|---|
17 | 17 |
import org.txm.scripts.importer.xtz.* |
18 | 18 |
import org.txm.scripts.importer.tigersearch.TSImport |
19 | 19 |
import org.txm.utils.* |
20 |
import org.txm.searchengine.cqp.corpus.MainCorpus |
|
20 | 21 |
import org.txm.searchengine.ts.TIGERSearchEngine |
21 | 22 |
import org.txm.conllu.core.preferences.UDPreferences |
22 | 23 |
import org.txm.tigersearch.preferences.TigerSearchPreferences |
... | ... | |
116 | 117 |
if (isSuccessful) { |
117 | 118 |
// read from the 'tiger-xml' and write to the 'tiger' directory |
118 | 119 |
TIGERSearchEngine.buildTIGERCorpus(tigerXMLDirectory, this.binaryDirectory, corpusName); |
120 |
|
|
121 |
// re-align TIGER word indexes with the CQP word indexes using the TS@editionId and CQP@id properties |
|
122 |
File tigerDirectory = new File(this.binaryDirectory, "tiger"); |
|
123 |
File tigerCorpusExistingDirectory = new File(this.binaryDirectory, "tiger/"+corpusName); |
|
124 |
MainCorpus corpus = this.project.getFirstChild(MainCorpus.class); |
|
125 |
corpus.compute(false); // load the corpus in CQP |
|
126 |
TIGERSearchEngine.writeOffsetDataFiles(corpus, "editionId", tigerCorpusExistingDirectory, tigerDirectory, tigerCorpusExistingDirectory) |
|
119 | 127 |
} |
120 | 128 |
} else { |
121 | 129 |
println "Can not do the TIGER indexes step." |
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 3705) | ||
---|---|---|
5 | 5 |
import java.io.FileNotFoundException; |
6 | 6 |
import java.io.IOException; |
7 | 7 |
import java.io.PrintWriter; |
8 |
import java.io.RandomAccessFile; |
|
8 | 9 |
import java.io.UnsupportedEncodingException; |
9 | 10 |
import java.nio.MappedByteBuffer; |
11 |
import java.nio.channels.FileChannel; |
|
10 | 12 |
import java.util.ArrayList; |
11 | 13 |
import java.util.HashMap; |
12 | 14 |
import java.util.LinkedHashSet; |
... | ... | |
29 | 31 |
import org.txm.searchengine.cqp.clientExceptions.CqiClientException; |
30 | 32 |
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException; |
31 | 33 |
import org.txm.searchengine.cqp.corpus.CQPCorpus; |
34 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
|
32 | 35 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
33 | 36 |
import org.txm.tigersearch.preferences.TigerSearchPreferences; |
34 | 37 |
import org.txm.tigersearch.preferences.TigerSearchTreePreferences; |
38 |
import org.txm.utils.ConsoleProgressBar; |
|
35 | 39 |
import org.txm.utils.DeleteDir; |
36 | 40 |
import org.txm.utils.io.IOUtils; |
37 | 41 |
import org.txm.utils.logger.Log; |
38 | 42 |
|
43 |
import ims.tiger.corpus.Sentence; |
|
44 |
import ims.tiger.corpus.T_Node; |
|
39 | 45 |
import ims.tiger.index.reader.Index; |
40 | 46 |
import ims.tiger.index.reader.IndexException; |
41 | 47 |
import ims.tiger.index.writer.IndexBuilderErrorHandler; |
42 | 48 |
import ims.tiger.index.writer.SimpleErrorHandler; |
43 | 49 |
import ims.tiger.index.writer.XMLIndexing; |
44 | 50 |
import ims.tiger.query.api.MatchResult; |
51 |
import ims.tiger.query.api.QueryIndexException; |
|
52 |
import ims.tiger.query.processor.CorpusQueryProcessor; |
|
45 | 53 |
|
46 | 54 |
public class TIGERSearchEngine extends SearchEngine { |
47 | 55 |
|
... | ... | |
392 | 400 |
} |
393 | 401 |
return true; |
394 | 402 |
} |
403 |
|
|
404 |
|
|
405 |
public static int writeOffsetDataFiles(MainCorpus corpus, String wordIdAttribute, File tigerCorpusDirectory, File tigerDirectory, File tigerCorpusExistingDirectory) throws IndexException, QueryIndexException, UnexpectedAnswerException, IOException, CqiServerError, CqiClientException { |
|
406 |
|
|
407 |
// TXM corpus files |
|
408 |
File configfile = new File(tigerDirectory, "tigersearch.logprop"); |
|
409 |
|
|
410 |
TSCorpusManager manager = new TSCorpusManager(tigerCorpusDirectory.getParentFile(), configfile); |
|
411 |
TSCorpus tcorpus = manager.getCorpus(tigerCorpusDirectory.getName()); |
|
412 |
InternalCorpusQueryManagerLocal2 tigermanager = tcorpus.manager; |
|
413 |
CorpusQueryProcessor processor = tigermanager.getQueryProcessor(); |
|
414 |
AbstractCqiClient CQI = CQPSearchEngine.getCqiClient(); |
|
415 |
|
|
416 |
Index index = processor.getIndex(); |
|
417 |
int size = 0; |
|
418 |
for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) { |
|
419 |
size += index.getNumberOfTNodes(nr); |
|
420 |
} |
|
421 |
|
|
422 |
if (size == 0) { |
|
423 |
Log.warning("No word found in the TIGERSearch corpus: " + tigerCorpusDirectory + ". Aborting."); |
|
424 |
return 0; |
|
425 |
} |
|
426 |
|
|
427 |
Log.info("Importing " + size + " word annotations..."); |
|
428 |
|
|
429 |
// compute start position of sentences |
|
430 |
int[] starts = new int[index.getNumberOfGraphs()]; |
|
431 |
for (int i = 0; i < index.getNumberOfGraphs(); i++) { |
|
432 |
starts[i] = 0; |
|
433 |
if (i > 0) { |
|
434 |
starts[i] += index.getNumberOfTNodes(i - 1) + starts[i - 1]; |
|
435 |
} |
|
436 |
} |
|
437 |
|
|
438 |
File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data"); |
|
439 |
RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw"); |
|
440 |
FileChannel offsetsFileChannel = offsetsRAFile.getChannel(); |
|
441 |
MappedByteBuffer offsetsMapped = offsetsFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size * Integer.BYTES); |
|
442 |
// out.putInt(positions[i]) |
|
443 |
|
|
444 |
File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data"); |
|
445 |
RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw"); |
|
446 |
FileChannel presencesFileChannel = presencesRAFile.getChannel(); |
|
447 |
MappedByteBuffer presencesMapped = presencesFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size); |
|
448 |
|
|
449 |
int numberOfWordsAnnotated = 0; |
|
450 |
|
|
451 |
// for each sentence |
|
452 |
ConsoleProgressBar cpb = new ConsoleProgressBar(index.getNumberOfGraphs()); |
|
453 |
for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) { |
|
454 |
cpb.tick(); |
|
455 |
int sent_size = index.getNumberOfTNodes(nr); |
|
456 |
Sentence sent = tcorpus.manager.getSentence(nr); |
|
457 |
|
|
458 |
String[] ids = new String[sent_size]; |
|
459 |
int[] tigerPositions = new int[sent_size]; |
|
460 |
for (int t = 0; t < sent_size; t++) { |
|
461 |
T_Node terminal = (T_Node) sent.getTerminalAt(t); |
|
462 |
ids[t] = terminal.getFeature(wordIdAttribute); |
|
463 |
|
|
464 |
// try fixing ID |
|
465 |
if (ids[t].startsWith("w")) { |
|
466 |
if (!ids[t].startsWith("w_")) { |
|
467 |
ids[t] = "w_" + ids[t].substring(1); |
|
468 |
} |
|
469 |
} |
|
470 |
else { |
|
471 |
ids[t] = "w_" + ids[t]; |
|
472 |
} |
|
473 |
tigerPositions[t] = starts[nr] + t; |
|
474 |
// System.out.println("T id="+terminal.getID()); |
|
475 |
} |
|
476 |
|
|
477 |
int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids); |
|
478 |
Integer[] cqpPositions = new Integer[sent_size]; |
|
479 |
Integer[] offsets = new Integer[sent_size]; |
|
480 |
for (int t = 0; t < sent_size; t++) { |
|
481 |
if (ids_idx[t] >= 0) { |
|
482 |
int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]); |
|
483 |
if (positions.length > 1) { |
|
484 |
Log.warning("Warning: multiple CQP positions for word_id=" + ids[t]); |
|
485 |
} |
|
486 |
cqpPositions[t] = positions[0]; // take the first position |
|
487 |
} |
|
488 |
else { // word not in the CQP corpus |
|
489 |
Log.warning("Could not find word for id=" + ids[t]); |
|
490 |
cqpPositions[t] = null; |
|
491 |
} |
|
492 |
|
|
493 |
if (cqpPositions[t] != null) { |
|
494 |
offsets[t] = cqpPositions[t] - tigerPositions[t]; |
|
495 |
} |
|
496 |
else { |
|
497 |
offsets[t] = null; |
|
498 |
} |
|
499 |
} |
|
500 |
// System.out.println("ids="+Arrays.toString(ids)); |
|
501 |
// System.out.println("cqp indexes="+Arrays.toString(ids_idx)); |
|
502 |
// System.out.println("tiger positions="+Arrays.toString(tigerPositions)); |
|
503 |
// System.out.println("cqp positions="+Arrays.toString(cqpPositions)); |
|
504 |
// System.out.println("offsets="+Arrays.toString(offsets)); |
|
505 |
|
|
506 |
// writing data to offset and presences files |
|
507 |
for (int t = 0; t < sent_size; t++) { |
|
508 |
|
|
509 |
if (offsets[t] != null) { |
|
510 |
numberOfWordsAnnotated++; |
|
511 |
presencesMapped.put((byte) 1); |
|
512 |
offsetsMapped.putInt(offsets[t]); |
|
513 |
} |
|
514 |
else { |
|
515 |
presencesMapped.put((byte) 0); |
|
516 |
offsetsMapped.putInt(0); |
|
517 |
} |
|
518 |
} |
|
519 |
} |
|
520 |
cpb.done(); |
|
521 |
|
|
522 |
offsetsFileChannel.close(); |
|
523 |
offsetsRAFile.close(); |
|
524 |
presencesFileChannel.close(); |
|
525 |
presencesRAFile.close(); |
|
526 |
|
|
527 |
return numberOfWordsAnnotated; |
|
528 |
} |
|
395 | 529 |
} |
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/tigersearch/commands/ImportTIGERAnnotations.java (revision 3705) | ||
---|---|---|
28 | 28 |
package org.txm.tigersearch.commands; |
29 | 29 |
|
30 | 30 |
import java.io.File; |
31 |
import java.io.FileNotFoundException; |
|
31 | 32 |
import java.io.IOException; |
32 | 33 |
import java.io.RandomAccessFile; |
33 | 34 |
import java.nio.MappedByteBuffer; |
... | ... | |
50 | 51 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
51 | 52 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
52 | 53 |
import org.txm.searchengine.ts.InternalCorpusQueryManagerLocal2; |
54 |
import org.txm.searchengine.ts.TIGERSearchEngine; |
|
53 | 55 |
import org.txm.searchengine.ts.TSCorpus; |
54 | 56 |
import org.txm.searchengine.ts.TSCorpusManager; |
55 | 57 |
import org.txm.utils.ConsoleProgressBar; |
... | ... | |
213 | 215 |
CqiServerError, |
214 | 216 |
CqiClientException { |
215 | 217 |
|
216 |
// TXM corpus files |
|
218 |
|
|
219 |
|
|
217 | 220 |
File tigerDirectory = new File(corpus.getProjectDirectory(), "tiger"); |
218 | 221 |
File tigerCorpusExistingDirectory = new File(tigerDirectory, corpus.getName()); |
219 | 222 |
DeleteDir.deleteDirectory(tigerCorpusExistingDirectory); |
... | ... | |
223 | 226 |
if (!configfile.exists()) { |
224 | 227 |
TSCorpus.createLogPropFile(tigerDirectory); |
225 | 228 |
} |
229 |
|
|
230 |
int numberOfWordsAnnotated = TIGERSearchEngine.writeOffsetDataFiles(corpus, wordIdAttribute, tigerCorpusDirectory, tigerDirectory, tigerCorpusExistingDirectory); |
|
226 | 231 |
|
227 |
AbstractCqiClient CQI = CQPSearchEngine.getCqiClient(); |
|
228 |
|
|
229 |
TSCorpusManager manager = new TSCorpusManager(tigerCorpusDirectory.getParentFile(), configfile); |
|
230 |
|
|
231 |
TSCorpus tcorpus = manager.getCorpus(tigerCorpusDirectory.getName()); |
|
232 |
InternalCorpusQueryManagerLocal2 tigermanager = tcorpus.manager; |
|
233 |
CorpusQueryProcessor processor = tigermanager.getQueryProcessor(); |
|
234 |
|
|
235 |
Index index = processor.getIndex(); |
|
236 |
int size = 0; |
|
237 |
for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) { |
|
238 |
size += index.getNumberOfTNodes(nr); |
|
239 |
} |
|
240 |
|
|
241 |
if (size == 0) { |
|
242 |
Log.warning("No word found in the TIGERSearch corpus: " + tigerCorpusDirectory + ". Aborting."); |
|
243 |
return 0; |
|
244 |
} |
|
245 |
|
|
246 |
Log.info("Importing " + size + " word annotations..."); |
|
247 |
|
|
248 |
// compute start position of sentences |
|
249 |
int[] starts = new int[index.getNumberOfGraphs()]; |
|
250 |
for (int i = 0; i < index.getNumberOfGraphs(); i++) { |
|
251 |
starts[i] = 0; |
|
252 |
if (i > 0) { |
|
253 |
starts[i] += index.getNumberOfTNodes(i - 1) + starts[i - 1]; |
|
254 |
} |
|
255 |
} |
|
256 |
|
|
257 |
File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data"); |
|
258 |
RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw"); |
|
259 |
FileChannel offsetsFileChannel = offsetsRAFile.getChannel(); |
|
260 |
MappedByteBuffer offsetsMapped = offsetsFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size * Integer.BYTES); |
|
261 |
// out.putInt(positions[i]) |
|
262 |
|
|
263 |
File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data"); |
|
264 |
RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw"); |
|
265 |
FileChannel presencesFileChannel = presencesRAFile.getChannel(); |
|
266 |
MappedByteBuffer presencesMapped = presencesFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size); |
|
267 |
|
|
268 |
int numberOfWordsAnnotated = 0; |
|
269 |
|
|
270 |
// for each sentence |
|
271 |
ConsoleProgressBar cpb = new ConsoleProgressBar(index.getNumberOfGraphs()); |
|
272 |
for (int nr = 0; nr < index.getNumberOfGraphs(); nr++) { |
|
273 |
cpb.tick(); |
|
274 |
int sent_size = index.getNumberOfTNodes(nr); |
|
275 |
Sentence sent = tcorpus.manager.getSentence(nr); |
|
276 |
|
|
277 |
String[] ids = new String[sent_size]; |
|
278 |
int[] tigerPositions = new int[sent_size]; |
|
279 |
for (int t = 0; t < sent_size; t++) { |
|
280 |
T_Node terminal = (T_Node) sent.getTerminalAt(t); |
|
281 |
ids[t] = terminal.getFeature(wordIdAttribute); |
|
282 |
|
|
283 |
// try fixing ID |
|
284 |
if (ids[t].startsWith("w")) { |
|
285 |
if (!ids[t].startsWith("w_")) { |
|
286 |
ids[t] = "w_" + ids[t].substring(1); |
|
287 |
} |
|
288 |
} |
|
289 |
else { |
|
290 |
ids[t] = "w_" + ids[t]; |
|
291 |
} |
|
292 |
tigerPositions[t] = starts[nr] + t; |
|
293 |
// System.out.println("T id="+terminal.getID()); |
|
294 |
} |
|
295 |
|
|
296 |
int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids); |
|
297 |
Integer[] cqpPositions = new Integer[sent_size]; |
|
298 |
Integer[] offsets = new Integer[sent_size]; |
|
299 |
for (int t = 0; t < sent_size; t++) { |
|
300 |
if (ids_idx[t] >= 0) { |
|
301 |
int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]); |
|
302 |
if (positions.length > 1) { |
|
303 |
Log.warning("Warning: multiple CQP positions for word_id=" + ids[t]); |
|
304 |
} |
|
305 |
cqpPositions[t] = positions[0]; // take the first position |
|
306 |
} |
|
307 |
else { // word not in the CQP corpus |
|
308 |
Log.warning("Could not find word for id=" + ids[t]); |
|
309 |
cqpPositions[t] = null; |
|
310 |
} |
|
311 |
|
|
312 |
if (cqpPositions[t] != null) { |
|
313 |
offsets[t] = cqpPositions[t] - tigerPositions[t]; |
|
314 |
} |
|
315 |
else { |
|
316 |
offsets[t] = null; |
|
317 |
} |
|
318 |
} |
|
319 |
// System.out.println("ids="+Arrays.toString(ids)); |
|
320 |
// System.out.println("cqp indexes="+Arrays.toString(ids_idx)); |
|
321 |
// System.out.println("tiger positions="+Arrays.toString(tigerPositions)); |
|
322 |
// System.out.println("cqp positions="+Arrays.toString(cqpPositions)); |
|
323 |
// System.out.println("offsets="+Arrays.toString(offsets)); |
|
324 |
|
|
325 |
// writing data to offset and presences files |
|
326 |
for (int t = 0; t < sent_size; t++) { |
|
327 |
|
|
328 |
if (offsets[t] != null) { |
|
329 |
numberOfWordsAnnotated++; |
|
330 |
presencesMapped.put((byte) 1); |
|
331 |
offsetsMapped.putInt(offsets[t]); |
|
332 |
} |
|
333 |
else { |
|
334 |
presencesMapped.put((byte) 0); |
|
335 |
offsetsMapped.putInt(0); |
|
336 |
} |
|
337 |
} |
|
338 |
} |
|
339 |
cpb.done(); |
|
340 |
|
|
341 |
offsetsFileChannel.close(); |
|
342 |
offsetsRAFile.close(); |
|
343 |
presencesFileChannel.close(); |
|
344 |
presencesRAFile.close(); |
|
345 |
|
|
346 | 232 |
Log.info("Finalizing TIGERSearch corpus"); |
347 |
if (numberOfWordsAnnotated > 0) { |
|
233 |
if (numberOfWordsAnnotated > 0) { // copy the TIGERcorpus to import
|
|
348 | 234 |
FileCopy.copyFiles(tigerCorpusDirectory, tigerCorpusExistingDirectory); |
349 | 235 |
Log.info("Done. " + numberOfWordsAnnotated + " words annotated."); |
350 | 236 |
} |
... | ... | |
354 | 240 |
|
355 | 241 |
return numberOfWordsAnnotated; |
356 | 242 |
} |
243 |
|
|
357 | 244 |
} |
Formats disponibles : Unified diff