56 |
56 |
|
57 |
57 |
public class TIGERSearchEngine extends SearchEngine {
|
58 |
58 |
|
59 |
|
public static final String NAME = "TIGER";
|
|
59 |
public static final String NAME = "TIGER"; //$NON-NLS-1$
|
60 |
60 |
|
61 |
61 |
HashMap<CorpusBuild, TSCorpus> corpora = null;
|
62 |
62 |
|
... | ... | |
68 |
68 |
return tscorpus;
|
69 |
69 |
}
|
70 |
70 |
|
71 |
|
File tigerDirectory = new File(root.getProjectDirectory(), "tiger");
|
72 |
|
File configfile = new File(tigerDirectory, "tigersearch.logprop");
|
|
71 |
File tigerDirectory = new File(root.getProjectDirectory(), "tiger"); //$NON-NLS-1$
|
|
72 |
File configfile = new File(tigerDirectory, "tigersearch.logprop"); //$NON-NLS-1$
|
73 |
73 |
if (!tigerDirectory.exists()) {
|
74 |
74 |
return null;
|
75 |
75 |
}
|
... | ... | |
139 |
139 |
return new int[] { 0, 0 };
|
140 |
140 |
}
|
141 |
141 |
int[] cpos = new int[] { start, end };
|
142 |
|
int[] structs = CQI.cpos2Struc(cqpCorpus.getStructuralUnit("s").getProperty("n").getQualifiedName(), cpos);
|
|
142 |
int[] structs = CQI.cpos2Struc(cqpCorpus.getStructuralUnit("s").getProperty("n").getQualifiedName(), cpos); //$NON-NLS-1$ //$NON-NLS-2$
|
143 |
143 |
if (structs.length == 0) {
|
144 |
144 |
return new int[] { 0, 0 };
|
145 |
145 |
}
|
... | ... | |
197 |
197 |
TSCorpus tcorpus = this.getTSCorpus(corpus);
|
198 |
198 |
TSResult result = null;
|
199 |
199 |
if (corpus == corpus.getRootCorpusBuild() || !(corpus instanceof CQPCorpus)) { // root corpus or something not a CQPCorpus
|
200 |
|
result = tcorpus.query(query.getQueryString().replace("\n", " "));
|
|
200 |
result = tcorpus.query(query.getQueryString().replace("\n", " ")); //$NON-NLS-1$ //$NON-NLS-2$
|
201 |
201 |
}
|
202 |
202 |
else {
|
203 |
203 |
CQPCorpus cqpCorpus = (CQPCorpus) corpus;
|
... | ... | |
227 |
227 |
}
|
228 |
228 |
}
|
229 |
229 |
|
230 |
|
Log.finest("QUERYING sentences: " + sent_min + " -> " + sent_max);
|
231 |
|
result = tcorpus.query(query.getQueryString().replace("\n", " "), sent_min, sent_max, -1);
|
|
230 |
Log.finest("QUERYING sentences: " + sent_min + " -> " + sent_max); //$NON-NLS-1$ //$NON-NLS-2$
|
|
231 |
result = tcorpus.query(query.getQueryString().replace("\n", " "), sent_min, sent_max, -1); //$NON-NLS-1$ //$NON-NLS-2$
|
232 |
232 |
}
|
233 |
233 |
|
234 |
234 |
return result;
|
... | ... | |
254 |
254 |
|
255 |
255 |
List<String> variables = java.util.Arrays.asList(mresult.getVariableNames());
|
256 |
256 |
// System.out.println("Variables: "+variables+" iPivot="+variables.indexOf("pivot"));
|
257 |
|
int iPivot = variables.indexOf("pivot");
|
|
257 |
int iPivot = variables.indexOf("pivot"); //$NON-NLS-1$
|
258 |
258 |
|
259 |
259 |
MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped();
|
260 |
260 |
// MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped();
|
... | ... | |
320 |
320 |
|
321 |
321 |
// TODO implement a corpora of TIGER corpus
|
322 |
322 |
CorpusBuild root = corpus.getRootCorpusBuild();
|
323 |
|
File buildDirectory = new File(root.getProjectDirectory(), "tiger");
|
324 |
|
return new File(buildDirectory, "tigersearch.logprop").exists() &&
|
|
323 |
File buildDirectory = new File(root.getProjectDirectory(), "tiger"); //$NON-NLS-1$
|
|
324 |
return new File(buildDirectory, "tigersearch.logprop").exists() && //$NON-NLS-1$
|
325 |
325 |
new File(buildDirectory, root.getID()).exists();
|
326 |
326 |
}
|
327 |
327 |
|
... | ... | |
335 |
335 |
// }
|
336 |
336 |
// }
|
337 |
337 |
// else
|
338 |
|
if (r instanceof Project && "clean".equals(state)) { // the Project has been deleted by the user
|
|
338 |
if (r instanceof Project && "clean".equals(state)) { // the Project has been deleted by the user //$NON-NLS-1$
|
339 |
339 |
Project c = (Project) r;
|
340 |
|
File buildDirectory = new File(c.getProjectDirectory(), "tiger");
|
|
340 |
File buildDirectory = new File(c.getProjectDirectory(), "tiger"); //$NON-NLS-1$
|
341 |
341 |
if (buildDirectory.exists()) {
|
342 |
342 |
DeleteDir.deleteDirectory(buildDirectory);
|
343 |
343 |
}
|
... | ... | |
366 |
366 |
*/
|
367 |
367 |
public static boolean buildTIGERCorpus(File sourceDirectory, File binaryDirectory, String corpusName) throws UnsupportedEncodingException, FileNotFoundException {
|
368 |
368 |
|
369 |
|
File tigerDir = new File(binaryDirectory, "tiger");
|
|
369 |
File tigerDir = new File(binaryDirectory, "tiger"); //$NON-NLS-1$
|
370 |
370 |
tigerDir.mkdir();
|
371 |
371 |
|
372 |
|
File logprop = new File(tigerDir, "tigersearch.logprop");
|
|
372 |
File logprop = new File(tigerDir, "tigersearch.logprop"); //$NON-NLS-1$
|
373 |
373 |
|
374 |
|
PrintWriter writer = IOUtils.getWriter(logprop, "UTF-8");
|
375 |
|
writer.println("# Default log configuration of the TIGERSearch suite"+"\n"+
|
376 |
|
"log4j.rootLogger=SEVERE,Logfile"+"\n"+
|
377 |
|
"log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=SEVERE"+"\n"+
|
378 |
|
"log4j.appender.Logfile=org.apache.log4j.RollingFileAppender"+"\n"+
|
379 |
|
"log4j.appender.Logfile.File="+logprop.getAbsolutePath()+"\n"+
|
380 |
|
"log4j.appender.Logfile.MaxFileSize=500KB"+"\n"+
|
381 |
|
"log4j.appender.Logfile.MaxBackupIndex=1"+"\n"+
|
382 |
|
"log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout"+"\n"+
|
383 |
|
"log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n");
|
|
374 |
PrintWriter writer = IOUtils.getWriter(logprop, "UTF-8"); //$NON-NLS-1$
|
|
375 |
writer.println("# Default log configuration of the TIGERSearch suite"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
376 |
"log4j.rootLogger=SEVERE,Logfile"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
377 |
"log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=SEVERE"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
378 |
"log4j.appender.Logfile=org.apache.log4j.RollingFileAppender"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
379 |
"log4j.appender.Logfile.File="+logprop.getAbsolutePath()+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
380 |
"log4j.appender.Logfile.MaxFileSize=500KB"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
381 |
"log4j.appender.Logfile.MaxBackupIndex=1"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
382 |
"log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout"+"\n"+ //$NON-NLS-1$ //$NON-NLS-2$
|
|
383 |
"log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"); //$NON-NLS-1$
|
384 |
384 |
|
385 |
385 |
BasicConfigurator.configure();
|
386 |
386 |
|
... | ... | |
417 |
417 |
};
|
418 |
418 |
XMLIndexing indexing = new XMLIndexing(corpusName, uri, tigerBinDir.getAbsolutePath(), handler, false);
|
419 |
419 |
indexing.startIndexing();
|
420 |
|
File logs = new File(tigerBinDir, "indexing.log");
|
|
420 |
File logs = new File(tigerBinDir, "indexing.log"); //$NON-NLS-1$
|
421 |
421 |
String txt = IOUtils.getText(logs);
|
422 |
422 |
if (txt.contains("Error in corpus graph ")) {
|
423 |
423 |
Log.warning("Error while importing TIGER corpus: "+txt);
|
... | ... | |
435 |
435 |
public static int writeOffsetDataFiles(MainCorpus corpus, String wordIdAttribute, File tigerCorpusDirectory, File tigerDirectory, File tigerCorpusExistingDirectory) throws IndexException, QueryIndexException, UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
|
436 |
436 |
|
437 |
437 |
// TXM corpus files
|
438 |
|
File configfile = new File(tigerDirectory, "tigersearch.logprop");
|
|
438 |
File configfile = new File(tigerDirectory, "tigersearch.logprop"); //$NON-NLS-1$
|
439 |
439 |
|
440 |
440 |
TSCorpusManager manager = new TSCorpusManager(tigerCorpusDirectory.getParentFile(), configfile);
|
441 |
441 |
TSCorpus tcorpus = manager.getCorpus(tigerCorpusDirectory.getName());
|
... | ... | |
465 |
465 |
}
|
466 |
466 |
}
|
467 |
467 |
|
468 |
|
File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data");
|
469 |
|
RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw");
|
|
468 |
File offsetsFile = new File(tigerCorpusExistingDirectory, "offsets.data"); //$NON-NLS-1$
|
|
469 |
RandomAccessFile offsetsRAFile = new RandomAccessFile(offsetsFile, "rw"); //$NON-NLS-1$
|
470 |
470 |
FileChannel offsetsFileChannel = offsetsRAFile.getChannel();
|
471 |
471 |
MappedByteBuffer offsetsMapped = offsetsFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size * Integer.BYTES);
|
472 |
472 |
// out.putInt(positions[i])
|
473 |
473 |
|
474 |
|
File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data");
|
475 |
|
RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw");
|
|
474 |
File presencesFile = new File(tigerCorpusExistingDirectory, "presences.data"); //$NON-NLS-1$
|
|
475 |
RandomAccessFile presencesRAFile = new RandomAccessFile(presencesFile, "rw"); //$NON-NLS-1$
|
476 |
476 |
FileChannel presencesFileChannel = presencesRAFile.getChannel();
|
477 |
477 |
MappedByteBuffer presencesMapped = presencesFileChannel.map(FileChannel.MapMode.READ_WRITE, 0, size);
|
478 |
478 |
|
... | ... | |
492 |
492 |
ids[t] = terminal.getFeature(wordIdAttribute);
|
493 |
493 |
|
494 |
494 |
// try fixing ID
|
495 |
|
if (ids[t].startsWith("w")) {
|
496 |
|
if (!ids[t].startsWith("w_")) {
|
497 |
|
ids[t] = "w_" + ids[t].substring(1);
|
|
495 |
if (ids[t].startsWith("w")) { //$NON-NLS-1$
|
|
496 |
if (!ids[t].startsWith("w_")) { //$NON-NLS-1$
|
|
497 |
ids[t] = "w_" + ids[t].substring(1); //$NON-NLS-1$
|
498 |
498 |
}
|
499 |
499 |
}
|
500 |
500 |
else {
|
501 |
|
ids[t] = "w_" + ids[t];
|
|
501 |
ids[t] = "w_" + ids[t]; //$NON-NLS-1$
|
502 |
502 |
}
|
503 |
503 |
tigerPositions[t] = starts[nr] + t;
|
504 |
504 |
// System.out.println("T id="+terminal.getID());
|
505 |
505 |
}
|
506 |
506 |
|
507 |
|
int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids);
|
|
507 |
int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids); //$NON-NLS-1$
|
508 |
508 |
Integer[] cqpPositions = new Integer[sent_size];
|
509 |
509 |
Integer[] offsets = new Integer[sent_size];
|
510 |
510 |
for (int t = 0; t < sent_size; t++) {
|
511 |
511 |
if (ids_idx[t] >= 0) {
|
512 |
|
int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]);
|
|
512 |
int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]); //$NON-NLS-1$
|
513 |
513 |
if (positions.length > 1) {
|
514 |
514 |
Log.warning("Warning: multiple CQP positions for word_id=" + ids[t]);
|
515 |
515 |
}
|
... | ... | |
572 |
572 |
StringBuilder buffer = new StringBuilder();
|
573 |
573 |
buffer.append("<h3>TIGERSearch informations</h3>\n");
|
574 |
574 |
buffer.append("<h4>Commons</h4>\n");
|
575 |
|
buffer.append("<p>Name: "+header.getCorpus_Name()+"</p>\n");
|
576 |
|
buffer.append("<p>Id: "+header.getCorpus_ID()+"</p>\n");
|
577 |
|
buffer.append("<p>Date: "+header.getCorpus_Date()+"</p>\n");
|
578 |
|
buffer.append("<p>Format: "+header.getCorpus_Format()+"</p>\n");
|
579 |
|
buffer.append("<p>History: "+header.getCorpus_History()+"</p>\n");
|
|
575 |
buffer.append("<p>Name: "+header.getCorpus_Name()+"</p>\n"); //$NON-NLS-2$
|
|
576 |
buffer.append("<p>Id: "+header.getCorpus_ID()+"</p>\n"); //$NON-NLS-1$ //$NON-NLS-2$
|
|
577 |
buffer.append("<p>Date: "+header.getCorpus_Date()+"</p>\n"); //$NON-NLS-1$ //$NON-NLS-2$
|
|
578 |
buffer.append("<p>Format: "+header.getCorpus_Format()+"</p>\n"); //$NON-NLS-2$
|
|
579 |
buffer.append("<p>History: "+header.getCorpus_History()+"</p>\n"); //$NON-NLS-2$
|
580 |
580 |
buffer.append("<h4>Statiticss</h4>\n");
|
581 |
|
buffer.append("<p>Edges: "+header.getNumberOfEdges()+"</p>\n");
|
582 |
|
buffer.append("<p>Sentences: "+header.getNumberOfSentences()+"</p>\n");
|
583 |
|
buffer.append("<p>NT Nodes: "+header.getNumberOfNTNodes()+"</p>\n");
|
584 |
|
buffer.append("<p>T Nodes: "+header.getNumberOfTNodes()+"</p>\n");
|
|
581 |
buffer.append("<p>Edges: "+header.getNumberOfEdges()+"</p>\n"); //$NON-NLS-2$
|
|
582 |
buffer.append("<p>Sentences: "+header.getNumberOfSentences()+"</p>\n"); //$NON-NLS-2$
|
|
583 |
buffer.append("<p>NT Nodes: "+header.getNumberOfNTNodes()+"</p>\n"); //$NON-NLS-2$
|
|
584 |
buffer.append("<p>T Nodes: "+header.getNumberOfTNodes()+"</p>\n"); //$NON-NLS-2$
|
585 |
585 |
buffer.append("<h4>Terminals</h4>\n");
|
586 |
586 |
//buffer.append("<p>"+header.getAllTerminalFeatures()+"</p>\n");
|
587 |
|
buffer.append("<p>"+header.getAllTerminalFeaturesSize()+" "+StringUtils.join(header.getAllTFeatureNames(), ", ")+"</p>\n");
|
|
587 |
buffer.append("<p>"+header.getAllTerminalFeaturesSize()+" "+StringUtils.join(header.getAllTFeatureNames(), ", ")+"</p>\n"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
|
588 |
588 |
if (header.getAllTFeatureNames().size() > 0) {
|
589 |
|
buffer.append("<ul>\n");
|
|
589 |
buffer.append("<ul>\n"); //$NON-NLS-1$
|
590 |
590 |
for (Object name : header.getAllTFeatureNames()) {
|
591 |
591 |
Feature f = header.getTFeature(name.toString());
|
592 |
|
buffer.append("<li>"+f.getName());
|
|
592 |
buffer.append("<li>"+f.getName()); //$NON-NLS-1$
|
593 |
593 |
List desc = f.getDescriptions();
|
594 |
594 |
List vals = f.getItems();
|
595 |
595 |
if (f.getItems().size() > 0) {
|
596 |
|
buffer.append(":");
|
|
596 |
buffer.append(":"); //$NON-NLS-1$
|
597 |
597 |
for (int i = 0 ; i < vals.size() ; i++) {
|
598 |
|
if (i > 0) buffer.append(",");
|
599 |
|
buffer.append(" ("+vals.get(i)+": "+desc.get(i)+")");
|
|
598 |
if (i > 0) buffer.append(","); //$NON-NLS-1$
|
|
599 |
buffer.append(" ("+vals.get(i)+": "+desc.get(i)+")"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
|
600 |
600 |
}
|
601 |
601 |
} else { // not defined
|
602 |
602 |
try {
|
... | ... | |
605 |
605 |
values.add(index.getTFeatureValueAt(name.toString(), p));
|
606 |
606 |
}
|
607 |
607 |
if (values.size() > 0) {
|
608 |
|
buffer.append(": "+StringUtils.join(values, ", "));
|
|
608 |
buffer.append(": "+StringUtils.join(values, ", ")); //$NON-NLS-1$ //$NON-NLS-2$
|
609 |
609 |
}
|
610 |
610 |
} catch (IndexException e) {
|
611 |
611 |
// TODO Auto-generated catch block
|
612 |
612 |
e.printStackTrace();
|
613 |
613 |
}
|
614 |
614 |
}
|
615 |
|
buffer.append("</li>\n");
|
|
615 |
buffer.append("</li>\n"); //$NON-NLS-1$
|
616 |
616 |
}
|
617 |
|
buffer.append("</ul>\n");
|
|
617 |
buffer.append("</ul>\n"); //$NON-NLS-1$
|
618 |
618 |
}
|
619 |
619 |
|
620 |
620 |
buffer.append("<h4>Non-Terminals</h4>\n");
|
621 |
621 |
//buffer.append("<p>"+header.getAllNonterminalFeatures()+"</p>\n");
|
622 |
|
buffer.append("<p>"+header.getAllNonterminalFeaturesSize()+" "+StringUtils.join(header.getAllNTFeatureNames(), ", ")+"</p>\n");
|
|
622 |
buffer.append("<p>"+header.getAllNonterminalFeaturesSize()+" "+StringUtils.join(header.getAllNTFeatureNames(), ", ")+"</p>\n"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
|
623 |
623 |
if (header.getAllNTFeatureNames().size() > 0) {
|
624 |
|
buffer.append("<ul>\n");
|
|
624 |
buffer.append("<ul>\n"); //$NON-NLS-1$
|
625 |
625 |
for (Object name : header.getAllNTFeatureNames()) {
|
626 |
626 |
Feature f = header.getNTFeature(name.toString());
|
627 |
|
buffer.append("<li>"+f.getName());
|
|
627 |
buffer.append("<li>"+f.getName()); //$NON-NLS-1$
|
628 |
628 |
List desc = f.getDescriptions();
|
629 |
629 |
List vals = f.getItems();
|
630 |
630 |
if (f.getItems().size() > 0) {
|
631 |
|
buffer.append(":");
|
|
631 |
buffer.append(":"); //$NON-NLS-1$
|
632 |
632 |
for (int i = 0 ; i < vals.size() ; i++) {
|
633 |
|
if (i > 0) buffer.append(",");
|
|
633 |
if (i > 0) buffer.append(","); //$NON-NLS-1$
|
634 |
634 |
if (vals.get(i).equals(desc.get(i))) {
|
635 |
|
buffer.append(" "+vals.get(i));
|
|
635 |
buffer.append(" "+vals.get(i)); //$NON-NLS-1$
|
636 |
636 |
} else {
|
637 |
|
buffer.append(" "+vals.get(i)+" ("+desc.get(i)+")");
|
|
637 |
buffer.append(" "+vals.get(i)+" ("+desc.get(i)+")"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
|
638 |
638 |
}
|
639 |
639 |
}
|
640 |
640 |
}
|
641 |
|
buffer.append("</li>\n");
|
|
641 |
buffer.append("</li>\n"); //$NON-NLS-1$
|
642 |
642 |
}
|
643 |
|
buffer.append("</ul>\n");
|
|
643 |
buffer.append("</ul>\n"); //$NON-NLS-1$
|
644 |
644 |
}
|
645 |
645 |
|
646 |
646 |
return buffer.toString();
|