Feature #1756

Updated by Sebastien Jacquot over 4 years ago

[WIP]

Creating a partition can take a long time when it has many parts.
After profiling the process, the bottleneck turns out to be the native method org.txm.searchengine.cqp.MemCqiServer.cqpQuery(String, String, String).
Here are some tests, kept for the record. They did not yield a really significant gain on Windows (maybe 5 to 10% shorter duration on a partition with 2400 273 parts than with the current method), but such measurements are tricky. The tests consist in reducing the number of native calls made from Java, which can be costly through JNI, by defining a single native method that executes all the queries and by centralizing the creation of the parts on the Java side (rather than calling a native method for each part creation).

steps, Java side:
* create org.txm.searchengine.cqp.MemCqiServer.cqpQueries(String, String[], String[])
<pre>
/**
 * Runs a batch of CQP queries in a single native (JNI) call.
 * For each index i, the native implementation shown in this note executes
 * the CQP statement "arg1[i] = arg2[i];" after activating the corpus arg0.
 *
 * @param arg0 identifier of the mother corpus the queries are run on
 * @param arg1 names of the subcorpora to create, one per query
 * @param arg2 CQP queries; the result of arg2[i] is stored under arg1[i]
 * @return the Java caller treats {@code null} or {@code false} as a failure
 *         and then reads the error code from the server
 */
public native Boolean cqpQueries(String arg0, String[] arg1, String[] arg2)
throws IOException, UnexpectedAnswerException, CqiServerError ;
</pre>
* create org.txm.searchengine.cqp.ICqiClient.cqpQueries(String, String[], String[])
<pre>
/**
 * Runs several CQP queries on the same mother corpus in one client call,
 * storing the result of each query under the subcorpus name at the same index.
 *
 * @param motherCorpus identifier of the corpus the queries are run on
 * @param subcorpusNamers names of the subcorpora to create, one per query
 * @param queries the CQP queries, in the same order as the subcorpus names
 * @throws IOException declared by the CQi client contract
 * @throws UnexpectedAnswerException declared by the CQi client contract
 * @throws CqiServerError declared by the CQi client contract
 */
public void cqpQueries(String motherCorpus, String[] subcorpusNamers, String[] queries) throws IOException, UnexpectedAnswerException, CqiServerError;
</pre>

* create org.txm.searchengine.cqp.MemCqiClient.cqpQueries(String, String[], String[])
<pre>
/**
 * Delegates the whole batch of queries to the in-memory CQi server.
 * When the server does not answer {@code true} (that is, it answers
 * {@code null} or {@code false}), its error code is fetched and handed to
 * {@code throwExceptionFromCqi}.
 *
 * @param arg0 identifier of the mother corpus
 * @param arg1 names of the subcorpora to create, one per query
 * @param arg2 CQP queries, in the same order as the subcorpus names
 */
@Override
public void cqpQueries(String arg0, String[] arg1, String[] arg2) throws IOException, UnexpectedAnswerException, CqiServerError {
    final Boolean succeeded = server.cqpQueries(arg0, arg1, arg2);
    if (Boolean.TRUE.equals(succeeded)) {
        return;
    }
    // Failure: translate the server-side error code (presumably into an exception).
    final int errorCode = server.getErrorCode();
    throwExceptionFromCqi(errorCode);
}
</pre>

* create org.txm.searchengine.cqp.corpus.Partition.createParts(String, List<String>, List<String>)
<pre>
/**
 * Creates every part of this partition with a single batched CQi call
 * instead of one native call per part.
 *
 * A CQP id is generated for each part name, a {@code Part} is built for it,
 * and all the queries are then sent at once through {@code cqpQueries}.
 *
 * Failures are now reported through the declared {@link CqiClientException}.
 * Previously the exception was thrown and caught inside the method, which
 * made it return {@code null} and crash the caller in {@code addAll}.
 *
 * @param partitionName name of the partition the parts belong to
 * @param partNames names of the parts; {@code queries} must hold one query per name
 * @param queries CQP queries, in the same order as {@code partNames}
 * @return the created parts, in the order of {@code partNames}; never {@code null}
 * @throws CqiClientException if a part cannot be built or the batched query fails
 */
private ArrayList<Part> createParts(String partitionName, List<String> partNames, List<String> queries) throws CqiClientException {

    final int partCount = partNames.size();
    ArrayList<Part> parts = new ArrayList<Part>(partCount);
    ArrayList<String> cqpPartIds = new ArrayList<String>(partCount);

    for (int i = 0; i < partCount; i++) {
        try {
            String partCqpId = CqpObject.partNamePrefix + Corpus.getNextSubcorpusCounter();
            cqpPartIds.add(partCqpId);
            parts.add(new Part(partCqpId, partitionName, partNames.get(i), this, new Query(queries.get(i))));
        }
        catch (InvalidCqpIdException e) {
            // Do not continue: a missing Part would leave 'parts' out of step
            // with the ids and queries sent to CQP below.
            org.txm.utils.logger.Log.printStackTrace(e);
            throw new CqiClientException(Messages.Partition_9 + partitionName + "_" + partNames.get(i) + " " + e); //$NON-NLS-1$ //$NON-NLS-2$
        }
    }

    try {
        CorpusManager.getCorpusManager().getCqiClient().cqpQueries(
                this.corpus.getQualifiedCqpId(),
                cqpPartIds.toArray(new String[cqpPartIds.size()]),
                queries.toArray(new String[queries.size()]));
    }
    catch (Exception e) {
        org.txm.utils.logger.Log.printStackTrace(e);

        // Fetching the last CQP error may itself fail; that must not hide the
        // original failure, so it is handled separately from the throw below.
        String lastError;
        try {
            lastError = "" + Toolbox.getCqiClient().getLastCQPError(); //$NON-NLS-1$
        }
        catch (Exception e1) {
            System.out.println(Messages.Partition_18+e1);
            org.txm.utils.logger.Log.printStackTrace(e1);
            lastError = ""; //$NON-NLS-1$
        }
        throw new CqiClientException(Messages.Partition_9 + partitionName + " last error: "+lastError); //$NON-NLS-1$
    }

    return parts;
}
</pre>
* create org.txm.searchengine.cqp.corpus.Partition.Partition(Corpus, String, List<String>, List<String>)
<pre>
/**
 * Builds a partition of {@code corpus} from a list of queries, one part per query.
 * The parts are created in one batch through {@code createParts}.
 *
 * @param corpus the corpus to partition
 * @param name the partition name; {@code null} or blank is replaced by "noname"
 * @param queries CQP queries, one per part
 * @param partnames part names, in the same order as {@code queries}
 * @throws CqiClientException if the parts cannot be created
 */
public Partition(Corpus corpus, String name, List<String> queries,
        List<String> partnames) throws CqiClientException {
    this.corpus = corpus;
    this.structure = null;
    this.property = null;
    if (name == null || name.trim().length() == 0) {
        name = "noname"; //$NON-NLS-1$
    }
    this.name = name;
    Log.info(NLS.bind(Messages.NEW_PARTION, this.corpus, this.name));
    long start = System.currentTimeMillis();
    this.parts = new ArrayList<Part>();

    // FIXME: tests optimisations CQP LIB
    List<Part> createdParts = createParts(name, partnames, queries);
    if (createdParts == null) {
        // createParts may signal failure with null; report it through the
        // declared exception rather than a NullPointerException in addAll().
        throw new CqiClientException(Messages.Partition_9 + name);
    }
    parts.addAll(createdParts);

    // Previous implementation (one native call per part), kept for comparison.
    // NOTE(review): it replaced blank part names with "-", which the batched
    // path above does not do - confirm whether that is still needed.
    // for (int i = 0; i < queries.size(); i++) {
    // String queryS = queries.get(i);
    // String partitionName = this.getName();
    // String partName = partnames.get(i);
    // if (partName.trim().length() == 0) partName = "-"; //$NON-NLS-1$
    // Part part = createPart(partitionName, partName, queryS);
    // parts.add(part);
    // }

    long end = System.currentTimeMillis();
    Log.info(NLS.bind(Messages.PARTITION_CREATED, this.name, (end - start)));
}
</pre>

Steps, native side:
* create JNIEXPORT jobject JNICALL Java_org_txm_searchengine_cqp_MemCqiServer_cqpQueries
(JNIEnv *, jobject, jstring, jobjectArray, jobjectArray); in MemCqiServer.h/MemCqiServer.c

<pre>
/*
 * JNI implementation of MemCqiServer.cqpQueries(String, String[], String[]).
 *
 * Activates the mother corpus once, then for every index i executes the CQP
 * statement "<subcorpusNames[i]> = <jqueries[i]>;" and checks that the
 * resulting subcorpus can be found.
 *
 * Returns the value of toBoolean(env, obj, JNI_TRUE) when every query
 * succeeded. On failure the result of throwException()/throwCLException() is
 * returned, or NULL when a JNI exception is already pending.
 *
 * Fixes compared to the first draft:
 *  - UTF strings are released with ReleaseStringUTFChars (they are obtained
 *    with GetStringUTFChars), and child/query strings and their local
 *    references are released on every iteration;
 *  - the query buffer is allocated with cl_malloc instead of cl_realloc on an
 *    uninitialised pointer;
 *  - query_lock is always reset and all buffers are freed, including on the
 *    error paths;
 *  - the stray GetObjectArrayElement(..., NULL) calls were removed;
 *  - a failed corpus activation is reported before any query is run.
 */
JNIEXPORT jobject JNICALL Java_org_txm_searchengine_cqp_MemCqiServer_cqpQueries
(JNIEnv * env, jobject obj, jstring motherCorpus, jobjectArray subcorpusNames, jobjectArray jqueries) {

    /* The exception matching the failure kind is raised only after cleanup. */
    enum {
        CQPQ_OK = 0,
        CQPQ_ERROR,        /* reported with throwException()   */
        CQPQ_CL_ERROR,     /* reported with throwCLException() */
        CQPQ_JNI_PENDING   /* a Java exception is already pending */
    } status = CQPQ_OK;

    char *mother;
    char *c = NULL, *sc = NULL;
    jsize childrenCount, i;

    mother = (char *) (*env)->GetStringUTFChars(env, motherCorpus, NULL);
    if (mother == NULL) {
        return NULL; /* GetStringUTFChars failed; its exception is pending */
    }

    if (!split_subcorpus_spec(mother, &c, &sc)) {
        (*env)->ReleaseStringUTFChars(env, motherCorpus, mother);
        return throwException(env, obj);
    }

    childrenCount = (*env)->GetArrayLength(env, subcorpusNames);

    if (!cqi_activate_corpus(mother)) {
        status = CQPQ_ERROR;
    }

    for (i = 0; status == CQPQ_OK && i < childrenCount; i++) {
        jstring jchild = NULL, jquery = NULL;
        char *child = NULL, *query = NULL;

        /* Stop at the first failing JNI call: no further JNI lookups are
           allowed once an exception is pending. */
        jchild = (jstring) (*env)->GetObjectArrayElement(env, subcorpusNames, i);
        if (jchild != NULL) {
            jquery = (jstring) (*env)->GetObjectArrayElement(env, jqueries, i);
        }
        if (jchild != NULL && jquery != NULL) {
            child = (char *) (*env)->GetStringUTFChars(env, jchild, NULL);
            if (child != NULL) {
                query = (char *) (*env)->GetStringUTFChars(env, jquery, NULL);
            }
        }

        if (child == NULL || query == NULL) {
            /* Either a JNI call raised (index out of bounds, out of memory)
               or the Java array holds a null element. */
            status = (*env)->ExceptionCheck(env) ? CQPQ_JNI_PENDING : CQPQ_ERROR;
        }
        else if (!check_subcorpus_name(child)) {
            status = CQPQ_ERROR;
        }
        else {
            /* "%s = %s;" adds at most 5 bytes (including the terminator). */
            size_t len = strlen(child) + strlen(query) + 10;
            char *cqp_query = (char *) cl_malloc(len);

            query_lock = floor(1e9 * cl_runif()) + 1; // activate query lock mode with random key

            if (query_has_semicolon(query))
                sprintf(cqp_query, "%s = %s", child, query);
            else
                sprintf(cqp_query, "%s = %s;", child, query);

            if (!cqp_parse_string(cqp_query)) { // parse and execute
                fprintf(stderr, "start of throw exeption");
                status = CQPQ_CL_ERROR;
            }
            else {
                char *full_child = combine_subcorpus_spec(c, child); // c is the 'physical' part of the mother corpus
                CorpusList *childcl = cqi_find_corpus(full_child);

                if (childcl == NULL) {
                    status = CQPQ_CL_ERROR;
                }
                else if (server_log) {
                    printf("'%s' ran the following query on %s\n", "cqplib", mother);
                    printf("\t%s\n", cqp_query);
                    printf("and got %d matches.\n", childcl->size);
                }

                if (full_child) cl_free(full_child);
            }

            query_lock = 0; // deactivate query lock mode, also after a failure

            if (cqp_query) cl_free(cqp_query);
        }

        /* Per-iteration cleanup keeps the local reference table small when
           the partition has many parts. */
        if (query != NULL) (*env)->ReleaseStringUTFChars(env, jquery, query);
        if (child != NULL) (*env)->ReleaseStringUTFChars(env, jchild, child);
        if (jquery != NULL) (*env)->DeleteLocalRef(env, jquery);
        if (jchild != NULL) (*env)->DeleteLocalRef(env, jchild);
    }

    if (c) cl_free(c);
    if (sc) cl_free(sc);
    (*env)->ReleaseStringUTFChars(env, motherCorpus, mother);

    switch (status) {
    case CQPQ_ERROR:
        return throwException(env, obj);
    case CQPQ_CL_ERROR:
        return throwCLException(env, obj);
    case CQPQ_JNI_PENDING:
        return NULL;
    default:
        return toBoolean(env, obj, JNI_TRUE);
    }
}
</pre>

Other tips:
* we may assume that the Java layer always passes unique corpus ids to the native layer, and remove the check_subcorpus_name() tests
* passing arrays through JNI and getting array elements seems to be heavy; maybe we could use a buffer instead
* manage the test "if (query_has_semicolon(query))" in another way, e.g. by adding a function parameter

Back