Feature #1756

Updated by Sebastien Jacquot over 4 years ago


Partition creation can take a long time when there are many parts.
Profiling shows that the bottleneck is the native method org.txm.searchengine.cqp.MemCqiServer.cqpQuery(String, String, String).
Here are some tests, kept for the record. They did not produce a significant gain on Windows (perhaps 5 to 10% less time than the current method on a partition with 273 parts), although the measurements are tricky. The tests aim to reduce the number of native calls made from Java, which can be costly through JNI, by defining a single native method that executes all the queries and by centralizing the creation of the parts on the Java side (rather than calling a native method for each part).

Steps, Java side:
* create org.txm.searchengine.cqp.MemCqiServer.cqpQueries(String, String[], String[])
* create org.txm.searchengine.cqp.ICqiClient.cqpQueries(String, String[], String[])
* create org.txm.searchengine.cqp.MemCqiClient.cqpQueries(String, String[], String[])
public void cqpQueries(String arg0, String[] arg1, String[] arg2) throws IOException, UnexpectedAnswerException, CqiServerError {
Boolean ret = server.cqpQueries(arg0, arg1, arg2);
if(ret == null || !ret) {
int e = server.getErrorCode();

* create org.txm.searchengine.cqp.corpus.Partition.createParts(String, List<String>, List<String>)
/**
 * Creates all the parts of a partition with a single batched CQP call
 * instead of one native call per part.
 *
 * For each entry of partNames a Part is built with a freshly generated CQP id
 * (CqpObject.partNamePrefix + Corpus.getNextSubcorpusCounter()) and the query
 * found at the same index in queries. All queries are then sent at once
 * through the CQi client's cqpQueries(...) on this.corpus.getQualifiedCqpId().
 *
 * NOTE(review): in this excerpt cqpPartIds is allocated but nothing is ever
 * added to it, so the id array handed to cqpQueries(...) is empty while the
 * queries array is not. The generated partCqpId presumably needs to be added
 * to cqpPartIds -- confirm against the real source.
 *
 * NOTE(review): the closing braces of this excerpt were lost, so the exact
 * try/catch nesting cannot be determined from here. As written, the
 * CqiClientException is thrown inside a try whose catch (Exception e1)
 * returns null; if CqiClientException is a subtype of Exception it would
 * never reach the caller -- confirm.
 *
 * @param partitionName name of the partition the parts belong to
 * @param partNames     names of the parts; read by index, in parallel with queries
 * @param queries       CQP query strings, one per part name
 * @return the created parts, or null on the error path visible below
 * @throws CqiClientException declared for failures of the batched CQP call (see note above)
 */
private ArrayList<Part> createParts(String partitionName, List<String> partNames, List<String> queries) throws CqiClientException {

// Result list, pre-sized to the number of parts.
ArrayList<Part> parts = new ArrayList<Part>(partNames.size());

//Log.finest(NLS.bind(Messages.CREATING_PART,partName, query));

// long start = System.currentTimeMillis();

// Intended to hold the CQP ids sent to the native side (never filled in this excerpt).
ArrayList<String> cqpPartIds = new ArrayList<String>(partNames.size());

// First pass: build the Java-side Part objects only; no CQP call is made here.
for(int i = 0; i < partNames.size(); i++) {
try {
String partCqpId = CqpObject.partNamePrefix + Corpus.getNextSubcorpusCounter();
parts.add(new Part(partCqpId, partitionName, partNames.get(i), this, new Query(queries.get(i))));
catch(InvalidCqpIdException e) {
// TODO Auto-generated catch block

// Second pass: one batched call carrying every sub-corpus id and every query.
try {
CorpusManager.getCorpusManager().getCqiClient().cqpQueries(this.corpus.getQualifiedCqpId(), cqpPartIds.toArray(new String[cqpPartIds.size()]), queries.toArray(new String[queries.size()]));

} catch (Exception e) {
// Build an error that includes the last CQP error reported by the client.
try {
throw new CqiClientException(Messages.Partition_9 + partitionName + " last error: "+Toolbox.getCqiClient().getLastCQPError()); //$NON-NLS-1$ //$NON-NLS-2$
} catch (Exception e1) {
return null;

// long end = System.currentTimeMillis();
//Log.finest(NLS.bind(Messages.PART_CREATED, partitionName + "_" + partName, (end - start))); //$NON-NLS-1$//$NON-NLS-2$

return parts;

* create org.txm.searchengine.cqp.corpus.Partition.Partition(Corpus, String, List<String>, List<String>)
/**
 * Builds a partition of the given corpus from explicit queries and part names,
 * creating all parts through the batched createParts(...) call, and logs the
 * elapsed time in milliseconds.
 *
 * The structure and property fields are set to null. A null or blank name is
 * replaced by "noname".
 *
 * Note the argument order: this constructor takes (queries, partnames) while
 * createParts(...) takes (partNames, queries); the call below swaps them
 * accordingly.
 *
 * NOTE(review): the commented-out per-part loop replaced a blank part name
 * with "-"; the batched path does not apply that substitution -- confirm
 * whether this is intended.
 *
 * NOTE(review): createParts(...) has a visible "return null" path, and addAll
 * is called on its result without a null check -- confirm.
 *
 * @param corpus    corpus to partition
 * @param name      partition name; "noname" is used when null or blank
 * @param queries   CQP query strings, one per part
 * @param partnames part names, in the same order as queries
 * @throws CqiClientException declared; propagated from createParts(...)
 */
public Partition(Corpus corpus, String name, List<String> queries,
List<String> partnames) throws CqiClientException {
this.corpus = corpus;
this.structure = null;
this.property = null;
// Fall back to a default name when none is supplied.
if (name == null || name.trim().length() == 0)
name = "noname"; //$NON-NLS-1$
this.name = name;
Log.info(NLS.bind(Messages.NEW_PARTION, this.corpus, this.name));
// Wall-clock timing of the whole partition creation, reported in the log below.
long start = System.currentTimeMillis();
this.parts = new ArrayList<Part>();

// FIXME: tests optimisations CQP LIB
// Batched creation: createParts expects (name, partNames, queries).
parts.addAll(createParts(name, partnames, queries));

// Previous implementation, kept for reference: one createPart(...) call per part.
// for (int i = 0; i < queries.size(); i++) {
// String queryS = queries.get(i);
// String partitionName = this.getName();
// String partName = partnames.get(i);
// if (partName.trim().length() == 0) partName = "-"; //$NON-NLS-1$
// Part part = createPart(partitionName, partName, queryS);
// parts.add(part);
// }

long end = System.currentTimeMillis();
Log.info(NLS.bind(Messages.PARTITION_CREATED, this.name, (end - start)));

Steps, native side:
* create JNIEXPORT jobject JNICALL Java_org_txm_searchengine_cqp_MemCqiServer_cqpQueries
(JNIEnv *, jobject, jstring, jobjectArray, jobjectArray); in MemCqiServer.h/MemCqiServer.c

// Native implementation of MemCqiServer.cqpQueries(String, String[], String[]).
//
// Runs one CQP query per entry of jqueries against the corpus named by
// motherCorpus, assigning each result to the sub-corpus name found at the same
// index in subcorpusNames. The loop bound is the length of subcorpusNames.
//
// Returns toBoolean(env, obj, JNI_TRUE) on the success path; error paths return
// throwException(env, obj) or throwCLException(env, obj).
//
// Review notes on this excerpt (its braces were lost in extraction, so the
// exact nesting is uncertain -- confirm each point against the real source):
//  - NOTE(review): mother is obtained with GetStringUTFChars but released with
//    ReleaseStringChars; the JNI counterpart of GetStringUTFChars is
//    ReleaseStringUTFChars.
//  - NOTE(review): child and query are obtained with GetStringUTFChars on every
//    iteration, and the only release calls for them are commented out.
//  - NOTE(review): cqp_query is declared without an initial value and then
//    passed to cl_realloc; presumably it should start as NULL.
//  - NOTE(review): as written, both sprintf calls follow the
//    query_has_semicolon test with no visible else, so the second would
//    overwrite the first; an else was probably lost.
//  - NOTE(review): the early returns inside the loop do not free c, sc,
//    cqp_query or full_child, and the throwCLException return after a failed
//    cqp_parse_string does not reset query_lock.
//  - NOTE(review): the block comment opened in front of the debug printf calls
//    is not closed anywhere in this excerpt.
JNIEXPORT jobject JNICALL Java_org_txm_searchengine_cqp_MemCqiServer_cqpQueries
(JNIEnv * env, jobject obj, jstring motherCorpus, jobjectArray subcorpusNames, jobjectArray jqueries) {

char *child, *mother, *query, *c, *sc;
jboolean iscopy;

// Qualified name of the mother corpus as a C string.
mother = (*env)->GetStringUTFChars(env, motherCorpus, &iscopy);

// NOTE(review): children is not used again in the visible code, and NULL is passed where an element index is expected.
int *children = (*env)->GetObjectArrayElement(env, subcorpusNames, NULL);
// Number of sub-corpora to create; drives the loop below.
int childrenCount = (*env)->GetArrayLength(env, subcorpusNames);

// NOTE(review): queries is not used again in the visible code either; same NULL index as above.
int *queries = (*env)->GetObjectArrayElement(env, jqueries, NULL);

// Split the mother spec into c and sc; a failed split is reported as an exception.
if (!split_subcorpus_spec(mother, &c, &sc)) {
(*env)->ReleaseStringChars(env, motherCorpus, mother);
//(*env)->ReleaseStringChars(env, subcorpus, child);
//(*env)->ReleaseStringChars(env, jquery, query);
return throwException(env, obj);
} else {

// Activate the mother corpus once, before the loop; the result is checked inside the loop.
int test2 = cqi_activate_corpus(mother);
int i;
for(i = 0; i < childrenCount; i++) {

// Sub-corpus name and query text for this iteration.
child = (*env)->GetStringUTFChars(env, (*env)->GetObjectArrayElement(env, subcorpusNames, i), &iscopy);

query = (*env)->GetStringUTFChars(env, (*env)->GetObjectArrayElement(env, jqueries, i), &iscopy);

/* printf("\n ******* i = ");
printf("%d", i);
printf(" ******* child ");
printf(" ******* query ");

// fflush(stdout);

char *cqp_query;
int len = strlen(child) + strlen(query) + 10;

cqp_query = (char *) cl_realloc(cqp_query, len);
int test1 = check_subcorpus_name(child);

//printf("\ntests results: subcorpus_name=%d activation=%d\n", test1, test2);
if (!test1 || !test2) {

(*env)->ReleaseStringChars(env, motherCorpus, mother);
//(*env)->ReleaseStringChars(env, subcorpus, child);
//(*env)->ReleaseStringChars(env, jquery, query);
return throwException(env, obj);
else {
query_lock = floor(1e9 * cl_runif()) + 1; // activate query lock mode with random key

//printf("CQPSERVER: query_lock = %d\n", query_lock);
if (query_has_semicolon(query))
sprintf(cqp_query, "%s = %s", child, query);
sprintf(cqp_query, "%s = %s;", child, query);

//printf("CQi: parsing %s\n", cqp_query);

if (!cqp_parse_string(cqp_query)) { // parser and execute
fprintf(stderr, "start of throw exeption");
return throwCLException(env, obj);
//fprintf(stderr, "End of throw exeption");
} else {
char *full_child = combine_subcorpus_spec(c, child); // c is the 'physical' part of the mother corpus
CorpusList *childcl = cqi_find_corpus(full_child);

if ((childcl) == NULL) {
(*env)->ReleaseStringChars(env, motherCorpus, mother);
//(*env)->ReleaseStringChars(env, subcorpus, child);
//(*env)->ReleaseStringChars(env, jquery, query);
return throwCLException(env, obj);
} else {
if (server_log) {
printf("'%s' ran the following query on %s\n", "cqplib", mother);
printf("\t%s\n", cqp_query);
printf("and got %d matches.\n", childcl->size);

if (full_child) cl_free(full_child);
query_lock = 0; // deactivate query lock mode

if (cqp_query) cl_free(cqp_query);

//(*env)->ReleaseStringUTFChars(env, string1, child);

//(*env)->ReleaseStringUTFChars(env, string2, query);
if (c) cl_free(c);
if (sc) cl_free(sc);

(*env)->ReleaseStringChars(env, motherCorpus, mother);
//(*env)->ReleaseStringChars(env, subcorpus, child);
//(*env)->ReleaseStringChars(env, jquery, query);
return toBoolean(env, obj, JNI_TRUE);

Other tips:
* we may assume that the Java layer passes a unique corpus id to the native layer, and remove the check_subcorpus_name() tests
* passing arrays through JNI seems to be costly; we could perhaps use a buffer instead
* manage the test "if (query_has_semicolon(query))" in another way, e.g. by adding a function parameter