Task #1756

Improve partition creation duration process

Added by Sebastien Jacquot about 3 years ago. Updated about 3 years ago.

Status:New Start date:03/29/2016
Priority:Normal Due date:
Assignee:- % Done:


Category:- Spent time: -
Target version:TXM X.X



Partition creation can take a long time when there are many parts.
Profiling the process shows that the bottleneck is in the native method org.txm.searchengine.cqp.MemCqiServer.cqpQuery(String, String, String)
Here are some tests, kept for the record. They have not shown a really significant gain on Windows (maybe 5 or 10% shorter duration than the current method on a partition with 2400 parts), but measurements are tricky. The tests consist in reducing the number of native calls from Java, which can be expensive through JNI, by defining a native method that executes all the queries and by centralizing the multi-part creation on the Java side (rather than calling a native method for each part creation).

Steps, Java side:
  • create org.txm.searchengine.cqp.MemCqiServer.cqpQueries(String, String[], String[])
        public native Boolean cqpQueries(String arg0, String[] arg1, String[] arg2)
                throws IOException, UnexpectedAnswerException, CqiServerError ;
  • create org.txm.searchengine.cqp.ICqiClient.cqpQueries(String, String[], String[])
    public void cqpQueries(String motherCorpus, String[] subcorpusNamers, String[] queries) throws IOException, UnexpectedAnswerException, CqiServerError;
  • create org.txm.searchengine.cqp.MemCqiClient.cqpQueries(String, String[], String[])
        public void cqpQueries(String arg0, String[] arg1, String[] arg2) throws IOException, UnexpectedAnswerException, CqiServerError {
            Boolean ret = server.cqpQueries(arg0, arg1, arg2);
            if(ret == null || !ret) {
                int e = server.getErrorCode();
  • create org.txm.searchengine.cqp.corpus.Partition.createParts(String, List<String>, List<String>)
        private ArrayList<Part> createParts(String partitionName, List<String> partNames, List<String> queries) throws CqiClientException {
            ArrayList<Part> parts = new ArrayList<Part>(partNames.size());
            //Log.finest(NLS.bind(Messages.CREATING_PART,partName, query));
    //        long start = System.currentTimeMillis();
            ArrayList<String> cqpPartIds = new ArrayList<String>(partNames.size());
            for(int i = 0; i < partNames.size(); i++) {
                try {
                    String partCqpId = CqpObject.partNamePrefix + Corpus.getNextSubcorpusCounter();
                    parts.add(new Part(partCqpId, partitionName, partNames.get(i), this, new Query(queries.get(i))));
                catch(InvalidCqpIdException e) {
                    // TODO Auto-generated catch block
            try {
                CorpusManager.getCorpusManager().getCqiClient().cqpQueries(this.corpus.getQualifiedCqpId(), cqpPartIds.toArray(new String[cqpPartIds.size()]), queries.toArray(new String[queries.size()]));        
            } catch (Exception e) {
                try {
                    throw new CqiClientException(Messages.Partition_9 + partitionName + " last error: "+Toolbox.getCqiClient().getLastCQPError()); //$NON-NLS-1$ //$NON-NLS-2$
                } catch (Exception e1) {
                    return null;
        //    long end = System.currentTimeMillis();
            //Log.finest(NLS.bind(Messages.PART_CREATED, partitionName + "_" + partName, (end - start))); //$NON-NLS-1$//$NON-NLS-2$
            return parts;
  • create org.txm.searchengine.cqp.corpus.Partition.Partition(Corpus, String, List<String>, List<String>)
        public Partition(Corpus corpus, String name, List<String> queries,
                List<String> partnames) throws CqiClientException {
            this.corpus = corpus;
            this.structure = null;
            this.property = null;
            if (name == null || name.trim().length() == 0)
                name = "noname"; //$NON-NLS-1$
            this.name = name;
            Log.info(NLS.bind(Messages.NEW_PARTION, this.corpus, this.name));
            long start = System.currentTimeMillis();
            this.parts = new ArrayList<Part>();
            // FIXME: tests optimisations CQP LIB
            parts.addAll(createParts(name, partnames, queries));
    //        for (int i = 0; i < queries.size(); i++) {
    //            String queryS = queries.get(i);
    //            String partitionName = this.getName();
    //            String partName = partnames.get(i);
    //            if (partName.trim().length() == 0) partName = "-"; //$NON-NLS-1$
    //            Part part = createPart(partitionName, partName, queryS);
    //            parts.add(part);
    //        }
            long end = System.currentTimeMillis();
            Log.info(NLS.bind(Messages.PARTITION_CREATED, this.name, (end - start)));
Steps, native side:
  • create JNIEXPORT jobject JNICALL Java_org_txm_searchengine_cqp_MemCqiServer_cqpQueries
    (JNIEnv *, jobject, jstring, jobjectArray, jobjectArray); in MemCqiServer.h/MemCqiServer.c
JNIEXPORT jobject JNICALL Java_org_txm_searchengine_cqp_MemCqiServer_cqpQueries
(JNIEnv * env, jobject obj, jstring motherCorpus, jobjectArray subcorpusNames, jobjectArray jqueries) {

    char *child, *mother, *query, *c, *sc;
    jboolean iscopy;

    mother = (*env)->GetStringUTFChars(env, motherCorpus, &iscopy);

    int *children = (*env)->GetObjectArrayElement(env, subcorpusNames, NULL);
    int childrenCount = (*env)->GetArrayLength(env, subcorpusNames);

    int *queries = (*env)->GetObjectArrayElement(env, jqueries, NULL);

    if (!split_subcorpus_spec(mother, &c, &sc)) {
        (*env)->ReleaseStringChars(env, motherCorpus, mother);
        //(*env)->ReleaseStringChars(env, subcorpus, child);
        //(*env)->ReleaseStringChars(env, jquery, query);
        return throwException(env, obj);
    } else {

        int test2 = cqi_activate_corpus(mother);
        int i;
        for(i = 0; i < childrenCount; i++)    {

            child = (*env)->GetStringUTFChars(env, (*env)->GetObjectArrayElement(env, subcorpusNames, i), &iscopy);

            query = (*env)->GetStringUTFChars(env, (*env)->GetObjectArrayElement(env, jqueries, i), &iscopy);

/*            printf("\n ******* i = ");
            printf("%d", i);
            printf(" ******* child ");
            printf(" ******* query ");

//            fflush(stdout);

            char *cqp_query;
            int len = strlen(child) + strlen(query) + 10;

            cqp_query = (char *) cl_realloc(cqp_query, len);
            int test1 = check_subcorpus_name(child);

            //printf("\ntests results: subcorpus_name=%d activation=%d\n", test1, test2);
            if (!test1 || !test2) {

                (*env)->ReleaseStringChars(env, motherCorpus, mother);
                //(*env)->ReleaseStringChars(env, subcorpus, child);
                //(*env)->ReleaseStringChars(env, jquery, query);
                return throwException(env, obj);
            else {
                query_lock = floor(1e9 * cl_runif()) + 1; // activate query lock mode with random key

                //printf("CQPSERVER: query_lock = %d\n", query_lock);
                if (query_has_semicolon(query))
                    sprintf(cqp_query, "%s = %s", child, query);
                    sprintf(cqp_query, "%s = %s;", child, query);

                //printf("CQi: parsing %s\n", cqp_query);

                if (!cqp_parse_string(cqp_query)) { // parser and execute
                    fprintf(stderr, "start of throw exeption");
                    return throwCLException(env, obj);
                    //fprintf(stderr, "End of throw exeption");
                } else {
                    char *full_child = combine_subcorpus_spec(c, child); // c is the 'physical' part of the mother corpus
                    CorpusList *childcl = cqi_find_corpus(full_child);

                    if ((childcl) == NULL) {
                        (*env)->ReleaseStringChars(env, motherCorpus, mother);
                        //(*env)->ReleaseStringChars(env, subcorpus, child);
                        //(*env)->ReleaseStringChars(env, jquery, query);
                        return throwCLException(env, obj);
                    } else {
                        if (server_log) {
                            printf("'%s' ran the following query on %s\n", "cqplib", mother);
                            printf("\t%s\n", cqp_query);
                            printf("and got %d matches.\n", childcl->size);

                    if (full_child) cl_free(full_child);
                query_lock = 0;           // deactivate query lock mode

            if (cqp_query) cl_free(cqp_query);

            //(*env)->ReleaseStringUTFChars(env, string1, child);

            //(*env)->ReleaseStringUTFChars(env, string2, query);
    if (c) cl_free(c);
    if (sc) cl_free(sc);

    (*env)->ReleaseStringChars(env, motherCorpus, mother);
    //(*env)->ReleaseStringChars(env, subcorpus, child);
    //(*env)->ReleaseStringChars(env, jquery, query);
    return toBoolean(env, obj, JNI_TRUE);
Other tips:
  • we may assume that the Java layer passes unique corpus ids to the native layer, and remove the check_subcorpus_name() tests
  • passing arrays through JNI and getting the array elements seems to be expensive; we could perhaps use a buffer instead
  • manage the test "if (query_has_semicolon(query))" in another way, e.g. by adding a function parameter

Related issues

related to Feature #978: TBX: 0.7.6, lazy loading for subcorpora and partitions New 08/22/2014


#1 Updated by Sebastien Jacquot about 3 years ago

  • Description updated (diff)

#2 Updated by Sebastien Jacquot about 3 years ago

  • Description updated (diff)

#3 Updated by Sebastien Jacquot about 3 years ago

  • Description updated (diff)

Also available in: Atom PDF