Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / AnalecUtils.groovy @ 2167

History | View | Annotate | Download (18.4 kB)

1
package org.txm.macro.urs
2

    
3
import org.txm.searchengine.cqp.corpus.Property
4
import org.txm.searchengine.cqp.corpus.Subcorpus
5
import org.txm.searchengine.cqp.corpus.query.Match
6
import org.txm.searchengine.cqp.corpus.query.CQLQuery
7
import visuAnalec.donnees.*
8
import visuAnalec.elements.*
9

    
10
import org.apache.commons.lang.StringUtils
11

    
12

    
13
/**
 * Checks, from a URSQL expression, which types of the corpus structure lack a property.
 *
 * The expression is split with getFilterParameters(); its first two parts are handed to
 * the (clazz, corpus, typeRegexp, propRegexp) overload.
 *
 * @param clazz the element class whose types are inspected
 * @param analecCorpus the corpus providing the structure
 * @param ursql the URSQL expression; null or empty returns an empty set
 * @return the set returned by the 4-argument overload
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String ursql) {
	if (!ursql) {
		return new HashSet()
	}
	def (typeRegexp, propRegexp) = getFilterParameters(ursql)
	return isPropertyDefined(clazz, analecCorpus, typeRegexp, propRegexp)
}
20

    
21
/**
 * Lists the types matching typeRegexp that declare no property name matching propRegexp.
 *
 * @param clazz the element class whose types are inspected
 * @param analecCorpus the corpus providing the structure
 * @param typeRegexp regular expression a type must fully match to be tested
 * @param propRegexp regular expression for the property name; null or empty returns an empty set
 * @return a HashSet of the tested types without any matching property name
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String typeRegexp, String propRegexp) {
	def typesWithoutProperty = new HashSet()
	if (!propRegexp) return typesWithoutProperty

	Structure structure = analecCorpus.getStructure()
	for (def typeName : structure.getTypes(clazz)) {
		// only types matching typeRegexp are tested
		if (typeName.matches(typeRegexp)) {
			boolean found = false
			for (def propName : structure.getNomsProps(clazz, typeName)) {
				if (propName.matches(propRegexp)) {
					found = true
					break
				}
			}
			if (!found) typesWithoutProperty.add(typeName)
		}
	}
	return typesWithoutProperty
}
40

    
41
/**
 * Adds newProperty to every type of clazz whose name matches the type part of the URSQL
 * expression and that does not already list this property.
 *
 * @param clazz the element class whose types are updated
 * @param analecCorpus the corpus providing the structure
 * @param ursql URSQL expression; only its type part (first parameter) is used
 * @param newProperty the property name to add
 */
static def defineProperty(Class clazz, Corpus analecCorpus, String ursql, String newProperty) {
	def typeRegexp = getFilterParameters(ursql)[0]
	Structure structure = analecCorpus.getStructure()
	for (def typeName : structure.getTypes(clazz)) {
		// only types matching typeRegexp are updated
		if (typeName.matches(typeRegexp) && !structure.getNomsProps(clazz, typeName).contains(newProperty)) {
			structure.ajouterProp(clazz, typeName, newProperty)
		}
	}
}
53

    
54
/**
 * Selects the schemas of the corpus, optionally filtered by a URSQL expression, then by size.
 *
 * @param debug verbosity level; a message is printed when >= 2
 * @param analecCorpus the corpus to read schemas from
 * @param schema_ursql URSQL filter; when null or empty all schemas are taken
 * @param minimum_schema_size lower size bound, negative values are replaced by 0
 * @param maximum_schema_size upper size bound, values <= 0 are replaced by Integer.MAX_VALUE
 * @return the result of filterBySize() on the candidate schemas
 */
static def selectSchemas(def debug, Corpus analecCorpus, String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {
	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE
	if (minimum_schema_size < 0) minimum_schema_size = 0

	def candidates = schema_ursql ?
			AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql) :
			analecCorpus.getTousSchemas()

	if (debug >= 2) println "allSchemas=${candidates.size()}"
	return AnalecUtils.filterBySize(candidates, minimum_schema_size, maximum_schema_size)
}
67

    
68
/**
 * Selects the schemas whose number of units located in the corpus matches lies within the
 * given size bounds.
 *
 * @param debug verbosity level forwarded to the helpers
 * @param analecCorpus the corpus to read schemas from
 * @param corpus the CQP corpus whose matches delimit the units
 * @param schema_ursql URSQL filter; when null or empty all schemas are taken
 * @param minimum_schema_size lower bound, negative values are replaced by 0
 * @param maximum_schema_size upper bound, values <= 0 are replaced by Integer.MAX_VALUE
 * @param strictInclusion forwarded to filterUniteByInclusion()
 * @return the list of retained schemas
 */
static def selectSchemasInCorpus(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size, boolean strictInclusion) {

	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE
	if (minimum_schema_size < 0) minimum_schema_size = 0

	def candidates = schema_ursql ?
			AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql) :
			analecCorpus.getTousSchemas()

	def retained = []
	for (Schema schema : candidates) {
		// position 0: keep every unit found in the matches
		def unitsInCorpus = AnalecUtils.filterUniteByInclusion(debug, schema.getUnitesSousjacentes(), corpus.getMatches(), strictInclusion, 0)
		int unitCount = unitsInCorpus.size()
		if (unitCount >= minimum_schema_size && unitCount <= maximum_schema_size) {
			retained << schema
		}
	}
	return retained
}
89

    
90
/**
 * Selects units, optionally through a selection of schemas. If no schema criteria are given,
 * all units matching unit_ursql are taken as one single group named "all".
 *
 * Each group is then restricted to the units located in the limiting matches, and the group
 * is kept only if its number of remaining units lies within the size bounds.
 *
 * @param debug verbosity level; messages are printed when >= 2
 * @param analecCorpus the corpus to read schemas and units from
 * @param corpus the CQP corpus, used for its matches or to build the cql_limit subcorpus
 * @param schema_ursql URSQL filter on schemas; null or empty disables the schema step
 * @param minimum_schema_size lower bound on the group size; null or negative means 0
 * @param maximum_schema_size upper bound on the group size; null or <= 0 means no limit
 *        (same convention as selectSchemas, selectSchemasInCorpus and filterBySize)
 * @param unit_ursql URSQL filter on units
 * @param position_in_schema forwarded to filterUniteByInclusionInSchema()
 * @param cql_limit optional CQL query limiting the matches; ignored when null or equal to ""
 * @param strict_inclusion forwarded to filterUniteByInclusion()
 * @param position_in_matches forwarded to filterUniteByInclusion()
 * @return the sorted list of selected units
 */
static def selectUnitsInSchema(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size,
		String unit_ursql, Integer position_in_schema, CQLQuery cql_limit, Boolean strict_inclusion, int position_in_matches) {

	// without this, a maximum of 0 (or null) rejected every non-empty group
	if (maximum_schema_size == null || maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE
	if (minimum_schema_size == null || minimum_schema_size < 0) minimum_schema_size = 0

	def groupedUnits = []
	if (schema_ursql != null && schema_ursql.length() > 0) {
		def allSchema = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
		if (debug >= 2) println "allSchema=${allSchema.size()}"

		groupedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchema, unit_ursql)
		if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"

		groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, position_in_schema)
		if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"
	} else {
		groupedUnits = ["all":AnalecUtils.findAllInCorpus(debug, analecCorpus, Unite.class, unit_ursql)]
	}
	if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"

	// limit units to corpus or cql_limit matches
	def matches = null
	if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) {
		Subcorpus limitssubcorpus = corpus.createSubcorpus(cql_limit, corpus.getID().toUpperCase())
		try {
			matches = limitssubcorpus.getMatches()
		} finally {
			// the subcorpus is only needed for its matches: always remove it
			limitssubcorpus.delete()
		}
	} else {
		matches = corpus.getMatches()
	}
	if (debug >= 2) println "matches=${matches}"

	def allUnits = []
	for (def k : groupedUnits.keySet()) {
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, groupedUnits[k], matches, strict_inclusion, position_in_matches)

		if (minimum_schema_size <= selectedUnits.size() && selectedUnits.size() <= maximum_schema_size) {
			allUnits.addAll(selectedUnits)
		}
	}
	if (debug >= 2) println "selectedUnits=${allUnits.size()}"

	Collections.sort(allUnits)

	return allUnits
}
154
/**
 * Filters the units of each group according to their position in the group.
 *
 * @param debug verbosity level; the distance is printed when >= 2
 * @param groups map [schema:units list]
 * @param distance null or 0 = no selection (groups returned as is), N > 0 = keep the N first
 *        units, N < 0 = keep the |N| last units. When a group holds fewer units than
 *        requested, the whole group is kept.
 * @return the groups unchanged when there is no selection, otherwise a new map with the same
 *         keys and the restricted unit lists (empty groups are copied unchanged)
 */
static def filterUniteByInclusionInSchema(def debug, def groups, Integer distance) {
	if (debug >= 2) println "dist=$distance"
	if (distance == null || distance == 0) return groups
	if (distance > 0) distance = distance - 1 // 0-based index of the last unit to keep

	def newGroups = [:]
	for (def k : groups.keySet()) {
		def group = groups[k]
		if (group.size() == 0) {
			newGroups[k] = group
			continue
		}
		def indexes = null
		if (distance >= 0) {
			// clamp to the last valid index, group.size() itself is out of bounds
			indexes = 0..Math.min(distance, group.size() - 1)
		} else {
			indexes = Math.max(distance, -group.size())..-1
		}

		newGroups[k] = group[indexes]
	}
	return newGroups
}
184

    
185
/**
 * Copies the start (getDeb) and end (getFin) of each unit into two parallel int arrays.
 *
 * @param selectedUnits the units to read
 * @return a 3-element list [starts, ends, null]
 */
static def getStartsEndsTargetsArrays(def selectedUnits) {
	int count = selectedUnits.size()
	int[] starts = new int[count]
	int[] ends = new int[count]
	selectedUnits.eachWithIndex { unite, i ->
		starts[i] = unite.getDeb()
		ends[i] = unite.getFin()
	}
	return [starts, ends, null]
}
196

    
197
/**
 * Returns every position covered by the unit, in ascending order.
 *
 * A unit whose start is greater than its end is an error in the data; its bounds are
 * swapped so that the positions are still returned in ascending order.
 *
 * @param u the unit
 * @return the positions from min(deb, fin) to max(deb, fin), both included
 */
static int[] toIntArray(Unite u) {
	int first = Math.min(u.getDeb(), u.getFin())
	int last = Math.max(u.getDeb(), u.getFin())
	// direct coercion of the range instead of toArray() with an undersized primitive buffer
	return (first..last) as int[]
}
203

    
204
/**
 * Builds a short description of an element.
 *
 * - Unite: "deb-fin, props"
 * - Relation: "elt1=elt2 -> props" (both elements described recursively)
 * - Schema: "content size=props"
 *
 * @param e the element; its exact class is tested, not its super classes
 * @return the description, or null when e is not a Unite, a Relation or a Schema
 */
static String toString(Element e) {
	if (e.getClass() == Unite.class) {
		return sprintf("%d-%d, %s", e.getDeb(), e.getFin(), e.getProps().sort())
	} else if (e.getClass() == Relation.class) {
		return sprintf("%s=%s -> %s", toString(e.getElt1()), toString(e.getElt2()), e.getProps().sort())
	} else if (e.getClass() == Schema.class) {
		// the size is the number and the props are the text: "%s=%d" failed on the props
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
	}
	return null
}
214

    
215
/**
 * Builds a short description of an element, prefixed with the words it covers.
 *
 * The words are read with CQI.cpos2Str() for the positions given by toIntArray(), using the
 * qualified name of wordProperty, and joined with spaces.
 *
 * - Unite: "words deb-fin, props"
 * - Relation: "words1 elt1=words2 elt2 -> props"
 * - Schema: "content size=props" (no words)
 *
 * @param CQI the object used to read word values (must provide cpos2Str)
 * @param wordProperty the word property to display (must provide getQualifiedName)
 * @param e the element; its exact class is tested, not its super classes
 * @return the description, or null when e is not a Unite, a Relation or a Schema
 */
static String toString(def CQI, def wordProperty, Element e) {
	if (e.getClass() == Unite.class) {
		def form = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e)), " ")
		return sprintf("%s %d-%d, %s", form, e.getDeb(), e.getFin(), e.getProps().sort())
	} else if (e.getClass() == Relation.class) {
		def form1 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt1())), " ")
		def form2 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt2())), " ")
		return sprintf("%s=%s -> %s", form1+" "+toString(e.getElt1()), form2+" "+toString(e.getElt2()), e.getProps().sort())
	} else if (e.getClass() == Schema.class) {
		// the size is the number and the props are the text: "%s=%d" failed on the props
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
	}
	return null
}
229

    
230
/**
 * Finds the elements of the corpus selected by a URSQL expression.
 *
 * @param debug verbosity level; the parsed parameters are printed when >= 2
 * @param analecCorpus the corpus to search
 * @param elemClazz the class of elements to search
 * @param URSQL the expression, split with getFilterParameters()
 * @return the result of the overload taking the split parameters
 */
static def findAllInCorpus(def debug, def analecCorpus, Class elemClazz, String URSQL) {
	def parsed = getFilterParameters(URSQL)
	if (debug >= 2) println "PARAMS=$parsed"
	def (typeRegex, propName, eq, valueRegex) = parsed
	return findAllInCorpus(debug, analecCorpus, elemClazz, typeRegex, propName, eq, valueRegex)
}
235

    
236
/**
 * Finds the elements of the corpus of the given class and filters them with filterElements().
 *
 * @param debug verbosity level forwarded to filterElements()
 * @param analecCorpus the corpus to search
 * @param elemClazz Unite.class, Relation.class or Schema.class; null gathers units, relations
 *        and schemas together; any other class leaves the candidates null
 * @param typeRegex forwarded to filterElements()
 * @param propName forwarded to filterElements()
 * @param eq forwarded to filterElements()
 * @param valueRegex forwarded to filterElements()
 * @return the filtered elements
 */
static def findAllInCorpus(def debug, Corpus analecCorpus, Class elemClazz, String typeRegex, String propName, boolean eq, String valueRegex) {
	def candidates = null

	if (elemClazz == null) {
		candidates = []
		candidates.addAll(analecCorpus.getToutesUnites())
		candidates.addAll(analecCorpus.getToutesRelations())
		candidates.addAll(analecCorpus.getTousSchemas())
	} else if (elemClazz == Unite.class) {
		candidates = analecCorpus.getToutesUnites()
	} else if (elemClazz == Relation.class) {
		candidates = analecCorpus.getToutesRelations()
	} else if (elemClazz == Schema.class) {
		candidates = analecCorpus.getTousSchemas()
	}

	return filterElements(debug, candidates, typeRegex, propName, eq, valueRegex)
}
255

    
256
/**
 * Keeps the elements whose number of underlying units lies within the bounds.
 *
 * @param elements the elements to filter
 * @param minimum_schema_size lower bound; null or negative means 0
 * @param maximum_schema_size upper bound; null or <= 0 means Integer.MAX_VALUE
 * @return the list of retained elements
 */
static def filterBySize(def elements, Integer minimum_schema_size, Integer maximum_schema_size) {
	int upper = (maximum_schema_size == null || maximum_schema_size <= 0) ? Integer.MAX_VALUE : maximum_schema_size
	int lower = (minimum_schema_size == null || minimum_schema_size < 0) ? 0 : minimum_schema_size

	def kept = []
	for (Element e : elements) {
		Unite[] units = e.getUnitesSousjacentes()
		int count = units.length
		if (count < lower || count > upper) continue
		kept << e
	}
	return kept
}
270

    
271
/**
 * Groups units by CQP match.
 *
 * Units are sorted (by start, then by end) for faster processing: units and matches are then
 * walked in parallel. NOTE(review): this walk presumably expects the matches to be sorted
 * as well - confirm with the callers.
 *
 * A unit ending before the current match is skipped; a unit starting after the current match
 * moves the walk to the next match; otherwise the unit overlaps the match and is added to the
 * group of this match (only if fully included when strict_inclusion is set).
 *
 * @param debug verbosity level; messages are printed when >= 2 and >= 3
 * @param allUnites the units to group
 * @param matches the matches
 * @param strict_inclusion if true a unit must be fully included in the match to be selected
 * @return a LinkedHashMap [match index:units list], one entry per match index reached
 */
static def groupByMatch(def debug, def allUnites, def matches, boolean strict_inclusion) {
	if (debug >= 2) println "group "+allUnites.size()+" units with "+matches.size()+" strict=$strict_inclusion"
	allUnites = allUnites.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
	def unitsSize = allUnites.size()
	def iCurrentUnit = 0
	def selectedUnits = []

	def matchesSize = matches.size()
	def iCurrentMatch = 0

	def selectedUnitsPerMatch = new LinkedHashMap()
	selectedUnitsPerMatch[iCurrentMatch] = selectedUnits

	while (iCurrentMatch < matchesSize && iCurrentUnit < unitsSize) {
		if (debug >= 3) println "** M $iCurrentMatch < $matchesSize && U $iCurrentUnit < $unitsSize"

		Unite unit = allUnites[iCurrentUnit]
		Match match = matches[iCurrentMatch]
		if (debug >= 3) println ""+unit.getDeb()+"->"+unit.getFin()+"        "+match.getStart()+"->"+match.getEnd()
		if (unit.getFin() < match.getStart()) {
			// the unit ends before the match: try the next unit
			if (debug >= 3) println "next unit"

			iCurrentUnit++
		} else if (unit.getDeb() > match.getEnd()) {
			// the unit starts after the match: open the group of the next match
			if (debug >= 3) println "next match"

			iCurrentMatch++
			selectedUnits = []
			selectedUnitsPerMatch[iCurrentMatch] = selectedUnits
		} else {
			if (debug >= 3) println "iCurrentUnit=$iCurrentUnit        iCurrentMatch=$iCurrentMatch"
			if (strict_inclusion) {

				if (debug >= 3) println "m.start ${match.getStart()} <= u.deb ${unit.getDeb()} && u.fin ${unit.getFin()} <= m.end ${match.getEnd()}"
				if (match.getStart() <= unit.getDeb() && unit.getFin() <= match.getEnd()) {
					selectedUnits << unit
				}
			} else {
				selectedUnits << unit
			}

			iCurrentUnit++
		}
	}
	return selectedUnitsPerMatch
}
329

    
330
/**
 * Selects the units located in the matches, optionally keeping one unit per match.
 *
 * Units are first grouped with groupByMatch().
 *
 * @param debug verbosity level; the selected starts are printed when >= 3
 * @param allUnites the units to filter
 * @param matches the matches
 * @param strict_inclusion forwarded to groupByMatch()
 * @param position 0 = keep all the units of each match, N > 0 = keep the Nth unit of each
 *        match, N < 0 = keep the |N|th unit from the end of each match. A match without
 *        enough units contributes nothing.
 * @return the list of selected units
 */
static def filterUniteByInclusion(def debug, def allUnites, def matches, boolean strict_inclusion, int position) {

	def selectedUnitsPerMatch = groupByMatch(debug, allUnites, matches, strict_inclusion)
	def selectedUnits = []

	if (position == 0) {
		for (def m : selectedUnitsPerMatch.keySet()) selectedUnits.addAll(selectedUnitsPerMatch[m])
		return selectedUnits
	}

	// positive positions are 1-based, negative ones are already valid indexes from the end
	int index = position > 0 ? position - 1 : position

	for (def m : selectedUnitsPerMatch.keySet()) {
		def units = selectedUnitsPerMatch[m]
		int size = units.size()
		// a negative index needs at least |index| units, the former test only covered index >= 0
		boolean inRange = index >= 0 ? index < size : -index <= size
		if (!inRange) continue

		units = units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
		selectedUnits << units[index]
		if (debug >= 3) println "dist select: "+units[index].getDeb()
	}

	return selectedUnits
}
354

    
355
/**
 * Collects the units of the elements selected by a URSQL expression.
 *
 * @param debug verbosity level forwarded to the overload
 * @param elements the elements whose units are read
 * @param URSQL the expression, split with getFilterParameters()
 * @return the result of the overload taking the split parameters
 */
static def findAllUnitesInElements(def debug, def elements, String URSQL) {
	def (typeRegex, propName, eq, valueRegex) = getFilterParameters(URSQL)
	return findAllUnitesInElements(debug, elements, typeRegex, propName, eq, valueRegex)
}
359

    
360
/**
 * Collects, in one flat list, the underlying units of each element that pass filterElements().
 *
 * @param debug verbosity level forwarded to filterElements()
 * @param elements the elements whose units are read
 * @param typeRegex forwarded to filterElements()
 * @param propName forwarded to filterElements()
 * @param eq forwarded to filterElements()
 * @param valueRegex forwarded to filterElements()
 * @return the list of selected units
 */
static def findAllUnitesInElements(def debug, def elements, String typeRegex, String propName, boolean eq, String valueRegex) {
	def collected = []
	for (Element owner : elements) {
		def units = filterElements(debug, owner.getUnitesSousjacentes(), typeRegex, propName, eq, valueRegex)
		collected.addAll(units)
	}
	return collected
}
369

    
370
/**
 * Groups all units by element, without selection.
 *
 * Delegates to the overload taking (typeRegex, propName, eq, valueRegex) with empty
 * expressions, which disables the type and property filters of filterElements().
 *
 * @param debug verbosity level forwarded to the overload
 * @param elements the elements whose units are grouped
 * @return a map [element:units list]
 */
static def groupAllUnitesInElements(def debug, def elements) {
	// the eq flag was missing: no overload takes three Strings after the elements
	return groupAllUnitesInElements(debug, elements, "", "", true, "")
}
379

    
380
/**
 * Groups the units of each element, selected with a URSQL expression.
 *
 * @param debug verbosity level forwarded to the overload
 * @param elements the elements whose units are grouped
 * @param URSQL the expression, split with getFilterParameters()
 * @return the result of the overload taking the split parameters
 */
static def groupAllUnitesInElements(def debug, def elements, String URSQL) {
	def (typeRegex, propName, eq, valueRegex) = getFilterParameters(URSQL)
	return groupAllUnitesInElements(debug, elements, typeRegex, propName, eq, valueRegex)
}
391

    
392
/**
 * Associates each element with its underlying units that pass filterElements().
 *
 * @param debug verbosity level forwarded to filterElements()
 * @param elements the elements whose units are grouped
 * @param typeRegex forwarded to filterElements()
 * @param propName forwarded to filterElements()
 * @param eq forwarded to filterElements()
 * @param valueRegex forwarded to filterElements()
 * @return a map [element:selected units]
 */
static def groupAllUnitesInElements(def debug, def elements, String typeRegex, String propName, boolean eq, String valueRegex) {
	def unitsByElement = [:]
	for (Element element : elements) {
		def units = filterElements(debug, element.getUnitesSousjacentes(), typeRegex, propName, eq, valueRegex)
		unitsByElement.put(element, units)
	}
	return unitsByElement
}
401

    
402
/**
 * Splits a URSQL expression into its parts.
 *
 * Recognised forms:
 * - TYPE@PROP=VALUE and TYPE@PROP!=VALUE
 * - TYPE@PROP
 * - TYPE=VALUE (not well formed: the property stays empty)
 * - TYPE
 *
 * The operator is the first "=" of the expression; it is read as "!=" when it is directly
 * preceded by "!".
 *
 * @param URSQL the expression; null is read as the empty expression
 * @return the list [type, prop, eq, value] where eq is false only for the "!=" operator and
 *         the missing parts are empty strings
 */
static def getFilterParameters(String URSQL) {
	if (URSQL == null) URSQL = ""

	int atIdx = URSQL.indexOf("@")
	int operatorEnd = URSQL.indexOf("=") // the value starts right after this index
	int operatorStart = operatorEnd      // the property (or type) stops before this index
	int differentIdx = URSQL.indexOf("!=")
	boolean eq = differentIdx < 0 || differentIdx != operatorEnd - 1
	if (!eq) {
		operatorStart-- // the operator also covers the "!"
	}

	String type = ""
	String prop = ""
	String value = ""
	if (atIdx >= 0 && operatorStart >= 0 && atIdx < operatorStart) { // TYPE@PROP=VALUE
		type = URSQL.substring(0, atIdx)
		prop = URSQL.substring(atIdx + 1, operatorStart)
		value = URSQL.substring(operatorEnd + 1)
	} else if (atIdx >= 0) { // TYPE@PROP
		type = URSQL.substring(0, atIdx)
		prop = URSQL.substring(atIdx + 1)
	} else if (operatorStart >= 0) { // TYPE=VALUE -> not well formed
		type = URSQL.substring(0, operatorStart)
		value = URSQL.substring(operatorEnd + 1)
	} else { // TYPE
		type = URSQL
	}

	return [type, prop, eq, value]
}
434

    
435
/**
 * Filters elements with a URSQL expression.
 *
 * @param debug verbosity level forwarded to the overload
 * @param allElements the elements to filter
 * @param URSQL the expression, split with getFilterParameters()
 * @return the result of the overload taking the split parameters
 */
static def filterElements(def debug, def allElements, String URSQL) {
	def (typeRegex, propName, eq, valueRegex) = getFilterParameters(URSQL)
	return filterElements(debug, allElements, typeRegex, propName, eq, valueRegex)
}
439

    
440
/**
 * Filters elements in two steps: by type, then by property.
 *
 * - type step (skipped when typeRegex is null or empty): keeps the elements whose type
 *   fully matches typeRegex
 * - property step (skipped when propName is null or empty): with a valueRegex, keeps the
 *   elements whose property value matches (eq = true) or does not match (eq = false) the
 *   expression; without valueRegex, keeps the elements declaring the property
 *
 * @param debug verbosity level; step results are printed when >= 3
 * @param allElements the elements to filter
 * @param typeRegex regular expression on the element type
 * @param propName name of the property to test
 * @param eq true to keep matching values, false to keep the others
 * @param valueRegex regular expression on the property value
 * @return the filtered elements (allElements itself when no step applies)
 */
static def filterElements(def debug, def allElements, String typeRegex, String propName, boolean eq, String valueRegex) {
	if (debug >= 3) println "filtering "+allElements.size()+" elements with typeRegex='$typeRegex' propName='$propName' and valueRegex='$valueRegex'"

	if (typeRegex) {
		def typePattern = /$typeRegex/
		def sameType = []
		for (Element element : allElements) {
			if (element.getType() ==~ typePattern) sameType << element
		}
		allElements = sameType
	}
	if (debug >= 3) println " type step result: "+allElements.size()

	if (propName) {
		def kept = []
		if (valueRegex) { // select only elements with the prop&value
			def valuePattern = /$valueRegex/
			for (Element element : allElements) {
				boolean matched = element.getProp(propName) ==~ valuePattern
				// keep matches when eq, non-matches otherwise
				if (matched == eq) kept << element
			}
		} else { // select only elements with the prop
			for (Element element : allElements) {
				if (element.getProps().containsKey(propName)) kept << element
			}
		}
		allElements = kept
	}
	if (debug >= 3) println " prop&value step result: "+allElements.size()
	return allElements
}
480

    
481
/**
 * Builds the CQL query of the units with the default options: all the positions of each
 * unit, and the limitNumberOfUnit flag set.
 *
 * @param name the name used in messages
 * @param unites the units
 * @return the result of the 4-argument overload
 */
static def getCQL(String name, def unites) {
	boolean onePosition = false
	boolean limitNumberOfUnit = true
	return getCQL(name, unites, onePosition, limitNumberOfUnit)
}
484

    
485
/**
 * Builds a CQL query selecting the given units by their positions.
 *
 * The query has the form "left::right": the left part declares one labelled token sequence
 * per distinct unit size (the label is a letter chosen from the size), the right part lists
 * one "letter=start" constraint per unit. Both parts use "|" as separator.
 *
 * Units are sorted by start (the given list is sorted in place). Units whose end is before
 * their start are reported and skipped. Sizes greater than the number of available letters
 * are reduced. The query stops growing before reaching the maximum query size: the remaining
 * units are reported and ignored.
 *
 * @param name the name used in messages
 * @param unites the units
 * @param onePosition to return 1 token per match (only the first position of each unit)
 * @param limitNumberOfUnit currently not used
 * @return the query; "::" when no unit is written
 */
static def getCQL(String name, def unites, boolean onePosition, boolean limitNumberOfUnit) {
	def letters = "abcdefghijklmnopqrstu"//vwxyz0123456789"
	def MAXCQLQUERYSIZE = 1200 // 1150 // 1200 in fact

	int n = 0

	String totalleftquery = ""
	String totalrightquery = ""
	unites.sort() { it.getDeb() }
	def declaredsizes = []
	for (Unite unite : unites) {
		int size = unite.getFin() - unite.getDeb() + 1
		if (size <= 0) { // a size of 0 has no letter either: letters.charAt(-1) would fail
			println sprintf("** Warning: incoherent unit %s [%d, %d], size = "+size, unite.getProps(),unite.getDeb(), unite.getFin())
			continue
		}
		if (onePosition) size = 1 // hack only the 1st position is needed for the Progression
		if (size > letters.length()) size = letters.length()-1
		String letter = ""+letters.charAt(size-1)
		String rightquery = letter+"="+unite.getDeb()

		// a size is declared in the left part only the first time it is met
		String leftquery = ""
		if (!declaredsizes.contains(size)) {
			declaredsizes << size

			if (size == 1)
				leftquery = letter+":[]"
			else if (size == 2)
				leftquery = letter+":[][]"
			else if (size == 3)
				leftquery = letter+":[][][]"             // [][][][]
			else
				leftquery = letter+":[][]{"+(size-1)+"}" // [][]{4}
		}

		if ((totalleftquery.length() + totalrightquery.length() + 2
		+ leftquery.length() + rightquery.length()) >= MAXCQLQUERYSIZE) {
			System.out.println("** $name : trop d'éléments pour la requête. Seuls les "+n+" premiers éléments sur ${unites.size()} seront affichés dans le graphique de progression.")
			break
		}

		if (n > 0) {
			if (leftquery.length() > 0) totalleftquery += "|"
			totalrightquery += "|"
		}
		if (leftquery.length() > 0) totalleftquery += leftquery
		totalrightquery += rightquery

		n += 1
	}
	String query = totalleftquery+"::"+totalrightquery
	return query
}