Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / AnalecUtils.groovy @ 2144

History | View | Annotate | Download (17.7 kB)

1
package org.txm.macro.urs
2

    
3
import org.txm.searchengine.cqp.corpus.Property
4
import org.txm.searchengine.cqp.corpus.Subcorpus
5
import org.txm.searchengine.cqp.corpus.query.Match
6
import org.txm.searchengine.cqp.corpus.query.CQLQuery
7
import visuAnalec.donnees.*
8
import visuAnalec.elements.*
9

    
10
import org.apache.commons.lang.StringUtils
11

    
12

    
13
/**
 * Checks that the property named in a URSQL expression is declared for the types it selects.
 *
 * The expression is split with getFilterParameters; only its type and property parts are used.
 *
 * @param clazz class forwarded to the structure lookups
 * @param analecCorpus corpus whose structure is inspected
 * @param ursql URSQL expression (TYPE@PROP=VALUE); may be null or empty
 * @return an empty HashSet when ursql is null or empty, otherwise the result of the
 *         four-argument isPropertyDefined
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String ursql) {
	if (!ursql) return new HashSet()

	def (typeRegexp, propRegexp) = getFilterParameters(ursql)
	return isPropertyDefined(clazz, analecCorpus, typeRegexp, propRegexp)
}
20

    
21
/**
 * Lists the types matching typeRegexp that declare no property matching propRegexp.
 *
 * @param clazz class forwarded to structure.getTypes and structure.getNomsProps
 * @param analecCorpus corpus whose structure is inspected
 * @param typeRegexp regular expression a type must fully match to be tested
 * @param propRegexp regular expression a property name must fully match; null or empty disables the check
 * @return a HashSet of the tested types without any matching property (empty when propRegexp is null or empty)
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String typeRegexp, String propRegexp) {
	def undefinedTypes = new HashSet()
	if (!propRegexp) return undefinedTypes

	Structure structure = analecCorpus.getStructure()
	for (def type : structure.getTypes(clazz)) {
		if (type.matches(typeRegexp)) { // only types matching typeRegexp are tested
			boolean declared = false
			for (def propertyName : structure.getNomsProps(clazz, type)) {
				if (propertyName.matches(propRegexp)) declared = true
			}
			if (!declared) undefinedTypes << type
		}
	}

	return undefinedTypes
}
40

    
41
/**
 * Adds a property to every type selected by a URSQL expression that does not declare it yet.
 *
 * Only the type part of the expression is used.
 *
 * @param clazz class forwarded to the structure lookups
 * @param analecCorpus corpus whose structure is modified
 * @param ursql URSQL expression whose type part selects the types to update
 * @param newProperty name of the property to add
 */
static def defineProperty(Class clazz, Corpus analecCorpus, String ursql, String newProperty) {
	def typeRegexp = getFilterParameters(ursql)[0]
	Structure structure = analecCorpus.getStructure()
	for (def type : structure.getTypes(clazz)) {
		boolean selected = type.matches(typeRegexp) // only types matching typeRegexp are updated
		if (selected && !structure.getNomsProps(clazz, type).contains(newProperty)) {
			structure.ajouterProp(clazz, type, newProperty)
		}
	}
}
53

    
54
/**
 * Selects the schemas of a corpus, optionally restricted by a URSQL expression, then filters them by size.
 *
 * @param debug verbosity level; the number of candidate schemas is printed when it is at least 2
 * @param analecCorpus corpus providing the schemas
 * @param schema_ursql URSQL expression selecting schemas; null or empty selects all schemas
 * @param minimum_schema_size lower size bound; negative values are replaced by 0
 * @param maximum_schema_size upper size bound; values lower than or equal to 0 are replaced by Integer.MAX_VALUE
 * @return the schemas kept by AnalecUtils.filterBySize
 */
static def selectSchemas(def debug, Corpus analecCorpus, String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {
	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE
	if (minimum_schema_size < 0) minimum_schema_size = 0

	def candidates
	if (schema_ursql) {
		candidates = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
	} else {
		candidates = analecCorpus.getTousSchemas()
	}

	if (debug >= 2) println "allSchemas=${candidates.size()}"
	return AnalecUtils.filterBySize(candidates, minimum_schema_size, maximum_schema_size)
}
67

    
68
/**
 * Selects the schemas whose number of units located in the corpus matches lies within the size bounds.
 *
 * For each candidate schema, its underlying units are filtered with AnalecUtils.filterUniteByInclusion
 * against corpus.getMatches(), and the schema is kept when the number of remaining units is in
 * [minimum_schema_size, maximum_schema_size].
 *
 * @param debug verbosity level forwarded to the helpers
 * @param analecCorpus corpus providing the schemas
 * @param corpus CQP corpus whose matches delimit the units to count
 * @param schema_ursql URSQL expression selecting schemas; null or empty selects all schemas
 * @param minimum_schema_size lower bound; negative values are replaced by 0
 * @param maximum_schema_size upper bound; values lower than or equal to 0 are replaced by Integer.MAX_VALUE
 * @param strictInclusion forwarded to filterUniteByInclusion
 * @return the list of selected schemas
 */
static def selectSchemasInCorpus(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size, boolean strictInclusion) {

	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE
	if (minimum_schema_size < 0) minimum_schema_size = 0

	def candidates
	if (schema_ursql) {
		candidates = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
	} else {
		candidates = analecCorpus.getTousSchemas()
	}

	def keptSchemas = []
	for (Schema schema : candidates) {
		def unitsInCorpus = AnalecUtils.filterUniteByInclusion(debug, schema.getUnitesSousjacentes(), corpus.getMatches(), strictInclusion, 0)
		int count = unitsInCorpus.size()
		if (count < minimum_schema_size || count > maximum_schema_size) continue
		keptSchemas << schema
	}

	return keptSchemas
}
89

    
90
/**
 * Selects units from a selection of schemas. If no schema criteria are given, all units are selected
 * and only the unit criteria are applied.
 *
 * Schema criteria are considered present when schema_ursql is non-empty or minimum_schema_size is greater than 1.
 * The selected units are then restricted to the matches of cql_limit (when it is set and is not the
 * empty query "") or to the matches of the corpus.
 *
 * @param debug verbosity level; intermediate sizes are printed when it is at least 2
 * @param analecCorpus corpus providing schemas and units
 * @param corpus CQP corpus used to compute the limiting matches
 * @param schema_ursql URSQL expression selecting schemas; may be null or empty
 * @param minimum_schema_size lower schema size bound, forwarded to filterBySize
 * @param maximum_schema_size upper schema size bound, forwarded to filterBySize
 * @param unit_ursql URSQL expression selecting units
 * @param position_in_schema when greater than or equal to 0, forwarded to filterUniteByInclusionInSchema
 * @param cql_limit optional CQL query limiting the units; a temporary subcorpus is created to get its matches
 * @param strict_inclusion forwarded to filterUniteByInclusion
 * @param position forwarded to filterUniteByInclusion
 * @return the sorted list of selected units
 */
static def selectUnitsInSchema(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size,
		String unit_ursql, Integer position_in_schema, CQLQuery cql_limit, Boolean strict_inclusion, int position) {
	def groupedUnits = []
	if (schema_ursql != null && schema_ursql.length() > 0 || minimum_schema_size > 1) {
		def allSchema = null

		if (schema_ursql != null && schema_ursql.length() > 0) allSchema = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
		else allSchema = analecCorpus.getTousSchemas()
		if (debug >= 2) println "allSchema=${allSchema.size()}"

		allSchema = AnalecUtils.filterBySize(allSchema, minimum_schema_size, maximum_schema_size)
		if (debug >= 2) println "allSchema=${allSchema.size()}"

		groupedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchema, unit_ursql)

		if (position_in_schema >= 0) groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, position_in_schema)
	} else {
		// no schema criteria: a single group holding every unit selected by unit_ursql
		groupedUnits = ["all":AnalecUtils.findAllInCorpus(debug, analecCorpus, Unite.class, unit_ursql)]
	}
	if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"

	// limit units to corpus or cql_limit matches
	def matches = null
	if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) {
		Subcorpus limitssubcorpus = corpus.createSubcorpus(cql_limit, corpus.getID().toUpperCase())
		try {
			matches = limitssubcorpus.getMatches()
		} finally {
			// the subcorpus is only a temporary holder of the matches: always remove it, even on failure
			limitssubcorpus.delete()
		}
	} else {
		matches = corpus.getMatches()
	}
	if (debug >= 2) println "matches=${matches}"

	def allUnits = []
	for (def k : groupedUnits.keySet()) {
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, groupedUnits[k], matches, strict_inclusion, position)
		allUnits.addAll(selectedUnits)
	}
	if (debug >= 2) println "selectedUnits=${allUnits.size()}"

	Collections.sort(allUnits)

	return allUnits
}
149
/**
 * Keeps, in each group, only the element found at the given position.
 *
 * Positive distances are 1-based from the start of the group (1 = first, 2 = second),
 * negative distances count from the end (-1 = last, -2 = second to last).
 * A group that is too small for the requested position yields an empty list.
 *
 * @param debug verbosity level (not used here)
 * @param groups map of group key to list of elements
 * @param distance 0 or null = no selection, 1 = first, 2 = second, -1 = last, -2 = second to last
 * @return groups itself when no selection is requested, otherwise a new map with the same keys
 *         whose values hold at most one element
 */
static def filterUniteByInclusionInSchema(def debug, def groups, Integer distance) {
	if (distance == null || distance == 0) return groups

	// only positive distances need shifting to a 0-based index; negative ones already address the end of the list
	int index = distance > 0 ? distance - 1 : distance
	def newGroups = [:]
	for (def k : groups.keySet()) {
		def group = groups[k]
		int size = group.size()
		boolean inRange = index >= 0 ? index < size : -index <= size
		newGroups[k] = inRange ? [group[index]] : []
	}
	return newGroups
}
170

    
171
/**
 * Extracts the start and end positions of units into two parallel int arrays.
 *
 * @param selectedUnits the units to read; each one must answer getDeb() and getFin()
 * @return a three-element list [starts, ends, null]; the third slot (targets) is always null
 */
static def getStartsEndsTargetsArrays(def selectedUnits) {
	int count = selectedUnits.size()
	int[] starts = new int[count]
	int[] ends = new int[count]
	selectedUnits.eachWithIndex { unite, i ->
		starts[i] = unite.getDeb()
		ends[i] = unite.getFin()
	}
	return [starts, ends, null]
}
182

    
183
/**
 * Lists every position covered by a unit, in ascending order.
 *
 * A unit whose start is greater than its end is treated as if its bounds were swapped.
 *
 * @param u the unit to expand
 * @return the positions from min(deb, fin) to max(deb, fin), both included
 */
static int[] toIntArray(Unite u) {
	int first = Math.min(u.getDeb(), u.getFin())
	int last = Math.max(u.getDeb(), u.getFin())
	// coerce the inclusive range directly instead of passing an undersized primitive array to toArray()
	return (first..last) as int[]
}
189

    
190
/**
 * Builds a short textual description of an element.
 *
 * <ul>
 * <li>Unite: "deb-fin, props"</li>
 * <li>Relation: "elt1=elt2 -> props", each end being described recursively</li>
 * <li>Schema: "size=props", size being the number of elements of its content</li>
 * </ul>
 *
 * @param e the element to describe
 * @return the description, or null when the class of e is none of Unite, Relation or Schema
 */
static String toString(Element e) {
	if (e.getClass() == Unite.class)
		return sprintf("%d-%d, %s", e.getDeb(), e.getFin(), e.getProps().sort())
	else if (e.getClass() == Relation.class)
		return sprintf("%s=%s -> %s", toString(e.getElt1()), toString(e.getElt2()), e.getProps().sort())
	else if (e.getClass() == Schema.class)
		// the size is the number and the properties are the text: "%s=%d" made %d receive the properties and throw
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
	return null
}
200

    
201
/**
 * Builds a textual description of an element, prefixed with the corpus words it covers.
 *
 * The words are read with CQI.cpos2Str for the positions returned by toIntArray and joined with spaces.
 *
 * @param CQI object answering cpos2Str(qualifiedPropertyName, positions)
 * @param wordProperty property whose qualified name is passed to cpos2Str
 * @param e the element to describe
 * @return the description, or null when the class of e is none of Unite, Relation or Schema
 */
static String toString(def CQI, def wordProperty, Element e) {
	if (e.getClass() == Unite.class) {
		def form = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e)), " ")
		return sprintf("%s %d-%d, %s", form, e.getDeb(), e.getFin(), e.getProps().sort())
	} else if (e.getClass() == Relation.class) {
		def form1 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt1())), " ")
		def form2 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt2())), " ")
		return sprintf("%s=%s -> %s", form1+" "+toString(e.getElt1()), form2+" "+toString(e.getElt2()), e.getProps().sort())
	} else if (e.getClass() == Schema.class) {
		// the size is the number and the properties are the text: "%s=%d" made %d receive the properties and throw
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
	}
	return null
}
215

    
216
/**
 * Finds the elements of a corpus selected by a URSQL expression.
 *
 * @param debug verbosity level; the parsed parameters are printed when it is at least 2
 * @param analecCorpus corpus to search
 * @param elemClazz class of the elements to search, forwarded to the six-argument overload
 * @param URSQL expression split by getFilterParameters into type, property and value
 * @return the result of the six-argument findAllInCorpus
 */
static def findAllInCorpus(def debug, def analecCorpus, Class elemClazz, String URSQL) {
	def params = getFilterParameters(URSQL)
	if (debug >= 2) println "PARAMS=$params"

	def (typeRegex, propName, valueRegex) = params
	return findAllInCorpus(debug, analecCorpus, elemClazz, typeRegex, propName, valueRegex)
}
221

    
222
/**
 * Finds the elements of a corpus matching a type, a property name and a property value.
 *
 * @param debug verbosity level forwarded to filterElements
 * @param analecCorpus corpus to search
 * @param elemClazz Unite, Relation or Schema to search one kind of element; null to search all three kinds;
 *        any other class selects nothing
 * @param typeRegex regular expression on the element type; null or empty keeps every type
 * @param propName name of the property the elements must have; null or empty disables the property filter
 * @param valueRegex regular expression on the property value; null or empty only tests the property presence
 * @return the elements kept by filterElements
 */
static def findAllInCorpus(def debug, Corpus analecCorpus, Class elemClazz, String typeRegex, String propName, String valueRegex) {
	// start from an empty list so that an unsupported class selects nothing instead of leaving a null behind
	def allElements = []

	if (elemClazz == null) {
		allElements.addAll(analecCorpus.getToutesUnites())
		allElements.addAll(analecCorpus.getToutesRelations())
		allElements.addAll(analecCorpus.getTousSchemas())
	} else if (elemClazz == Unite.class) {
		allElements = analecCorpus.getToutesUnites()
	} else if (elemClazz == Relation.class) {
		allElements = analecCorpus.getToutesRelations()
	} else if (elemClazz == Schema.class) {
		allElements = analecCorpus.getTousSchemas()
	}

	return filterElements(debug, allElements, typeRegex, propName, valueRegex)
}
241

    
242
/**
 * Keeps the elements whose number of underlying units lies within the given bounds (both included).
 *
 * @param elements the elements to filter
 * @param minimum_schema_size lower bound; null or negative means 0
 * @param maximum_schema_size upper bound; null, 0 or negative means Integer.MAX_VALUE
 * @return a new list with the elements kept, in their original order
 */
static def filterBySize(def elements, Integer minimum_schema_size, Integer maximum_schema_size) {
	int upper = (maximum_schema_size == null || maximum_schema_size <= 0) ? Integer.MAX_VALUE : maximum_schema_size
	int lower = (minimum_schema_size == null || minimum_schema_size < 0) ? 0 : minimum_schema_size

	def kept = []
	for (Element e : elements) {
		Unite[] units = e.getUnitesSousjacentes()
		int size = units.length
		if (lower <= size && size <= upper) kept << e
	}
	return kept
}
256

    
257
/**
 * Groups units by CQP match.
 *
 * Units are sorted by start then end position, then units and matches are walked in parallel.
 * The walk assumes the matches are already in increasing position order - TODO confirm with callers.
 * A unit is attached to the first match it overlaps; with strict inclusion it is attached only when
 * it lies entirely inside that match, and is dropped otherwise.
 *
 * @param debug verbosity level; progress is traced when it is at least 2 and detailed when at least 3
 * @param allUnites the units to group
 * @param matches the matches delimiting the groups
 * @param strict_inclusion true to keep only the units fully included in a match
 * @return a LinkedHashMap from match index to the list of units attached to that match; an entry is
 *         created for index 0 and each time the walk moves on to the next match index
 */
static def groupByMatch(def debug, def allUnites, def matches, boolean strict_inclusion) {
	if (debug >= 2) println "group "+allUnites.size()+" units with "+matches.size()+" strict=$strict_inclusion"
	allUnites = allUnites.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
	def unitsSize = allUnites.size()
	def iCurrentUnit = 0
	def selectedUnits = []

	def matchesSize = matches.size()
	def iCurrentMatch = 0

	def selectedUnitsPerMatch = new LinkedHashMap()
	selectedUnitsPerMatch[iCurrentMatch] = selectedUnits

	while (iCurrentMatch < matchesSize && iCurrentUnit < unitsSize) {
		if (debug >= 2) println "** M $iCurrentMatch < $matchesSize && U $iCurrentUnit < $unitsSize"

		Unite unit = allUnites[iCurrentUnit]
		Match match = matches[iCurrentMatch]
		if (debug >= 3) println ""+unit.getDeb()+"->"+unit.getFin()+"        "+match.getStart()+"->"+match.getEnd()
		if (unit.getFin() < match.getStart()) {
			// the unit ends before the match starts: skip the unit
			if (debug >= 3) println "next unit"

			iCurrentUnit++
		} else if (unit.getDeb() > match.getEnd()) {
			// the unit starts after the match ends: open the group of the next match
			if (debug >= 3) println "next match"

			iCurrentMatch++
			selectedUnits = []
			selectedUnitsPerMatch[iCurrentMatch] = selectedUnits
		} else {
			// the unit overlaps the match
			if (debug >= 3) println "iCurrentUnit=$iCurrentUnit        iCurrentMatch=$iCurrentMatch"
			if (strict_inclusion) {
				if (debug >= 3) println "m.start ${match.getStart()} <= u.deb ${unit.getDeb()} && u.fin ${unit.getFin()} <= m.end ${match.getEnd()}"
				if (match.getStart() <= unit.getDeb() && unit.getFin() <= match.getEnd()) {
					selectedUnits << unit
				}
			} else {
				selectedUnits << unit
			}

			iCurrentUnit++
		}
	}
	return selectedUnitsPerMatch
}
315

    
316
/**
 * Selects the units located in the matches, optionally keeping one unit per match.
 *
 * Units are first grouped with groupByMatch. With position 0 every grouped unit is returned.
 * Otherwise, for each match, the units are sorted by start then end and the unit at the given
 * position is kept: positive positions are 1-based from the start (1 = first), negative positions
 * count from the end (-1 = last). Matches holding too few units contribute nothing.
 *
 * @param debug verbosity level; the selected unit starts are printed when it is at least 3
 * @param allUnites the units to filter
 * @param matches the matches the units must be located in
 * @param strict_inclusion forwarded to groupByMatch
 * @param position 0 = all units, n > 0 = n-th unit of each match, n < 0 = n-th unit from the end
 * @return the list of selected units
 */
static def filterUniteByInclusion(def debug, def allUnites, def matches, boolean strict_inclusion, int position) {
	def selectedUnitsPerMatch = groupByMatch(debug, allUnites, matches, strict_inclusion)
	def selectedUnits = []

	if (position == 0) {
		for (def m : selectedUnitsPerMatch.keySet()) selectedUnits.addAll(selectedUnitsPerMatch[m])
		return selectedUnits
	}

	// only positive positions need shifting to a 0-based index
	int index = position > 0 ? position - 1 : position
	for (def m : selectedUnitsPerMatch.keySet()) {
		def units = selectedUnitsPerMatch[m]
		int size = units.size()
		// a negative index -k is valid only when the group holds at least k units
		boolean inRange = index >= 0 ? index < size : -index <= size
		if (!inRange) continue

		units = units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
		selectedUnits << units[index]
		if (debug >=3) println "dist select: "+units[index].getDeb()
	}

	return selectedUnits
}
340

    
341
/**
 * Collects the underlying units of elements that are selected by a URSQL expression.
 *
 * @param debug verbosity level forwarded to the five-argument overload
 * @param elements the elements whose underlying units are searched
 * @param URSQL expression split by getFilterParameters into type, property and value
 * @return the result of the five-argument findAllUnitesInElements
 */
static def findAllUnitesInElements(def debug, def elements, String URSQL) {
	def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
	return findAllUnitesInElements(debug, elements, typeRegex, propName, valueRegex)
}
345

    
346
/**
 * Collects, in one flat list, the underlying units of the elements that pass filterElements.
 *
 * @param debug verbosity level forwarded to filterElements
 * @param elements the elements whose underlying units are searched
 * @param typeRegex regular expression on the unit type
 * @param propName name of the property the units must have
 * @param valueRegex regular expression on the property value
 * @return the list of selected units, in element order
 */
static def findAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
	def collected = []

	for (Element element : elements) {
		def units = element.getUnitesSousjacentes()
		collected.addAll(filterElements(debug, units, typeRegex, propName, valueRegex))
	}

	return collected
}
355

    
356
/**
 * Groups the underlying units of each element, without any selection.
 *
 * @param debug verbosity level forwarded to the five-argument overload
 * @param elements the elements whose units are grouped
 * @return the result of the five-argument groupAllUnitesInElements called with empty
 *         type, property and value filters
 */
static def groupAllUnitesInElements(def debug, def elements) {
	String noFilter = ""
	return groupAllUnitesInElements(debug, elements, noFilter, noFilter, noFilter)
}
365

    
366
/**
 * Groups the underlying units of each element, keeping the units selected by a URSQL expression.
 *
 * @param debug verbosity level forwarded to the five-argument overload
 * @param elements the elements whose units are grouped
 * @param URSQL expression split by getFilterParameters into type, property and value
 * @return the result of the five-argument groupAllUnitesInElements
 */
static def groupAllUnitesInElements(def debug, def elements, String URSQL) {
	def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
	return groupAllUnitesInElements(debug, elements, typeRegex, propName, valueRegex)
}
377

    
378
/**
 * Builds a map from each element to its underlying units that pass filterElements.
 *
 * @param debug verbosity level forwarded to filterElements
 * @param elements the elements whose units are grouped
 * @param typeRegex regular expression on the unit type
 * @param propName name of the property the units must have
 * @param valueRegex regular expression on the property value
 * @return a map, in element order, from element to its selected units
 */
static def groupAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
	def unitsByElement = [:]

	for (Element element : elements) {
		def units = element.getUnitesSousjacentes()
		unitsByElement.put(element, filterElements(debug, units, typeRegex, propName, valueRegex))
	}

	return unitsByElement
}
387

    
388
/**
 * Splits a URSQL expression into its type, property and value parts.
 *
 * Recognized forms, tested in this order:
 * <ul>
 * <li>TYPE@PROP=VALUE, when the first "@" comes before the first "="</li>
 * <li>TYPE@PROP, when the expression contains "@"</li>
 * <li>TYPE=VALUE (not well formed), when the expression contains "=" only</li>
 * <li>TYPE otherwise</li>
 * </ul>
 *
 * @param URSQL the expression to split; null is handled like the empty expression
 * @return a three-element list [type, prop, value]; missing parts are empty strings
 */
static def getFilterParameters(String URSQL) {

	String type = ""
	String prop = ""
	String value = ""

	// callers forward optional expressions as is: no expression means no filter
	if (URSQL == null) return [type, prop, value]

	int atidx = URSQL.indexOf("@")
	int equalidx = URSQL.indexOf("=")

	if (atidx >= 0 && equalidx >= 0 && atidx < equalidx) { // TYPE@PROP=VALUE
		type = URSQL.substring(0, atidx)
		prop = URSQL.substring(atidx+1, equalidx)
		value = URSQL.substring(equalidx+1)
	} else if (atidx >= 0) { // TYPE@PROP
		type = URSQL.substring(0, atidx)
		prop = URSQL.substring(atidx+1)
	} else if (equalidx >= 0) { // TYPE=VALUE -> not well formed
		type = URSQL.substring(0, equalidx)
		value = URSQL.substring(equalidx+1)
	} else { // TYPE
		type = URSQL
	}

	return [type, prop, value]
}
414

    
415
/**
 * Filters elements with a URSQL expression.
 *
 * @param debug verbosity level forwarded to the five-argument overload
 * @param allElements the elements to filter
 * @param URSQL expression split by getFilterParameters into type, property and value
 * @return the result of the five-argument filterElements
 */
static def filterElements(def debug, def allElements, String URSQL) {
	def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
	return filterElements(debug, allElements, typeRegex, propName, valueRegex)
}
419

    
420
/**
 * Filters elements by type, then by property.
 *
 * The type step keeps the elements whose type fully matches typeRegex. The property step keeps the
 * elements whose propName value is not null and fully matches valueRegex or, when no valueRegex is
 * given, the elements whose properties contain the propName key. A null or empty filter skips its step.
 *
 * @param debug verbosity level; the size after each step is printed when it is at least 2
 * @param allElements the elements to filter
 * @param typeRegex regular expression on the element type; null or empty skips the type step
 * @param propName property name; null or empty skips the property step
 * @param valueRegex regular expression on the property value; null or empty only tests the property presence
 * @return allElements itself when no step applies, otherwise a new list with the elements kept
 */
static def filterElements(def debug, def allElements, String typeRegex, String propName, String valueRegex) {
	if (debug >= 2) println "filtering "+allElements.size()+" elements with typeRegex='$typeRegex' propName='$propName' and valueRegex='$valueRegex'"

	if (typeRegex) {
		def typePattern = /$typeRegex/
		def ofType = []
		for (Element element : allElements) {
			if (element.getType() ==~ typePattern) ofType << element
		}
		allElements = ofType
	}
	if (debug >= 2) println " type step result: "+allElements.size()

	if (propName) {
		def withProp = []
		if (valueRegex) { // select only elements with the prop&value
			def valuePattern = /$valueRegex/
			for (Element element : allElements) {
				def value = element.getProp(propName)
				if (value == null) continue
				if (value ==~ valuePattern) withProp << element
			}
		} else { // select only elements with the prop
			for (Element element : allElements) {
				if (element.getProps().containsKey(propName)) withProp << element
			}
		}
		allElements = withProp
	}
	if (debug >= 2) println " prop&value step result: "+allElements.size()

	return allElements
}
458

    
459
/**
 * Builds the CQL query locating the given units, covering the whole span of each unit.
 *
 * @param name label used in the warning printed when the query gets too long
 * @param unites the units to locate
 * @return the result of the four-argument getCQL called with onePosition=false and limitNumberOfUnit=true
 */
static def getCQL(String name, def unites) {
	boolean onePosition = false
	boolean limitNumberOfUnit = true
	return getCQL(name, unites, onePosition, limitNumberOfUnit)
}
462

    
463
/**
 * Builds a CQL query locating the given units by their start position.
 *
 * The query has the form LEFT::RIGHT. LEFT is a "|" separated list declaring one labelled token
 * sequence per distinct unit size (for instance a:[] or d:[][]{3}); RIGHT is a "|" separated
 * list of label=start constraints, one per unit. Sizes are capped using the length of the label alphabet.
 * Building stops, with a console warning, as soon as the query would reach the maximum size.
 *
 * The units list is sorted in place by start position.
 *
 * @param name label used in the warning printed when the query gets too long
 * @param unites the units to locate
 * @param onePosition true to use one token per unit whatever its size
 * @param limitNumberOfUnit not used by this implementation
 * @return the CQL query; "::" when no unit could be added
 */
static def getCQL(String name, def unites, boolean onePosition, boolean limitNumberOfUnit) {
	def letters = "abcdefghijklmnopqrstu"//vwxyz0123456789"
	def MAXCQLQUERYSIZE = 1200 // 1150 // 1200 in fact

	int n = 0

	String totalleftquery = ""
	String totalrightquery = ""
	unites.sort() { it.getDeb() }
	def declaredsizes = []
	for (Unite unite : unites) {
		int size = unite.getFin() - unite.getDeb() + 1
		if (size < 1) {
			// an empty unit (fin == deb-1) is as incoherent as a negative one and would index letters at -1
			println sprintf("** Warning: incoherent unit %s [%d, %d], size = "+size, unite.getProps(),unite.getDeb(), unite.getFin())
			continue
		}
		if (onePosition) size = 1 // hack only the 1st position is needed for the Progression
		if (size > letters.length()) size = letters.length()-1
		String letter = ""+letters.charAt(size-1)
		String rightquery = letter+"="+unite.getDeb()

		// a size is declared on the left side only the first time it is met
		String leftquery = ""
		if (!declaredsizes.contains(size)) {
			declaredsizes << size

			if (size == 1)
				leftquery = letter+":[]"
			else if (size == 2)
				leftquery = letter+":[][]"
			else if (size == 3)
				leftquery = letter+":[][][]"
			else
				leftquery = letter+":[][]{"+(size-1)+"}" // one token followed by size-1 tokens
		}

		if ((totalleftquery.length() + totalrightquery.length() + 2
		+ leftquery.length() + rightquery.length()) >= MAXCQLQUERYSIZE) {
			System.out.println("** $name : trop d'éléments pour la requête. Seuls les "+n+" premiers éléments sur ${unites.size()} seront affichés dans le graphique de progression.")
			break
		}

		if (n > 0) {
			if (leftquery.length() > 0) totalleftquery += "|"
			totalrightquery += "|"
		}
		if (leftquery.length() > 0) totalleftquery += leftquery
		totalrightquery += rightquery

		n += 1
	}
	String query = totalleftquery+"::"+totalrightquery
	return query
}