Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / AnalecUtils.groovy @ 2134

History | View | Annotate | Download (17.5 kB)

1
package org.txm.macro.urs
2

    
3
import org.txm.searchengine.cqp.corpus.Property
4
import org.txm.searchengine.cqp.corpus.Subcorpus
5
import org.txm.searchengine.cqp.corpus.query.Match
6
import org.txm.searchengine.cqp.corpus.query.CQLQuery
7
import visuAnalec.donnees.*
8
import visuAnalec.elements.*
9

    
10
import org.apache.commons.lang.StringUtils
11

    
12

    
13
/**
 * Checks that a property is declared for the element types selected by a URSQL expression.
 *
 * @param clazz the Analec element class whose structure is inspected
 * @param analecCorpus the corpus providing the structure
 * @param ursql the selector; only its type and property parts are used
 * @return the set of selected types that declare no matching property;
 *         an empty set when ursql is null or empty
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String ursql) {
	if (!ursql) return new HashSet()

	def (typeRegexp, propRegexp) = getFilterParameters(ursql)
	return isPropertyDefined(clazz, analecCorpus, typeRegexp, propRegexp)
}
20

    
21
/**
 * Lists the element types that match typeRegexp but declare no property matching propRegexp.
 *
 * @param clazz the Analec element class whose structure is inspected
 * @param analecCorpus the corpus providing the structure
 * @param typeRegexp regular expression a type name must fully match to be tested
 * @param propRegexp regular expression a property name must fully match
 * @return the set of tested types without any matching property;
 *         an empty set when propRegexp is null or empty
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String typeRegexp, String propRegexp) {
	def missing = new HashSet()
	if (!propRegexp) return missing

	Structure structure = analecCorpus.getStructure()
	for (def type : structure.getTypes(clazz)) {
		// only the types selected by typeRegexp are tested
		if (!type.matches(typeRegexp)) continue

		boolean declared = structure.getNomsProps(clazz, type).any { it.matches(propRegexp) }
		if (!declared) missing << type
	}
	return missing
}
40

    
41
/**
 * Declares newProperty on every element type selected by the type part of a URSQL expression,
 * unless the type already declares it.
 *
 * @param clazz the Analec element class whose structure is updated
 * @param analecCorpus the corpus providing the structure
 * @param ursql the selector; only its type part is used, as a regular expression
 * @param newProperty the name of the property to declare
 */
static def defineProperty(Class clazz, Corpus analecCorpus, String ursql, String newProperty) {
	def typeRegexp = getFilterParameters(ursql)[0]
	Structure structure = analecCorpus.getStructure()
	for (def type : structure.getTypes(clazz)) {
		boolean selected = type.matches(typeRegexp)
		if (selected && !structure.getNomsProps(clazz, type).contains(newProperty)) {
			structure.ajouterProp(clazz, type, newProperty)
		}
	}
}
53

    
54
/**
 * Selects the schemas of a corpus with an optional URSQL selector and size bounds.
 *
 * @param debug verbosity level; extra traces are printed when >= 2
 * @param analecCorpus the corpus to read the schemas from
 * @param schema_ursql the schema selector; all schemas are used when null or empty
 * @param minimum_schema_size minimum number of underlying units of a schema
 * @param maximum_schema_size maximum number of underlying units; a value <= 0 means no upper bound
 * @return the selected schemas
 */
static def selectSchemas(def debug, Corpus analecCorpus, String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {
	// once normalised the maximum is always >= 1, so the size filter always applies
	// (the former "no criteria" branch could never be reached)
	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE

	def allSchemas
	if (schema_ursql != null && schema_ursql.length() > 0) {
		allSchemas = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
	} else {
		allSchemas = analecCorpus.getTousSchemas()
	}
	if (debug >= 2) println "allSchemas=${allSchemas.size()}"

	return AnalecUtils.filterBySize(allSchemas, minimum_schema_size, maximum_schema_size)
}
69

    
70
/**
 * Selects the schemas that have at least one underlying unit strictly included
 * in the matches of a CQP corpus.
 *
 * @param debug verbosity level
 * @param analecCorpus the corpus to read the schemas from
 * @param corpus the CQP corpus whose matches delimit the selection
 * @param schema_ursql the schema selector
 * @param minimum_schema_size minimum number of underlying units of a schema
 * @param maximum_schema_size maximum number of underlying units of a schema
 * @return the list of selected schemas
 */
static def selectSchemasInCorpus(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {
	def candidates = AnalecUtils.selectSchemas(debug, analecCorpus, schema_ursql, minimum_schema_size, maximum_schema_size)

	def kept = []
	for (Schema schema : candidates) {
		def unitsInCorpus = AnalecUtils.filterUniteByInclusion(debug, schema.getUnitesSousjacentes(), corpus.getMatches(), true, 0)
		if (!unitsInCorpus.isEmpty()) kept << schema
	}
	return kept
}
85

    
86
/**
 * Selects units from a selection of schemas. If no schema criteria are given,
 * all the units of the corpus are selected, then the unit criteria are applied.
 *
 * @param debug verbosity level; extra traces are printed when >= 2
 * @param analecCorpus the Analec corpus to read schemas and units from
 * @param corpus the CQP corpus whose matches delimit the selection
 * @param schema_ursql the schema selector; may be null or empty
 * @param minimum_schema_size schemas are filtered by size only when this value is > 1
 * @param maximum_schema_size upper bound passed to the size filter
 * @param unit_ursql the unit selector
 * @param position_in_schema rank of the unit to keep in each schema; applied only when >= 0
 * @param cql_limit optional CQL query restricting the matches of corpus
 * @param strict_inclusion if true a unit must be entirely inside a match to be kept
 * @param position rank of the unit to keep in each match, 0 to keep them all
 * @return the sorted list of selected units
 */
static def selectUnitsInSchema(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size,
		String unit_ursql, Integer position_in_schema, CQLQuery cql_limit, Boolean strict_inclusion, int position) {
	boolean hasSchemaQuery = schema_ursql != null && schema_ursql.length() > 0

	// units grouped by their schema, or under the single "all" key when no schema criteria is given
	def groupedUnits
	if (hasSchemaQuery || minimum_schema_size > 1) {
		def allSchema
		if (hasSchemaQuery) allSchema = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
		else allSchema = analecCorpus.getTousSchemas()
		if (debug >= 2) println "allSchema=${allSchema.size()}"

		// NOTE(review): the size filter (and so maximum_schema_size) is skipped when minimum_schema_size <= 1 -- confirm this is intended
		if (minimum_schema_size > 1) allSchema = AnalecUtils.filterBySize(allSchema, minimum_schema_size, maximum_schema_size)
		if (debug >= 2) println "allSchema=${allSchema.size()}"

		groupedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchema, unit_ursql)

		if (position_in_schema >= 0) groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, position_in_schema)
	} else {
		groupedUnits = ["all":AnalecUtils.findAllInCorpus(debug, analecCorpus, Unite.class, unit_ursql)]
	}
	if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"

	def matches
	if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) {
		Subcorpus limitssubcorpus = corpus.createSubcorpus(cql_limit, corpus.getID().toUpperCase())
		try {
			matches = limitssubcorpus.getMatches()
		} finally {
			// the temporary subcorpus must be removed even if its matches cannot be read
			limitssubcorpus.delete()
		}
	} else {
		matches = corpus.getMatches()
	}
	if (debug >= 2) println "matches=${matches}"

	def allUnits = []
	for (def units : groupedUnits.values()) {
		allUnits.addAll(AnalecUtils.filterUniteByInclusion(debug, units, matches, strict_inclusion, position))
	}
	if (debug >= 2) println "selectedUnits=${allUnits.size()}"

	Collections.sort(allUnits)

	return allUnits
}
144
/**
 * Filters the elements of each group by their rank in the group.
 *
 * @param debug verbosity level (not used here)
 * @param groups map of group key to the ordered elements of the group
 * @param distance 0 (or null) = no selection, 1 = first, 2 = second, -1 = last, -2 = second to last
 * @return groups itself when no selection is requested; otherwise a new map with the same keys,
 *         each holding a list with the selected element, or an empty list when the group is too small
 */
static def filterUniteByInclusionInSchema(def debug, def groups, Integer distance) {
	if (distance == null || distance == 0) return groups

	// positive ranks are 1-based and become 0-based indexes;
	// negative ranks are already valid "from the end" indexes and must not be shifted
	int index = distance > 0 ? distance - 1 : distance

	def newGroups = [:]
	groups.each { k, group ->
		int size = group.size()
		boolean inRange = index >= 0 ? index < size : -index <= size
		newGroups[k] = inRange ? [group[index]] : []
	}
	return newGroups
}
164

    
165
/**
 * Extracts the start and end positions of a selection of units.
 *
 * @param selectedUnits the units to read; each must provide getDeb() and getFin()
 * @return a 3-element list: the int[] of starts, the int[] of ends, and null in the third slot
 */
static def getStartsEndsTargetsArrays(def selectedUnits) {
	int count = selectedUnits.size()
	int[] starts = new int[count]
	int[] ends = new int[count]
	selectedUnits.eachWithIndex { unite, i ->
		starts[i] = unite.getDeb()
		ends[i] = unite.getFin()
	}
	return [starts, ends, null]
}
176

    
177
/**
 * Builds a short textual description of an Analec element.
 *
 * @param e a Unite, Relation or Schema
 * @return "start-end, props" for a unit, "elt1=elt2 -> props" for a relation,
 *         "size=props" for a schema, null for any other class
 */
static String toString(Element e) {
	if (e.getClass() == Unite.class)
		return sprintf("%d-%d, %s", e.getDeb(), e.getFin(), e.getProps().sort())
	else if (e.getClass() == Relation.class)
		return sprintf("%s=%s -> %s", toString(e.getElt1()), toString(e.getElt2()), e.getProps().sort())
	else if (e.getClass() == Schema.class)
		// the size is the integer and the properties are a map: "%s=%d" threw IllegalFormatConversionException
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())

	return null
}
187

    
188
/**
 * Lists all the positions covered by a unit.
 *
 * @param u the unit
 * @return the positions from the smallest to the greatest bound of the unit, both included;
 *         the bounds are swapped when the unit is reversed (start > end)
 */
static int[] toIntArray(Unite u) {
	int lo = Math.min(u.getDeb(), u.getFin())
	int hi = Math.max(u.getDeb(), u.getFin())
	// direct coercion instead of toArray() with an int[] buffer that was one element too short
	return (lo..hi) as int[]
}
194

    
195
/**
 * Builds a textual description of an Analec element, including the words it covers.
 *
 * @param CQI the object used to resolve positions to strings through cpos2Str(propertyName, positions)
 * @param wordProperty the word property to read; its qualified name is passed to CQI
 * @param e a Unite, Relation or Schema
 * @return "words start-end, props" for a unit, a description of both ends and the properties
 *         for a relation, "size=props" for a schema, null for any other class
 */
static String toString(def CQI, def wordProperty, Element e) {
	if (e.getClass() == Unite.class) {
		def form = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e)), " ")
		return sprintf("%s %d-%d, %s", form, e.getDeb(), e.getFin(), e.getProps().sort())
	} else if (e.getClass() == Relation.class) {
		def form1 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt1())), " ")
		def form2 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt2())), " ")
		return sprintf("%s=%s -> %s", form1+" "+toString(e.getElt1()), form2+" "+toString(e.getElt2()), e.getProps().sort())
	} else if (e.getClass() == Schema.class) {
		// the size is the integer and the properties are a map: "%s=%d" threw IllegalFormatConversionException
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
	}

	return null
}
209

    
210
/**
 * Finds the elements of a corpus selected by a URSQL expression.
 *
 * @param debug verbosity level; the parsed selector is printed when >= 2
 * @param analecCorpus the corpus to search
 * @param elemClazz the class of the elements to search
 * @param URSQL the selector, split into type, property and value parts
 * @return the selected elements
 */
static def findAllInCorpus(def debug, def analecCorpus, Class elemClazz, String URSQL) {
	def filter = getFilterParameters(URSQL)
	if (debug >= 2) println "PARAMS=$filter"
	// filter holds exactly [type, property, value]
	return findAllInCorpus(debug, analecCorpus, elemClazz, *filter)
}
215

    
216
/**
 * Finds the elements of a corpus matching a type, a property name and a property value.
 *
 * @param debug verbosity level
 * @param analecCorpus the corpus to search
 * @param elemClazz Unite, Relation or Schema class; null to search the three kinds of elements
 * @param typeRegex regular expression on the element type; no type selection when null or empty
 * @param propName name of the property an element must have; no property selection when null or empty
 * @param valueRegex regular expression on the property value; any value when null or empty
 * @return the selected elements; an empty list when elemClazz is not a supported class
 */
static def findAllInCorpus(def debug, Corpus analecCorpus, Class elemClazz, String typeRegex, String propName, String valueRegex) {
	// starts empty so that an unsupported class yields no elements instead of a null collection
	def allElements = []

	if (elemClazz == null) {
		allElements.addAll(analecCorpus.getToutesUnites())
		allElements.addAll(analecCorpus.getToutesRelations())
		allElements.addAll(analecCorpus.getTousSchemas())
	} else if (elemClazz == Unite.class) {
		allElements = analecCorpus.getToutesUnites()
	} else if (elemClazz == Relation.class) {
		allElements = analecCorpus.getToutesRelations()
	} else if (elemClazz == Schema.class) {
		allElements = analecCorpus.getTousSchemas()
	}

	return filterElements(debug, allElements, typeRegex, propName, valueRegex)
}
236

    
237
/**
 * Keeps the elements whose number of underlying units is within the given bounds.
 *
 * @param elements the elements to filter
 * @param minSize minimum number of units; null or a negative value means 0
 * @param maximum_schema_size maximum number of units; null or a value <= 0 means no upper bound
 * @return a new list with the elements within the bounds
 */
static def filterBySize(def elements, Integer minSize, Integer maximum_schema_size) {
	int upper = (maximum_schema_size == null || maximum_schema_size <= 0) ? Integer.MAX_VALUE : maximum_schema_size
	int lower = (minSize == null || minSize < 0) ? 0 : minSize

	def kept = []
	for (Element e : elements) {
		Unite[] units = e.getUnitesSousjacentes()
		int count = units.length
		if (lower <= count && count <= upper) kept << e
	}
	return kept
}
251

    
252
/**
 * Groups units by CQP match.
 *
 * Units are sorted by start then end position so that units and matches
 * can be walked through together in a single pass.
 *
 * @param debug verbosity level; traces are printed when >= 2, more when >= 3
 * @param allUnites the units to group
 * @param matches the matches, read in their given order
 * @param strict_inclusion if true a unit is kept only when it is entirely inside the match,
 *        otherwise overlapping the match is enough
 * @return a map of match index to the list of units selected for this match
 */
static def groupByMatch(def debug, def allUnites, def matches, boolean strict_inclusion) {
	if (debug >= 2) println "group "+allUnites.size()+" units with "+matches.size()+" strict=$strict_inclusion"
	allUnites = allUnites.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }

	def unitsSize = allUnites.size()
	def iCurrentUnit = 0
	def selectedUnits = []

	def matchesSize = matches.size()
	def iCurrentMatch = 0

	def selectedUnitsPerMatch = new LinkedHashMap()
	selectedUnitsPerMatch[iCurrentMatch] = selectedUnits

	while (iCurrentMatch < matchesSize && iCurrentUnit < unitsSize) {
		if (debug >= 2) println "** M $iCurrentMatch < $matchesSize && U $iCurrentUnit < $unitsSize"

		Unite unit = allUnites[iCurrentUnit]
		Match match = matches[iCurrentMatch]
		if (debug >= 3) println ""+unit.getDeb()+"->"+unit.getFin()+"        "+match.getStart()+"->"+match.getEnd()
		if (unit.getFin() < match.getStart()) {
			// the unit ends before the match: skip the unit
			if (debug >= 3) println "next unit"

			iCurrentUnit++
		} else if (unit.getDeb() > match.getEnd()) {
			// the unit starts after the match: move to the next match and open its group
			if (debug >= 3) println "next match"

			iCurrentMatch++
			selectedUnits = []
			selectedUnitsPerMatch[iCurrentMatch] = selectedUnits
		} else {
			// the unit overlaps the match
			if (debug >= 3) println "iCurrentUnit=$iCurrentUnit        iCurrentMatch=$iCurrentMatch"
			if (strict_inclusion) {
				if (debug >= 3) println "m.start ${match.getStart()} <= u.deb ${unit.getDeb()} && u.fin ${unit.getFin()} <= m.end ${match.getEnd()}"
				if (match.getStart() <= unit.getDeb() && unit.getFin() <= match.getEnd()) {
					selectedUnits << unit
				}
			} else {
				selectedUnits << unit
			}

			iCurrentUnit++
		}
	}
	return selectedUnitsPerMatch
}
310

    
311
/**
 * Selects the units covered by a list of matches, optionally keeping one unit per match.
 *
 * @param debug verbosity level
 * @param allUnites the units to filter
 * @param matches the matches delimiting the selection
 * @param strict_inclusion if true a unit must be entirely inside a match to be kept
 * @param position 0 = keep all the units of each match, 1 = first, 2 = second,
 *        -1 = last, -2 = second to last
 * @return the list of selected units; a match with too few units for the requested position gives no unit
 */
static def filterUniteByInclusion(def debug, def allUnites, def matches, boolean strict_inclusion, int position) {
	def selectedUnitsPerMatch = groupByMatch(debug, allUnites, matches, strict_inclusion)

	def selectedUnits = []
	if (position == 0) {
		for (def units : selectedUnitsPerMatch.values()) selectedUnits.addAll(units)
		return selectedUnits
	}

	// positive positions are 1-based; negative positions already index from the end
	int index = position > 0 ? position - 1 : position
	for (def units : selectedUnitsPerMatch.values()) {
		int size = units.size()
		// a negative index needs at least -index units, otherwise the lookup would be out of range
		boolean inRange = index >= 0 ? index < size : -index <= size
		if (!inRange) continue

		units = units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
		selectedUnits << units[index]
		if (debug >=3) println "dist select: "+units[index].getDeb()
	}

	return selectedUnits
}
335

    
336
/**
 * Collects the underlying units of elements, selected by a URSQL expression.
 *
 * @param debug verbosity level
 * @param elements the elements whose units are read
 * @param URSQL the unit selector, split into type, property and value parts
 * @return the list of selected units
 */
static def findAllUnitesInElements(def debug, def elements, String URSQL) {
	// the parsed selector holds exactly [type, property, value]
	return findAllUnitesInElements(debug, elements, *getFilterParameters(URSQL))
}
340

    
341
/**
 * Collects in a single list the underlying units of elements that pass the type, property and value filter.
 *
 * @param debug verbosity level
 * @param elements the elements whose units are read
 * @param typeRegex regular expression on the unit type
 * @param propName name of the property a unit must have
 * @param valueRegex regular expression on the property value
 * @return the list of selected units, in the order of the elements
 */
static def findAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
	def selection = []
	for (Element element : elements) {
		def units = element.getUnitesSousjacentes()
		selection.addAll(filterElements(debug, units, typeRegex, propName, valueRegex))
	}
	return selection
}
350

    
351
/**
 * Groups the underlying units of each element, without any selection.
 *
 * @param debug verbosity level
 * @param elements the elements whose units are grouped
 * @return a map of element to its units
 */
static def groupAllUnitesInElements(def debug, def elements) {
	// empty type, property and value: nothing is filtered out
	String none = ""
	return groupAllUnitesInElements(debug, elements, none, none, none)
}
360

    
361
/**
 * Groups the underlying units of each element, selected by a URSQL expression.
 *
 * @param debug verbosity level
 * @param elements the elements whose units are grouped
 * @param URSQL the unit selector, split into type, property and value parts
 * @return a map of element to its selected units
 */
static def groupAllUnitesInElements(def debug, def elements, String URSQL) {
	// the parsed selector holds exactly [type, property, value]
	return groupAllUnitesInElements(debug, elements, *getFilterParameters(URSQL))
}
372

    
373
/**
 * Groups, for each element, its underlying units that pass the type, property and value filter.
 *
 * @param debug verbosity level
 * @param elements the elements whose units are grouped
 * @param typeRegex regular expression on the unit type
 * @param propName name of the property a unit must have
 * @param valueRegex regular expression on the property value
 * @return a map of element to its selected units
 */
static def groupAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
	def unitsByElement = [:]
	for (Element element : elements) {
		def units = element.getUnitesSousjacentes()
		unitsByElement.put(element, filterElements(debug, units, typeRegex, propName, valueRegex))
	}
	return unitsByElement
}
382

    
383
/**
 * Splits a URSQL expression into its type, property and value parts.
 *
 * Recognised forms: TYPE@PROP=VALUE, TYPE@PROP, TYPE=VALUE (not well formed but accepted) and TYPE.
 *
 * @param URSQL the expression to split; null is read as an empty expression
 * @return a 3-element list [type, property, value]; missing parts are empty strings
 */
static def getFilterParameters(String URSQL) {
	// several callers forward an optional selector: a null one selects nothing specific
	if (URSQL == null) return ["", "", ""]

	int atidx = URSQL.indexOf("@")
	int equalidx = URSQL.indexOf("=")

	if (atidx >= 0 && equalidx >= 0 && atidx < equalidx) { // TYPE@PROP=VALUE
		return [URSQL.substring(0, atidx), URSQL.substring(atidx+1, equalidx), URSQL.substring(equalidx+1)]
	}
	if (atidx >= 0) { // TYPE@PROP
		return [URSQL.substring(0, atidx), URSQL.substring(atidx+1), ""]
	}
	if (equalidx >= 0) { // TYPE=VALUE -> not well formed
		return [URSQL.substring(0, equalidx), "", URSQL.substring(equalidx+1)]
	}
	return [URSQL, "", ""] // TYPE
}
409

    
410
/**
 * Filters elements with a URSQL expression.
 *
 * @param debug verbosity level
 * @param allElements the elements to filter
 * @param URSQL the selector, split into type, property and value parts
 * @return the selected elements
 */
static def filterElements(def debug, def allElements, String URSQL) {
	// the parsed selector holds exactly [type, property, value]
	return filterElements(debug, allElements, *getFilterParameters(URSQL))
}
414

    
415
/**
 * Filters elements by type, then by property name and property value.
 *
 * @param debug verbosity level; the size of each step result is printed when >= 2
 * @param allElements the elements to filter
 * @param typeRegex regular expression the element type must fully match; step skipped when null or empty
 * @param propName name of the property an element must have; step skipped when null or empty
 * @param valueRegex regular expression the property value must fully match;
 *        when null or empty, having the property is enough
 * @return allElements itself when no step applies, otherwise a new list with the selected elements
 */
static def filterElements(def debug, def allElements, String typeRegex, String propName, String valueRegex) {
	if (debug >= 2) println "filtering "+allElements.size()+" elements with typeRegex='$typeRegex' propName='$propName' and valueRegex='$valueRegex'"

	// step 1: type selection
	if (typeRegex) {
		def typePattern = /$typeRegex/
		def sameType = []
		for (Element element : allElements) {
			if (element.getType() ==~ typePattern) sameType << element
		}
		allElements = sameType
	}
	if (debug >= 2) println " type step result: "+allElements.size()

	// step 2: property (and value) selection
	if (propName) {
		boolean checkValue = valueRegex != null && valueRegex.length() > 0
		def withProp = []
		if (checkValue) {
			def valuePattern = /$valueRegex/
			for (Element element : allElements) {
				def value = element.getProp(propName)
				if (value == null) continue
				if (value ==~ valuePattern) withProp << element
			}
		} else {
			for (Element element : allElements) {
				if (element.getProps().containsKey(propName)) withProp << element
			}
		}
		allElements = withProp
	}
	if (debug >= 2) println " prop&value step result: "+allElements.size()

	return allElements
}
453

    
454
/**
 * Builds the CQL query of a selection of units, matching the full span of each unit.
 *
 * @param name label used in the messages
 * @param unites the units to translate
 * @return the CQL query string
 */
static def getCQL(String name, def unites) {
	boolean onePosition = false
	boolean limitNumberOfUnit = true
	return getCQL(name, unites, onePosition, limitNumberOfUnit)
}
457

    
458
/**
 * Builds a CQL query matching the positions of a selection of units.
 *
 * The query has the form "LEFT::RIGHT": LEFT declares one labelled token pattern
 * per distinct unit size and RIGHT lists one "label=start" constraint per unit.
 * Units are sorted in place by start position. The query is truncated, with a
 * printed warning, when it would reach the maximum query length.
 *
 * @param name label used in the truncation message
 * @param unites the units to translate
 * @param onePosition if true only 1 token per unit is matched
 * @param limitNumberOfUnit not used by the current implementation
 * @return the CQL query string
 */
static def getCQL(String name, def unites, boolean onePosition, boolean limitNumberOfUnit) {
	def letters = "abcdefghijklmnopqrstu"//vwxyz0123456789"
	def MAXCQLQUERYSIZE = 1200 // 1150 // 1200 in fact

	int n = 0

	String totalleftquery = ""
	String totalrightquery = ""
	unites.sort() { it.getDeb() }
	def declaredsizes = []
	for (Unite unite : unites) {
		int size = unite.getFin() - unite.getDeb() + 1
		// an empty unit has no label (letters.charAt(-1) would throw) unless it is forced to 1 position below
		if (size < 0 || (size == 0 && !onePosition)) {
			println sprintf("** Warning: incoherent unit %s [%d, %d], size = "+size, unite.getProps(),unite.getDeb(), unite.getFin())
			continue
		}
		if (onePosition) size = 1 // hack only the 1st position is needed for the Progression
		if (size > letters.length()) size = letters.length()-1
		String letter = ""+letters.charAt(size-1)
		String rightquery = letter+"="+unite.getDeb()

		// the pattern of a size is declared only once, the first time the size is met
		String leftquery = ""
		if (!declaredsizes.contains(size)) {
			declaredsizes << size

			if (size == 1)
				leftquery = letter+":[]"
			else if (size == 2)
				leftquery = letter+":[][]"
			else if (size == 3)
				leftquery = letter+":[][][]"
			else
				leftquery = letter+":[][]{"+(size-1)+"}" // 1 token + (size-1) tokens
		}

		if ((totalleftquery.length() + totalrightquery.length() + 2
		+ leftquery.length() + rightquery.length()) >= MAXCQLQUERYSIZE) {
			System.out.println("** $name : trop d'éléments pour la requête. Seuls les "+n+" premiers éléments sur ${unites.size()} seront affichés dans le graphique de progression.")
			break
		}

		if (n > 0) {
			if (leftquery.length() > 0) totalleftquery += "|"
			totalrightquery += "|"
		}
		if (leftquery.length() > 0) totalleftquery += leftquery
		totalrightquery += rightquery

		n += 1
	}

	return totalleftquery+"::"+totalrightquery
}