Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / AnalecUtils.groovy @ 1962

History | View | Annotate | Download (17.3 kB)

1
package org.txm.macro.urs
2

    
3
import org.txm.searchengine.cqp.corpus.Property
4
import org.txm.searchengine.cqp.corpus.Subcorpus
5
import org.txm.searchengine.cqp.corpus.query.Match
6
import org.txm.searchengine.cqp.corpus.query.CQLQuery
7
import visuAnalec.donnees.*
8
import visuAnalec.elements.*
9

    
10
import org.apache.commons.lang.StringUtils
11

    
12

    
13
/**
 * Checks, for the types selected by a URSQL expression, whether a property is declared.
 *
 * The expression is split with getFilterParameters(); its first part is used as the type
 * regular expression and its second part as the property regular expression.
 *
 * @param clazz the element class whose types are inspected
 * @param analecCorpus the corpus providing the structure
 * @param ursql the URSQL expression
 * @return the result of the (clazz, analecCorpus, typeRegexp, propRegexp) overload
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String ursql) {
        def (typeRegexp, propRegexp) = getFilterParameters(ursql)
        return isPropertyDefined(clazz, analecCorpus, typeRegexp, propRegexp)
}
19

    
20
/**
 * Lists the types of the structure that have no property matching a regular expression.
 *
 * Only the types matching typeRegexp are tested.
 *
 * @param clazz the element class whose types are inspected
 * @param analecCorpus the corpus providing the structure
 * @param typeRegexp regular expression selecting the types to test
 * @param propRegexp regular expression a property name of the type must match
 * @return a HashSet of the tested types for which no property name matches propRegexp
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String typeRegexp, String propRegexp) {
        Structure structure = analecCorpus.getStructure()
        def missing = new HashSet()
        for (def type : structure.getTypes(clazz)) {
                if (type.matches(typeRegexp)) {
                        boolean declared = false
                        for (def name : structure.getNomsProps(clazz, type)) {
                                if (name.matches(propRegexp)) {
                                        declared = true
                                        break // one matching property is enough
                                }
                        }
                        if (!declared) missing << type
                }
        }
        return missing
}
38

    
39
/**
 * Adds a property to every type selected by a URSQL expression, unless the type already has it.
 *
 * Only the type part of the URSQL expression is used.
 *
 * @param clazz the element class whose types are updated
 * @param analecCorpus the corpus providing the structure
 * @param ursql the URSQL expression; its type part is a regular expression on type names
 * @param newProperty the name of the property to add
 */
static def defineProperty(Class clazz, Corpus analecCorpus, String ursql, String newProperty) {
        def typeRegexp = getFilterParameters(ursql)[0]
        Structure structure = analecCorpus.getStructure()
        for (def type : structure.getTypes(clazz)) {
                if (!type.matches(typeRegexp)) continue // not a selected type
                if (structure.getNomsProps(clazz, type).contains(newProperty)) continue // already declared
                structure.ajouterProp(clazz, type, newProperty)
        }
}
51

    
52
/**
 * Selects the schemas of a corpus with an optional URSQL filter and optional size bounds.
 *
 * The size bounds are applied as soon as one of them is restrictive (minimum greater than 1,
 * or a positive maximum), so a maximum given without a minimum is honoured.
 *
 * @param debug debug level; a message is printed when it is at least 2
 * @param analecCorpus the corpus to read the schemas from
 * @param schema_ursql URSQL filter on schemas; null or empty selects all schemas
 * @param minimum_schema_size minimum number of underlying units of a schema
 * @param maximum_schema_size maximum number of underlying units; null, zero or negative means no upper bound
 * @return the selected schemas
 */
static def selectSchemas(def debug, Corpus analecCorpus, String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {
        if (maximum_schema_size == null || maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE

        boolean hasQuery = schema_ursql != null && schema_ursql.length() > 0
        def allSchemas = hasQuery ?
                        AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql) :
                        analecCorpus.getTousSchemas()
        if (debug >= 2) println "allSchemas=${allSchemas.size()}"

        boolean hasMinimum = minimum_schema_size != null && minimum_schema_size > 1
        boolean hasMaximum = maximum_schema_size < Integer.MAX_VALUE
        if (hasMinimum || hasMaximum) {
                allSchemas = AnalecUtils.filterBySize(allSchemas, minimum_schema_size, maximum_schema_size)
        }
        return allSchemas
}
67

    
68
/**
 * Selects the schemas that have at least one underlying unit strictly included in the matches of a CQP corpus.
 *
 * Schemas are first selected with selectSchemas(), then tested one by one with
 * filterUniteByInclusion() (strict inclusion, no distance limit).
 *
 * @param debug debug level forwarded to the helpers
 * @param analecCorpus the corpus to read the schemas from
 * @param corpus the CQP corpus whose matches delimit the selection
 * @param schema_ursql URSQL filter on schemas
 * @param minimum_schema_size minimum schema size forwarded to selectSchemas()
 * @param maximum_schema_size maximum schema size forwarded to selectSchemas()
 * @return the list of schemas having at least one unit inside the corpus matches
 */
static def selectSchemasInCorpus(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
                String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {
        def candidates = AnalecUtils.selectSchemas(debug, analecCorpus, schema_ursql, minimum_schema_size, maximum_schema_size)

        def keptSchemas = []
        for (Schema schema : candidates) {
                def unitsInCorpus = AnalecUtils.filterUniteByInclusion(debug, schema.getUnitesSousjacentes(), corpus.getMatches(), true, 0)
                if (unitsInCorpus.size() <= 0) continue // no unit of this schema lies in the corpus
                keptSchemas << schema
        }
        return keptSchemas
}
83

    
84
/**
 * Selects units from a selection of schemas. If no schema criteria are given
 * (no schema URSQL and a minimum schema size of 1 or less), all the units of the corpus
 * are selected, then the unit criteria are applied.
 *
 * @param debug debug level; messages are printed when it is at least 2
 * @param analecCorpus the corpus to read schemas and units from
 * @param corpus the CQP corpus delimiting the selection
 * @param schema_ursql URSQL filter on schemas; null or empty means no schema filter
 * @param minimum_schema_size minimum number of units of a schema
 * @param maximum_schema_size maximum number of units of a schema; null, zero or negative means no upper bound
 * @param unit_ursql URSQL filter on units
 * @param limit_distance_in_schema position of the unit to keep in each schema; applied only when it is 0 or more, see filterUniteByInclusionInSchema()
 * @param limit_cql optional CQL query; when set and not the empty query "", its matches replace the corpus matches
 * @param strict_inclusion if true a unit must be entirely inside a match to be kept
 * @param limit_distance position of the unit to keep in each match (0 = keep all), see filterUniteByInclusion()
 * @return the selected units, sorted with Collections.sort()
 */
static def selectUnitsInSchema(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
                String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size,
                String unit_ursql, Integer limit_distance_in_schema, CQLQuery limit_cql, Boolean strict_inclusion, int limit_distance) {
        // same convention as selectSchemas(): a missing, zero or negative maximum means "no upper bound"
        if (maximum_schema_size == null || maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE

        def groupedUnits = [:]
        boolean hasSchemaQuery = schema_ursql != null && schema_ursql.length() > 0
        if (hasSchemaQuery || minimum_schema_size > 1) {
                def allSchema = hasSchemaQuery ?
                                AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql) :
                                analecCorpus.getTousSchemas()
                if (debug >= 2) println "allSchema=${allSchema.size()}"

                // apply the size bounds as soon as one of them is restrictive
                if (minimum_schema_size > 1 || maximum_schema_size < Integer.MAX_VALUE) {
                        allSchema = AnalecUtils.filterBySize(allSchema, minimum_schema_size, maximum_schema_size)
                }
                if (debug >= 2) println "allSchema=${allSchema.size()}"

                groupedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchema, unit_ursql)

                if (limit_distance_in_schema >= 0) groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, limit_distance_in_schema)
        } else {
                // no schema criteria: a single group holding the units selected in the whole corpus
                groupedUnits = ["all":AnalecUtils.findAllInCorpus(debug, analecCorpus, Unite.class, unit_ursql)]
        }
        if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"

        def matches = null
        if (limit_cql != null && !limit_cql.getQueryString().equals("\"\"")) {
                Subcorpus limitssubcorpus = corpus.createSubcorpus(limit_cql, corpus.getID().toUpperCase())
                try {
                        matches = limitssubcorpus.getMatches()
                } finally {
                        // the subcorpus is only needed for its matches: delete it even if getMatches() fails
                        limitssubcorpus.delete()
                }
        } else {
                matches = corpus.getMatches()
        }
        if (debug >= 2) println "matches=${matches}"

        def allUnits = []
        for (def k : groupedUnits.keySet()) {
                def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, groupedUnits[k], matches, strict_inclusion, limit_distance)
                allUnits.addAll(selectedUnits)
        }
        if (debug >= 2) println "selectedUnits=${allUnits.size()}"

        Collections.sort(allUnits)

        return allUnits
}
142
/**
 * Keeps, in each group, only the element found at a given position.
 *
 * A group whose size is too small for the requested position yields an empty list.
 *
 * @param debug debug level (not used here)
 * @param groups map of group key to the ordered elements of the group
 * @param distance 0 (or null) = no selection, 1 = first, 2 = second, -1 = last, -2 = second to last
 * @return groups itself when no selection is requested, otherwise a new map with the same keys
 *         whose values are lists of zero or one element
 */
static def filterUniteByInclusionInSchema(def debug, def groups, Integer distance) {
        if (distance == null || distance == 0) return groups

        def newGroups = [:]
        for (def k : groups.keySet()) {
                def group = groups[k]
                int size = group.size()
                // positive distances count from the start (1-based), negative ones from the end
                int index = distance > 0 ? distance - 1 : size + distance
                def selection = []
                if (index >= 0 && index < size) selection << group[index]
                newGroups[k] = selection
        }
        return newGroups
}
162

    
163
/**
 * Builds the start and end position arrays of a selection of units.
 *
 * @param selectedUnits the units; each one must provide getDeb() and getFin()
 * @return a list of three items: the int[] of starts, the int[] of ends, and null
 */
static def getStartsEndsTargetsArrays(def selectedUnits) {
        int[] starts = new int[selectedUnits.size()]
        int[] ends = new int[selectedUnits.size()]
        selectedUnits.eachWithIndex { unite, idx ->
                starts[idx] = unite.getDeb()
                ends[idx] = unite.getFin()
        }
        return [starts, ends, null]
}
174

    
175
/**
 * Formats an element for display.
 *
 * <ul>
 * <li>Unite: "start-end, properties"</li>
 * <li>Relation: "element1=element2 -> properties"</li>
 * <li>Schema: "content size=properties"</li>
 * </ul>
 * Properties are sorted before being printed.
 *
 * @param e the element to format
 * @return the formatted element, or null if e is not a Unite, a Relation or a Schema
 */
static String toString(Element e) {
        if (e instanceof Unite)
                return sprintf("%d-%d, %s", e.getDeb(), e.getFin(), e.getProps().sort())
        else if (e instanceof Relation)
                return sprintf("%s=%s -> %s", toString(e.getElt1()), toString(e.getElt2()), e.getProps().sort())
        else if (e instanceof Schema)
                // the size is the number and the properties are a map: "%s=%d" failed on the map argument
                return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
        return null
}
185

    
186
/**
 * Lists the positions covered by a unit.
 *
 * @param u the unit
 * @return every position from the smallest to the largest of getDeb() and getFin(), both included,
 *         in ascending order (so a unit whose start is after its end is still handled)
 */
static int[] toIntArray(Unite u) {
        int first = Math.min(u.getDeb(), u.getFin())
        int last = Math.max(u.getDeb(), u.getFin())
        int[] positions = new int[last - first + 1]
        for (int i = 0; i < positions.length; i++) {
                positions[i] = first + i
        }
        return positions
}
192

    
193
/**
 * Formats an element for display, prefixed with the word property values of its positions.
 *
 * <ul>
 * <li>Unite: "words start-end, properties"</li>
 * <li>Relation: "words1 element1=words2 element2 -> properties"</li>
 * <li>Schema: "content size=properties"</li>
 * </ul>
 *
 * @param CQI the object used to read property values; it must provide cpos2Str(qualifiedName, positions)
 * @param wordProperty the property to read; it must provide getQualifiedName()
 * @param e the element to format
 * @return the formatted element, or null if e is not a Unite, a Relation or a Schema
 */
static String toString(def CQI, def wordProperty, Element e) {
        if (e instanceof Unite) {
                def form = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e)), " ")
                return sprintf("%s %d-%d, %s", form, e.getDeb(), e.getFin(), e.getProps().sort())
        } else if (e instanceof Relation) {
                def form1 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt1())), " ")
                def form2 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt2())), " ")
                return sprintf("%s=%s -> %s", form1+" "+toString(e.getElt1()), form2+" "+toString(e.getElt2()), e.getProps().sort())
        } else if (e instanceof Schema) {
                // the size is the number and the properties are a map: "%s=%d" failed on the map argument
                return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
        }
        return null
}
207

    
208
/**
 * Finds the elements of a corpus selected by a URSQL expression.
 *
 * @param debug debug level; the parsed parameters are printed when it is at least 2
 * @param analecCorpus the corpus to search
 * @param elemClazz the class of the elements to search
 * @param URSQL the URSQL expression, split with getFilterParameters()
 * @return the result of the overload taking the type, property and value parameters
 */
static def findAllInCorpus(def debug, def analecCorpus, Class elemClazz, String URSQL) {
        def params = getFilterParameters(URSQL)
        if (debug >= 2) println "PARAMS=$params"
        def typeRegex = params[0]
        def propName = params[1]
        def valueRegex = params[2]
        return findAllInCorpus(debug, analecCorpus, elemClazz, typeRegex, propName, valueRegex)
}
213

    
214
/**
 * Finds the elements of a corpus matching a type, a property name and a property value.
 *
 * @param debug debug level forwarded to filterElements()
 * @param analecCorpus the corpus to search
 * @param elemClazz Unite.class, Relation.class or Schema.class; null searches the units,
 *        relations and schemas together; any other class searches nothing
 * @param typeRegex regular expression on the element type
 * @param propName name of the property to test
 * @param valueRegex regular expression on the property value
 * @return the elements kept by filterElements()
 */
static def findAllInCorpus(def debug, Corpus analecCorpus, Class elemClazz, String typeRegex, String propName, String valueRegex) {
        // start from an empty list so that an unsupported class yields no element instead of null
        def allElements = []

        if (elemClazz == null) {
                allElements.addAll(analecCorpus.getToutesUnites())
                allElements.addAll(analecCorpus.getToutesRelations())
                allElements.addAll(analecCorpus.getTousSchemas())
        } else if (elemClazz == Unite.class) {
                allElements = analecCorpus.getToutesUnites()
        } else if (elemClazz == Relation.class) {
                allElements = analecCorpus.getToutesRelations()
        } else if (elemClazz == Schema.class) {
                allElements = analecCorpus.getTousSchemas()
        }

        return filterElements(debug, allElements, typeRegex, propName, valueRegex)
}
234

    
235
/**
 * Keeps the elements whose number of underlying units lies between two bounds (both included).
 *
 * @param elements the elements to filter
 * @param minSize minimum number of units; null or negative means 0
 * @param maximum_schema_size maximum number of units; null or negative means Integer.MAX_VALUE
 * @return a new list of the elements within the bounds, in their original order
 */
static def filterBySize(def elements, Integer minSize, Integer maximum_schema_size) {
        int upperBound = (maximum_schema_size == null || maximum_schema_size < 0) ? Integer.MAX_VALUE : maximum_schema_size
        int lowerBound = (minSize == null || minSize < 0) ? 0 : minSize

        def kept = []
        for (Element e : elements) {
                Unite[] underlying = e.getUnitesSousjacentes()
                int count = underlying.length
                if (lowerBound <= count && count <= upperBound) {
                        kept << e
                }
        }
        return kept
}
249

    
250
/**
 * Groups units by CQP match.
 *
 * Units are sorted by start then end position so that units and matches can be walked in
 * parallel. A unit is attached to at most one match: the first one it overlaps.
 * NOTE(review): Groovy's sort(Closure) presumably reorders the caller's collection in place - confirm.
 *
 * @param debug debug level; messages are printed from level 2, more details from level 3
 * @param allUnites the units to group
 * @param matches the matches, walked in order and assumed to be sorted by position
 * @param strict_inclusion if true a unit is kept only when it lies entirely inside the match;
 *        otherwise overlapping the match is enough
 * @return a LinkedHashMap of match index to the list of units attached to that match;
 *         index 0 is always present
 */
static def groupByMatch(def debug, def allUnites, def matches, boolean strict_inclusion) {
        if (debug >= 2) println "group "+allUnites.size()+" units with "+matches.size()+" strict=$strict_inclusion"
        allUnites = allUnites.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
        def unitsSize = allUnites.size()
        def iCurrentUnit = 0
        def selectedUnits = []

        def matchesSize = matches.size()
        def iCurrentMatch = 0

        def selectedUnitsPerMatch = new LinkedHashMap()
        selectedUnitsPerMatch[iCurrentMatch] = selectedUnits

        while (iCurrentMatch < matchesSize && iCurrentUnit < unitsSize) {
                if (debug >= 2) println "** M $iCurrentMatch < $matchesSize && U $iCurrentUnit < $unitsSize"

                Unite unit = allUnites[iCurrentUnit]
                Match match = matches[iCurrentMatch]
                if (debug >= 3) println ""+unit.getDeb()+"->"+unit.getFin()+"        "+match.getStart()+"->"+match.getEnd()
                if (unit.getFin() < match.getStart()) {
                        // the unit ends before the match: skip the unit
                        if (debug >= 3) println "next unit"
                        iCurrentUnit++
                } else if (unit.getDeb() > match.getEnd()) {
                        // the unit starts after the match: move to the next match and open its group
                        if (debug >= 3) println "next match"
                        iCurrentMatch++
                        selectedUnits = []
                        selectedUnitsPerMatch[iCurrentMatch] = selectedUnits
                } else {
                        // the unit overlaps the match
                        if (debug >= 3) println "iCurrentUnit=$iCurrentUnit        iCurrentMatch=$iCurrentMatch"
                        if (strict_inclusion) {
                                if (debug >= 3) println "m.start ${match.getStart()} <= u.deb ${unit.getDeb()} && u.fin ${unit.getFin()} <= m.end ${match.getEnd()}"
                                if (match.getStart() <= unit.getDeb() && unit.getFin() <= match.getEnd()) {
                                        selectedUnits << unit
                                }
                        } else {
                                selectedUnits << unit
                        }

                        iCurrentUnit++
                }
        }
        return selectedUnitsPerMatch
}
306

    
307
/**
 * Selects the units included in CQP matches, optionally keeping one unit per match.
 *
 * Units are first grouped with groupByMatch(). A match that has too few units for the
 * requested position contributes nothing.
 *
 * @param debug debug level; the selected positions are printed from level 3
 * @param allUnites the units to filter
 * @param matches the matches delimiting the selection
 * @param strict_inclusion forwarded to groupByMatch()
 * @param limit_distance 0 = keep all the units of each match, 1 = first, 2 = second,
 *        -1 = last, -2 = second to last
 * @return the list of selected units
 */
static def filterUniteByInclusion(def debug, def allUnites, def matches, boolean strict_inclusion, int limit_distance) {
        def selectedUnitsPerMatch = groupByMatch(debug, allUnites, matches, strict_inclusion)
        def selectedUnits = []

        if (limit_distance == 0) {
                for (def m : selectedUnitsPerMatch.keySet()) selectedUnits.addAll(selectedUnitsPerMatch[m])
                return selectedUnits
        }

        for (def m : selectedUnitsPerMatch.keySet()) {
                def units = selectedUnitsPerMatch[m]
                int count = units.size()
                // positive distances count from the start (1-based), negative ones from the end
                int index = limit_distance > 0 ? limit_distance - 1 : count + limit_distance
                if (index < 0 || index >= count) continue // not enough units in this match

                units = units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
                selectedUnits << units[index]
                if (debug >=3) println "dist select: "+units[index].getDeb()
        }

        return selectedUnits
}
331

    
332
/**
 * Collects the underlying units of some elements that are selected by a URSQL expression.
 *
 * @param debug debug level forwarded to the helpers
 * @param elements the elements whose underlying units are read
 * @param URSQL the URSQL expression, split with getFilterParameters()
 * @return the result of the overload taking the type, property and value parameters
 */
static def findAllUnitesInElements(def debug, def elements, String URSQL) {
        def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
        return findAllUnitesInElements(debug, elements, typeRegex, propName, valueRegex)
}
336

    
337
/**
 * Collects, in one flat list, the underlying units of some elements kept by filterElements().
 *
 * @param debug debug level forwarded to filterElements()
 * @param elements the elements whose underlying units are read
 * @param typeRegex regular expression on the unit type
 * @param propName name of the property to test
 * @param valueRegex regular expression on the property value
 * @return the list of the kept units, element after element
 */
static def findAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
        def collected = []
        for (Element element : elements) {
                def keptUnits = filterElements(debug, element.getUnitesSousjacentes(), typeRegex, propName, valueRegex)
                collected.addAll(keptUnits)
        }
        return collected
}
346

    
347
/**
 * Groups the underlying units of each element, without any selection.
 *
 * Delegates to the five-argument overload with empty type, property and value parameters.
 *
 * @param debug debug level forwarded to the helpers
 * @param elements the elements whose underlying units are grouped
 * @return the map built by the five-argument overload
 */
static def groupAllUnitesInElements(def debug, def elements) {
        String noFilter = ""
        return groupAllUnitesInElements(debug, elements, noFilter, noFilter, noFilter)
}
356

    
357
/**
 * Groups the underlying units of each element, keeping the units selected by a URSQL expression.
 *
 * @param debug debug level forwarded to the helpers
 * @param elements the elements whose underlying units are grouped
 * @param URSQL the URSQL expression, split with getFilterParameters()
 * @return the map built by the five-argument overload
 */
static def groupAllUnitesInElements(def debug, def elements, String URSQL) {
        def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
        return groupAllUnitesInElements(debug, elements, typeRegex, propName, valueRegex)
}
368

    
369
/**
 * Groups the underlying units of each element, keeping the units accepted by filterElements().
 *
 * @param debug debug level forwarded to filterElements()
 * @param elements the elements whose underlying units are grouped
 * @param typeRegex regular expression on the unit type
 * @param propName name of the property to test
 * @param valueRegex regular expression on the property value
 * @return a map of element to its kept units, in the iteration order of elements
 */
static def groupAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
        def unitsPerElement = [:]
        for (Element element : elements) {
                def keptUnits = filterElements(debug, element.getUnitesSousjacentes(), typeRegex, propName, valueRegex)
                unitsPerElement.put(element, keptUnits)
        }
        return unitsPerElement
}
378

    
379
/**
 * Splits a URSQL expression of the form "type@property=value" into its three parts.
 *
 * <ul>
 * <li>"type@prop=value" gives [type, prop, value] (the "@" must come before the "=")</li>
 * <li>"type@prop" gives [type, prop, ""]; this also applies when an "=" appears before the "@"</li>
 * <li>"type=value" gives [type, "", value]</li>
 * <li>"type" gives [type, "", ""]</li>
 * </ul>
 *
 * @param URSQL the expression; null is read as the empty expression
 * @return a list of three Strings: type, property and value
 */
static def getFilterParameters(String URSQL) {
        if (URSQL == null) URSQL = "" // no expression: no filter at all

        String type = URSQL
        String prop = ""
        String value = ""

        int atidx = URSQL.indexOf("@")
        int equalidx = URSQL.indexOf("=")

        if (atidx >= 0 && equalidx > atidx) {
                type = URSQL.substring(0, atidx)
                prop = URSQL.substring(atidx+1, equalidx)
                value = URSQL.substring(equalidx+1)
        } else if (atidx >= 0) {
                type = URSQL.substring(0, atidx)
                prop = URSQL.substring(atidx+1)
        } else if (equalidx >= 0) {
                type = URSQL.substring(0, equalidx)
                value = URSQL.substring(equalidx+1)
        }

        return [type, prop, value]
}
405

    
406
/**
 * Filters elements with a URSQL expression.
 *
 * @param debug debug level forwarded to the five-argument overload
 * @param allElements the elements to filter
 * @param URSQL the URSQL expression, split with getFilterParameters()
 * @return the result of the overload taking the type, property and value parameters
 */
static def filterElements(def debug, def allElements, String URSQL) {
        def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
        return filterElements(debug, allElements, typeRegex, propName, valueRegex)
}
410

    
411
/**
 * Filters elements by type, then by property.
 *
 * <ul>
 * <li>type step (skipped if typeRegex is null or empty): keeps the elements whose type fully matches typeRegex</li>
 * <li>property step (skipped if propName is null or empty): if valueRegex is set, keeps the elements whose
 *     propName value is not null and fully matches valueRegex; otherwise keeps the elements having the propName key</li>
 * </ul>
 *
 * @param debug debug level; the size of each step result is printed when it is at least 2
 * @param allElements the elements to filter
 * @param typeRegex regular expression on the element type
 * @param propName name of the property to test
 * @param valueRegex regular expression on the property value
 * @return allElements itself when no step applies, otherwise a new list of the kept elements
 */
static def filterElements(def debug, def allElements, String typeRegex, String propName, String valueRegex) {
        if (debug >= 2) println "filtering "+allElements.size()+" elements with typeRegex='$typeRegex' propName='$propName' and valueRegex='$valueRegex'"

        boolean filterOnType = typeRegex != null && typeRegex.length() > 0
        if (filterOnType) {
                def typed = []
                for (Element element : allElements) {
                        if (element.getType() ==~ typeRegex) typed << element
                }
                allElements = typed
        }
        if (debug >= 2) println " type step result: "+allElements.size()

        boolean filterOnProp = propName != null && propName.length() > 0
        if (filterOnProp) {
                boolean filterOnValue = valueRegex != null && valueRegex.length() > 0
                def withProp = []
                for (Element element : allElements) {
                        boolean keep
                        if (filterOnValue) { // the property must be set and its value must match
                                def value = element.getProp(propName)
                                keep = value != null && value ==~ valueRegex
                        } else { // the property only has to be present
                                keep = element.getProps().containsKey(propName)
                        }
                        if (keep) withProp << element
                }
                allElements = withProp
        }
        if (debug >= 2) println " prop&value step result: "+allElements.size()
        return allElements
}
449

    
450
/**
 * Builds the CQL query of some units with the default options:
 * onePosition is false and limitNumberOfUnit is true.
 *
 * @param name a name used in the messages
 * @param unites the units
 * @return the query built by the four-argument overload
 */
static def getCQL(String name, def unites) {
        boolean onePosition = false
        boolean limitNumberOfUnit = true
        return getCQL(name, unites, onePosition, limitNumberOfUnit)
}
453

    
454
/**
 * Builds a CQL query selecting the positions of some units.
 *
 * The query has the form "left::right": the left part declares one labelled pattern per
 * unit size ("a:[]", "b:[][]", ...) and the right part constrains each label to the start
 * position of a unit ("a=12|b=40"). Sizes greater than the number of available letters are
 * clamped. Units are sorted by start position (the given collection is sorted by this call).
 * Units with a size of zero or less are reported and skipped. The query construction stops,
 * with a message, when the query would reach the maximum query size.
 *
 * @param name a name used in the messages
 * @param unites the units
 * @param onePosition if true only the first position of each unit is selected
 * @param limitNumberOfUnit not read by this method: the query size limit is always applied
 * @return the CQL query
 */
static def getCQL(String name, def unites, boolean onePosition, boolean limitNumberOfUnit) {
        def letters = "abcdefghijklmnopqrstu"//vwxyz0123456789"
        def MAXCQLQUERYSIZE = 1200 // 1150 // 1200 in fact

        int n = 0

        String totalleftquery = ""
        String totalrightquery = ""
        unites.sort() { it.getDeb() }
        def declaredsizes = []
        for (Unite unite : unites) {
                int size = unite.getFin() - unite.getDeb() + 1
                // a size of 0 must be rejected too: letters.charAt(size-1) would be out of range
                if (size <= 0) {
                        println sprintf("** Warning: incoherent unit %s [%d, %d], size = "+size, unite.getProps(),unite.getDeb(), unite.getFin())
                        continue
                }
                if (onePosition) size = 1 // hack only the 1st position is needed for the Progression
                if (size > letters.length()) size = letters.length()-1
                String letter = ""+letters.charAt(size-1)
                String rightquery = letter+"="+unite.getDeb()

                // the pattern of a size is declared only once, the first time the size is met
                String leftquery = ""
                if (!declaredsizes.contains(size)) {
                        declaredsizes << size

                        if (size == 1)
                                leftquery = letter+":[]"
                        else if (size == 2)
                                leftquery = letter+":[][]"
                        else if (size == 3)
                                leftquery = letter+":[][][]"
                        else
                                leftquery = letter+":[][]{"+(size-1)+"}"
                }

                if ((totalleftquery.length() + totalrightquery.length() + 2
                + leftquery.length() + rightquery.length()) >= MAXCQLQUERYSIZE) {
                        System.out.println("** $name : trop d'éléments pour la requête. Seuls les "+n+" premiers éléments sur ${unites.size()} seront affichés dans le graphique de progression.")
                        break
                }

                if (n > 0) {
                        if (leftquery.length() > 0) totalleftquery += "|"
                        totalrightquery += "|"
                }
                if (leftquery.length() > 0) totalleftquery += leftquery
                totalrightquery += rightquery

                n += 1
        }
        String query = totalleftquery+"::"+totalrightquery
        return query
}