Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / analec / AnalecUtils.groovy @ 1094

History | View | Annotate | Download (17.4 kB)

1
package org.txm.macro.analec
2

    
3
import org.txm.searchengine.cqp.corpus.Property
4
import org.txm.searchengine.cqp.corpus.Subcorpus
5
import org.txm.searchengine.cqp.corpus.query.Match
6
import org.txm.searchengine.cqp.corpus.query.CQLQuery
7
import visuAnalec.donnees.*
8
import visuAnalec.elements.*
9

    
10
import org.apache.commons.lang.StringUtils
11

    
12

    
13
/**
 * Checks a property declaration using a single URS query string.
 *
 * The query is split with getFilterParameters(); only its type part and its
 * property part are used, both being handed over as regular expressions to
 * the 4-argument isPropertyDefined().
 *
 * @param clazz the element class whose structure types are inspected
 * @param analecCorpus the Analec corpus providing the structure
 * @param ursql the URS query to split
 * @return whatever the 4-argument isPropertyDefined() returns
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String ursql) {
	def (typeRegexp, propRegexp) = getFilterParameters(ursql)
	return isPropertyDefined(clazz, analecCorpus, typeRegexp, propRegexp)
}
19

    
20
/**
 * Lists the types that lack a property.
 *
 * Only the structure types of clazz that fully match typeRegexp are tested.
 * A tested type is reported when none of its property names fully matches
 * propRegexp.
 *
 * @param clazz the element class whose structure types are inspected
 * @param analecCorpus the Analec corpus providing the structure
 * @param typeRegexp regular expression selecting the types to test
 * @param propRegexp regular expression a property name must match
 * @return a HashSet of the tested types without any matching property (empty when all is fine)
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String typeRegexp, String propRegexp) {
	Structure structure = analecCorpus.getStructure()
	def typesWithoutProperty = new HashSet()

	for (def candidateType : structure.getTypes(clazz)) {
		// types not selected by typeRegexp are never reported
		if (candidateType.matches(typeRegexp)) {
			def declaredNames = structure.getNomsProps(clazz, candidateType)
			boolean declared = declaredNames.any { it.matches(propRegexp) }
			if (!declared) typesWithoutProperty << candidateType
		}
	}

	return typesWithoutProperty
}
38

    
39
/**
 * Declares a property on every structure type selected by a URS query.
 *
 * Only the type part of the query (as split by getFilterParameters()) is
 * used. The property is added with Structure.ajouterProp() to each matching
 * type that does not already list it.
 *
 * @param clazz the element class whose structure types are updated
 * @param analecCorpus the Analec corpus providing the structure
 * @param ursql the URS query whose type part selects the types
 * @param newProperty the name of the property to declare
 */
static def defineProperty(Class clazz, Corpus analecCorpus, String ursql, String newProperty) {
	def typeRegexp = getFilterParameters(ursql)[0]
	Structure structure = analecCorpus.getStructure()

	for (def typeName : structure.getTypes(clazz)) {
		// only types matching typeRegexp are touched; already declared properties are left alone
		if (typeName.matches(typeRegexp) && !structure.getNomsProps(clazz, typeName).contains(newProperty)) {
			structure.ajouterProp(clazz, typeName, newProperty)
		}
	}
}
51

    
52
/**
 * Selects schemas of the Analec corpus.
 *
 * Without a schema query and with a minimum size of 1 or less, all schemas
 * are returned. Otherwise the schemas are selected by the query (when given)
 * and, only when the minimum size is greater than 1, filtered with
 * filterBySize(). A maximum size of 0 or less means "no maximum".
 *
 * @param debug verbosity level; messages are printed when it is 2 or more
 * @param analecCorpus the Analec corpus to read the schemas from
 * @param corpus not used by this method
 * @param schema_ursql URS query selecting the schemas; may be null or empty
 * @param minimum_schema_size minimum size; the size filter is applied only when greater than 1
 * @param maximum_schema_size maximum size used by the size filter
 * @return the selected schemas
 */
static def selectSchemas(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {
	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE;

	boolean hasQuery = schema_ursql != null && schema_ursql.length() > 0
	boolean hasSizeLimit = minimum_schema_size > 1

	// no criterion at all: every schema is selected
	if (!hasQuery && !hasSizeLimit) {
		return analecCorpus.getTousSchemas()
	}

	def selection = hasQuery ? AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql) : analecCorpus.getTousSchemas()
	if (debug >= 2) println "allSchemas=${selection.size()}"

	if (hasSizeLimit) {
		selection = AnalecUtils.filterBySize(selection, minimum_schema_size, maximum_schema_size);
	}
	return selection
}
68

    
69
/**
 * Selects schemas with selectSchemas() and keeps those located in the corpus.
 *
 * A schema is kept when filterUniteByInclusion() (strict inclusion, no
 * distance limit) retains at least one of its underlying units for the
 * matches of the corpus.
 *
 * @param debug verbosity level forwarded to the helpers
 * @param analecCorpus the Analec corpus to read the schemas from
 * @param corpus the CQP corpus whose matches delimit the selection
 * @param schema_ursql URS query selecting the schemas; may be null or empty
 * @param minimum_schema_size minimum schema size, see selectSchemas()
 * @param maximum_schema_size maximum schema size, see selectSchemas()
 * @return the list of kept schemas
 */
static def selectSchemasInCorpus(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size) {

	def candidates = AnalecUtils.selectSchemas(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size);

	def keptSchemas = []
	for (Schema candidate : candidates) {
		def unitsInCorpus = AnalecUtils.filterUniteByInclusion(debug, candidate.getUnitesSousjacentes(), corpus.getMatches(), true, 0)
		if (!unitsInCorpus.isEmpty()) keptSchemas << candidate
	}

	return keptSchemas
}
84

    
85
/**
 * Selects units from a selection of schemas. If no schema criteria are given,
 * all units are selected, then the unit criteria are applied.
 *
 * Steps: (1) select the schemas (query and/or size), (2) group the units of
 * each schema that satisfy unit_ursql, (3) optionally keep one unit per schema
 * by rank, (4) keep the units located in the matches of the corpus, or of the
 * limit query when one is given, (5) sort the result.
 *
 * @param debug verbosity level; messages are printed when it is 2 or more
 * @param analecCorpus the Analec corpus to read schemas and units from
 * @param corpus the CQP corpus delimiting the selection
 * @param schema_ursql URS query selecting the schemas; may be null or empty
 * @param minimum_schema_size minimum schema size; the size filter is applied only when greater than 1
 * @param maximum_schema_size maximum schema size; null, 0 or a negative value means no maximum
 *        (same convention as selectSchemas())
 * @param unit_ursql URS query selecting the units
 * @param limit_distance_in_schema rank of the unit to keep in each schema,
 *        see filterUniteByInclusionInSchema(); the rank filter is only called when it is 0 or more
 * @param limit_cql optional CQL query restricting the matches; ignored when null or equal to ""
 * @param strict_inclusion forwarded to filterUniteByInclusion()
 * @param limit_distance rank of the unit to keep in each match, see filterUniteByInclusion()
 * @return the sorted list of selected units
 */
static def selectUnitsInSchema(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size,
		String unit_ursql, Integer limit_distance_in_schema, CQLQuery limit_cql, Boolean strict_inclusion, int limit_distance) {
	// same convention as selectSchemas(): no positive maximum means "no upper bound"
	if (maximum_schema_size == null || maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE

	def groupedUnits = []
	if (schema_ursql != null && schema_ursql.length() > 0 || minimum_schema_size > 1) {
		def allSchema = null;

		if (schema_ursql != null && schema_ursql.length() > 0) allSchema = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
		else allSchema = analecCorpus.getTousSchemas()
		if (debug >= 2) println "allSchema=${allSchema.size()}"

		if (minimum_schema_size > 1) allSchema = AnalecUtils.filterBySize(allSchema, minimum_schema_size, maximum_schema_size);
		if (debug >= 2) println "allSchema=${allSchema.size()}"

		groupedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchema, unit_ursql)

		if (limit_distance_in_schema >= 0) groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, limit_distance_in_schema)

	} else {
		// no schema criterion: a single group holding the units selected in the whole corpus
		groupedUnits = ["all":AnalecUtils.findAllInCorpus(debug, analecCorpus, Unite.class, unit_ursql)]
	}
	if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"

	def matches = null
	if (limit_cql != null && !limit_cql.getQueryString().equals("\"\"")) {
		// temporary subcorpus: always deleted, even if reading its matches fails
		Subcorpus limitssubcorpus = corpus.createSubcorpus(limit_cql, corpus.getName().toUpperCase())
		try {
			matches = limitssubcorpus.getMatches();
		} finally {
			limitssubcorpus.delete();
		}
	} else {
		matches = corpus.getMatches()
	}
	if (debug >= 2) println "matches=${matches}"

	def allUnits = []
	for (def k : groupedUnits.keySet()) {
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, groupedUnits[k], matches, strict_inclusion, limit_distance)
		allUnits.addAll(selectedUnits)
	}
	if (debug >= 2) println "selectedUnits=${allUnits.size()}"

	Collections.sort(allUnits)

	return allUnits
}
143
/**
 * Keeps at most one element per group, chosen by its rank in the group.
 *
 * A positive distance is a 1-based rank counted from the start of the group
 * (1 = first, 2 = second); a negative distance is counted from the end
 * (-1 = last, -2 = the one before the last). Groups too small to have an
 * element at the requested rank are mapped to an empty list.
 *
 * @param debug not used by this method
 * @param groups map of group key to the indexable list of its elements
 * @param distance 0 (or null) = no selection, the groups are returned unchanged
 * @return groups itself when there is no selection, otherwise a new map with
 *         the same keys, each mapped to a list of zero or one element
 */
static def filterUniteByInclusionInSchema(def debug, def groups, Integer distance) {
	if (distance == null || distance == 0) return groups;

	def newGroups = [:]
	for (def k : groups.keySet()) {
		def group = groups[k]
		int size = group.size()
		// translate the signed 1-based rank into a 0-based index
		int index = distance > 0 ? distance - 1 : size + distance
		newGroups[k] = (index >= 0 && index < size) ? [group[index]] : []
	}
	return newGroups
}
163

    
164
/**
 * Builds the arrays of start and end positions of the given units.
 *
 * @param selectedUnits the units to read; each one must answer getDeb() and getFin()
 * @return a 3-element list: the int[] of getDeb() values, the int[] of
 *         getFin() values (both in iteration order) and null in third position
 */
static def getStartsEndsTargetsArrays(def selectedUnits) {
	int count = selectedUnits.size()
	int[] starts = new int[count]
	int[] ends = new int[count]

	selectedUnits.eachWithIndex { unite, int i ->
		starts[i] = unite.getDeb()
		ends[i] = unite.getFin()
	}

	return [starts, ends, null]
}
175

    
176
/**
 * Builds a short description of an Analec element.
 *
 * The class of the element is compared exactly (no subclass match):
 * a Unite gives "deb-fin, props", a Relation gives "elt1=elt2 -> props"
 * (both ends being described recursively) and a Schema gives
 * "contentSize=props". Properties are printed sorted.
 *
 * @param e the element to describe
 * @return the description, or null when e is not exactly a Unite, a Relation or a Schema
 */
static String toString(Element e) {
	if (e.getClass() == Unite.class) {
		return sprintf("%d-%d, %s", e.getDeb(), e.getFin(), e.getProps().sort())
	}
	if (e.getClass() == Relation.class) {
		return sprintf("%s=%s -> %s", toString(e.getElt1()), toString(e.getElt2()), e.getProps().sort())
	}
	if (e.getClass() == Schema.class) {
		// the size is the number and the properties are the text: "%s=%d" made the formatter
		// fail, since the properties cannot be converted by %d
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
	}
	return null
}
186

    
187
/**
 * Lists the positions covered by a unit.
 *
 * The bounds are reordered when the unit is incoherent (getDeb() greater than
 * getFin()), so the result is always ascending.
 *
 * @param u the unit to read
 * @return every position from the smallest to the largest bound, both included
 */
static int[] toIntArray(Unite u) {
	int first = Math.min(u.getDeb(), u.getFin())
	int last = Math.max(u.getDeb(), u.getFin())
	return (first..last) as int[]
}
193

    
194
/**
 * Builds a description of an Analec element including the text it covers.
 *
 * The covered text is read with CQI.cpos2Str() for the qualified name of
 * wordProperty at the positions given by toIntArray(), and joined with
 * spaces. The class of the element is compared exactly (no subclass match):
 * a Unite gives "form deb-fin, props", a Relation gives the form and the
 * description of each end followed by its properties, a Schema gives
 * "contentSize=props" (no text is read for a schema).
 *
 * @param CQI the object answering cpos2Str(qualifiedPropertyName, positions)
 * @param wordProperty the word property to read; must answer getQualifiedName()
 * @param e the element to describe
 * @return the description, or null when e is not exactly a Unite, a Relation or a Schema
 */
static String toString(def CQI, def wordProperty, Element e) {
	if (e.getClass() == Unite.class) {
		def form = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e)), " ")
		return sprintf("%s %d-%d, %s", form, e.getDeb(), e.getFin(), e.getProps().sort())
	}
	if (e.getClass() == Relation.class) {
		def form1 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt1())), " ")
		def form2 = StringUtils.join(CQI.cpos2Str(wordProperty.getQualifiedName(), toIntArray(e.getElt2())), " ")
		return sprintf("%s=%s -> %s", form1+" "+toString(e.getElt1()), form2+" "+toString(e.getElt2()), e.getProps().sort())
	}
	if (e.getClass() == Schema.class) {
		// the size is the number and the properties are the text: "%s=%d" made the formatter
		// fail, since the properties cannot be converted by %d
		return sprintf("%d=%s", e.getContenu().size(), e.getProps().sort())
	}
	return null
}
208

    
209
/**
 * Finds the elements of the corpus selected by a URS query string.
 *
 * The query is split with getFilterParameters() and the three parts are
 * forwarded to the 6-argument findAllInCorpus().
 *
 * @param debug verbosity level; the split query is printed when it is 2 or more
 * @param analecCorpus the Analec corpus to search
 * @param elemClazz the class of elements to search
 * @param URSQL the URS query to split
 * @return the result of the 6-argument findAllInCorpus()
 */
static def findAllInCorpus(def debug, def analecCorpus, Class elemClazz, String URSQL) {
	def params = getFilterParameters(URSQL)
	if (debug >= 2) println "PARAMS=$params"

	def (typeRegex, propName, valueRegex) = params
	return findAllInCorpus(debug, analecCorpus, elemClazz, typeRegex, propName, valueRegex)
}
214

    
215
/**
 * Finds the elements of the corpus matching a type, a property and a value.
 *
 * The candidates are all the units, all the relations or all the schemas of
 * the corpus depending on elemClazz (compared exactly), or the three kinds
 * together when elemClazz is null. They are then filtered with filterElements().
 *
 * @param debug verbosity level forwarded to filterElements()
 * @param analecCorpus the Analec corpus to search
 * @param elemClazz Unite.class, Relation.class, Schema.class, or null for all kinds;
 *        any other class selects nothing
 * @param typeRegex regular expression on the element type; null or empty for no type filter
 * @param propName name of the property to test; null or empty for no property filter
 * @param valueRegex regular expression on the property value; null or empty to only test the property presence
 * @return the filtered elements
 */
static def findAllInCorpus(def debug, Corpus analecCorpus, Class elemClazz, String typeRegex, String propName, String valueRegex) {
	// start from an empty selection so that an unsupported class yields no element
	// instead of a null that breaks filterElements() and the callers
	def allElements = []

	if (elemClazz == null) {
		allElements.addAll(analecCorpus.getToutesUnites())
		allElements.addAll(analecCorpus.getToutesRelations())
		allElements.addAll(analecCorpus.getTousSchemas())
	} else if (elemClazz == Unite.class) {
		allElements = analecCorpus.getToutesUnites()
	} else if (elemClazz == Relation.class) {
		allElements = analecCorpus.getToutesRelations()
	} else if (elemClazz == Schema.class) {
		allElements = analecCorpus.getTousSchemas()
	}

	return filterElements(debug, allElements, typeRegex, propName, valueRegex);
}
235

    
236
/**
 * Keeps the elements whose number of underlying units lies within bounds.
 *
 * @param elements the elements to filter
 * @param minSize minimum number of underlying units (inclusive); null or negative means 0
 * @param maximum_schema_size maximum number of underlying units (inclusive); null or negative means no maximum
 * @return a new list of the kept elements, in iteration order
 */
static def filterBySize(def elements, Integer minSize, Integer maximum_schema_size) {
	int lowerBound = (minSize == null || minSize < 0) ? 0 : minSize
	int upperBound = (maximum_schema_size == null || maximum_schema_size < 0) ? Integer.MAX_VALUE : maximum_schema_size

	def kept = []
	for (Element candidate : elements) {
		Unite[] underlying = candidate.getUnitesSousjacentes();
		if (underlying.length >= lowerBound && underlying.length <= upperBound) {
			kept << candidate;
		}
	}
	return kept
}
250

    
251
/**
 * Groups units by CQP match.
 *
 * Units are first sorted by start then end position, then units and matches
 * are walked in parallel: a unit ending before the current match is skipped,
 * a unit starting after the current match moves the walk to the next match,
 * and any other unit overlaps the current match. An overlapping unit is
 * always kept when strict_inclusion is false; when it is true the unit is
 * kept only if it lies entirely inside the match.
 *
 * NOTE(review): the walk assumes the matches are ordered by position - confirm with callers.
 *
 * @param debug verbosity level; messages are printed when it is 2 or more, details when 3 or more
 * @param allUnites the units to group
 * @param matches the matches, indexable and answering size()
 * @param strict_inclusion true to keep only the units entirely inside a match
 * @return a LinkedHashMap of match index to the list of units kept for that match;
 *         index 0 is always present, and an index equal to the number of matches
 *         may be present with an empty list
 */
static def groupByMatch(def debug, def allUnites, def matches, boolean strict_inclusion) {
	if (debug >= 2) println "group "+allUnites.size()+" units with "+matches.size()+" strict=$strict_inclusion"
	allUnites = allUnites.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
	def unitsSize = allUnites.size()
	def iCurrentUnit = 0
	def selectedUnits = []

	def matchesSize = matches.size()
	def iCurrentMatch = 0

	def selectedUnitsPerMatch = new LinkedHashMap()
	selectedUnitsPerMatch[iCurrentMatch] = selectedUnits

	while (iCurrentMatch < matchesSize && iCurrentUnit < unitsSize) {
		if (debug >= 2) println "** M $iCurrentMatch < $matchesSize && U $iCurrentUnit < $unitsSize"

		Unite unit = allUnites[iCurrentUnit]
		Match match = matches[iCurrentMatch]
		if (debug >= 3) println ""+unit.getDeb()+"->"+unit.getFin()+"        "+match.getStart()+"->"+match.getEnd()
		if (unit.getFin() < match.getStart()) {
			// the unit ends before the match: try the next unit
			if (debug >= 3) println "next unit"
			iCurrentUnit++
		} else if (unit.getDeb() > match.getEnd()) {
			// the unit starts after the match: open the group of the next match
			if (debug >= 3) println "next match"
			iCurrentMatch++
			selectedUnits = []
			selectedUnitsPerMatch[iCurrentMatch] = selectedUnits
		} else {
			// the unit overlaps the match
			if (debug >= 3) println "iCurrentUnit=$iCurrentUnit        iCurrentMatch=$iCurrentMatch"
			if (strict_inclusion) {

				if (debug >= 3) println "m.start ${match.getStart()} <= u.deb ${unit.getDeb()} && u.fin ${unit.getFin()} <= m.end ${match.getEnd()}"
				if (match.getStart() <= unit.getDeb() && unit.getFin() <= match.getEnd()) {
					selectedUnits << unit
				}
			} else {
				selectedUnits << unit
			}

			iCurrentUnit++
		}
	}
	return selectedUnitsPerMatch
}
307

    
308
/**
 * Keeps the units located in the matches, optionally one unit per match.
 *
 * Units are grouped with groupByMatch(). With a limit_distance of 0 all
 * grouped units are returned. Otherwise one unit is kept per match, chosen
 * by its rank among the units of the match sorted by start then end
 * position: a positive rank is 1-based from the start (1 = first), a
 * negative rank is counted from the end (-1 = last). Matches without a unit
 * at the requested rank contribute nothing.
 *
 * @param debug verbosity level; the selected positions are printed when it is 3 or more
 * @param allUnites the units to filter
 * @param matches the matches delimiting the selection
 * @param strict_inclusion forwarded to groupByMatch()
 * @param limit_distance 0 for no rank selection, otherwise the signed rank of the unit to keep
 * @return the list of kept units
 */
static def filterUniteByInclusion(def debug, def allUnites, def matches, boolean strict_inclusion, int limit_distance) {

	def selectedUnitsPerMatch = groupByMatch(debug, allUnites, matches, strict_inclusion);
	def selectedUnits = []

	if (limit_distance == 0) {
		for (def m : selectedUnitsPerMatch.keySet()) selectedUnits.addAll(selectedUnitsPerMatch[m])
		return selectedUnits
	}

	for (def m : selectedUnitsPerMatch.keySet()) {
		def units = selectedUnitsPerMatch[m]
		int size = units.size()
		// translate the signed 1-based rank into a 0-based index; a rank reaching
		// outside the group (in either direction) selects nothing
		int index = limit_distance > 0 ? limit_distance - 1 : size + limit_distance
		if (index < 0 || index >= size) continue

		units = units.sort() { a, b -> a.getDeb() <=> b.getDeb() ?: a.getFin() <=> b.getFin() }
		selectedUnits << units[index]
		if (debug >=3) println "dist select: "+units[index].getDeb()
	}

	return selectedUnits
}
332

    
333
/**
 * Collects the underlying units of the elements selected by a URS query string.
 *
 * @param debug verbosity level forwarded to the helpers
 * @param elements the elements whose underlying units are read
 * @param URSQL the URS query, split with getFilterParameters()
 * @return the result of the 5-argument findAllUnitesInElements()
 */
static def findAllUnitesInElements(def debug, def elements, String URSQL) {
	def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
	return findAllUnitesInElements(debug, elements, typeRegex, propName, valueRegex)
}
337

    
338
/**
 * Collects, in a single list, the underlying units of the elements that pass filterElements().
 *
 * @param debug verbosity level forwarded to filterElements()
 * @param elements the elements whose underlying units are read
 * @param typeRegex regular expression on the unit type; null or empty for no type filter
 * @param propName name of the property to test; null or empty for no property filter
 * @param valueRegex regular expression on the property value; null or empty to only test the property presence
 * @return the list of selected units, element after element
 */
static def findAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
	def collected = []

	for (Element parent : elements) {
		def underlying = parent.getUnitesSousjacentes()
		collected.addAll(filterElements(debug, underlying, typeRegex, propName, valueRegex));
	}

	return collected;
}
347

    
348
/**
 * Groups the underlying units of each element, without any selection.
 *
 * Delegates to the 5-argument groupAllUnitesInElements() with empty type,
 * property and value criteria.
 *
 * @param debug verbosity level forwarded to the delegate
 * @param elements the elements whose underlying units are grouped
 * @return the map built by the delegate
 */
static def groupAllUnitesInElements(def debug, def elements) {
	String noCriterion = ""
	return groupAllUnitesInElements(debug, elements, noCriterion, noCriterion, noCriterion)
}
357

    
358
/**
 * Groups the underlying units of each element, keeping the units selected by a URS query.
 *
 * @param debug verbosity level forwarded to the helpers
 * @param elements the elements whose underlying units are grouped
 * @param URSQL the URS query, split with getFilterParameters()
 * @return the map built by the 5-argument groupAllUnitesInElements()
 */
static def groupAllUnitesInElements(def debug, def elements, String URSQL) {
	def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
	return groupAllUnitesInElements(debug, elements, typeRegex, propName, valueRegex)
}
369

    
370
/**
 * Groups the underlying units of each element, keeping the units that pass filterElements().
 *
 * @param debug verbosity level forwarded to filterElements()
 * @param elements the elements whose underlying units are grouped
 * @param typeRegex regular expression on the unit type; null or empty for no type filter
 * @param propName name of the property to test; null or empty for no property filter
 * @param valueRegex regular expression on the property value; null or empty to only test the property presence
 * @return a map of element to its selected underlying units, in iteration order of the elements
 */
static def groupAllUnitesInElements(def debug, def elements, String typeRegex, String propName, String valueRegex) {
	def unitsPerElement = [:]

	for (Element parent : elements) {
		def underlying = parent.getUnitesSousjacentes()
		unitsPerElement[parent] = filterElements(debug, underlying, typeRegex, propName, valueRegex);
	}

	return unitsPerElement;
}
379

    
380
/**
 * Splits a URS query of the form "type@prop=value" into its three parts.
 *
 * The first "@" and the first "=" of the query are the separators:
 * "type@prop=value" gives [type, prop, value], "type@prop" gives
 * [type, prop, ""], "type=value" gives [type, "", value] and "type" gives
 * [type, "", ""]. When the first "=" comes before the first "@", only the
 * "@" is used as a separator.
 *
 * @param URSQL the query to split; null is handled like the empty query
 * @return the list [type, prop, value]; missing parts are empty strings
 */
static def getFilterParameters(String URSQL) {
	// a missing query selects everything, like the empty query
	if (URSQL == null) return ["", "", ""]

	String type = "";
	String prop = "";
	String value = "";

	int atidx = URSQL.indexOf("@");
	int equalidx = URSQL.indexOf("=");

	if (atidx >= 0 && equalidx >= 0 && atidx < equalidx) {
		type = URSQL.substring(0, atidx)
		prop = URSQL.substring(atidx+1, equalidx)
		value = URSQL.substring(equalidx+1)
	} else if (atidx >= 0) {
		type = URSQL.substring(0, atidx)
		prop = URSQL.substring(atidx+1)
	} else if (equalidx >= 0) {
		type = URSQL.substring(0, equalidx)
		value = URSQL.substring(equalidx+1)
	} else {
		type = URSQL;
	}

	return [type, prop, value]
}
406

    
407
/**
 * Filters elements with a URS query string.
 *
 * @param debug verbosity level forwarded to the 5-argument filterElements()
 * @param allElements the elements to filter
 * @param URSQL the URS query, split with getFilterParameters()
 * @return the result of the 5-argument filterElements()
 */
static def filterElements(def debug, def allElements, String URSQL) {
	def (typeRegex, propName, valueRegex) = getFilterParameters(URSQL)
	return filterElements(debug, allElements, typeRegex, propName, valueRegex)
}
411

    
412
/**
 * Filters elements by type, then by property.
 *
 * Type step (skipped when typeRegex is null or empty): keeps the elements
 * whose type fully matches typeRegex. Property step (skipped when propName is
 * null or empty): with a value regex, keeps the elements whose value for
 * propName is not null and fully matches it; without one, keeps the elements
 * that own the property. When both steps are skipped the given elements are
 * returned as they are.
 *
 * @param debug verbosity level; the size of each step result is printed when it is 2 or more
 * @param allElements the elements to filter
 * @param typeRegex regular expression on the element type
 * @param propName name of the property to test
 * @param valueRegex regular expression on the value of the property
 * @return the elements kept by the applied steps
 */
static def filterElements(def debug, def allElements, String typeRegex, String propName, String valueRegex) {
	if (debug >= 2) println "filtering "+allElements.size()+" elements with typeRegex='$typeRegex' propName='$propName' and valueRegex='$valueRegex'"

	if (typeRegex) {
		def typePattern = /$typeRegex/
		def keptByType = []
		for (Element candidate : allElements) {
			if (candidate.getType() ==~ typePattern) keptByType << candidate
		}
		allElements = keptByType;
	}
	if (debug >= 2) println " type step result: "+allElements.size()

	if (propName) {
		def keptByProp = []
		if (valueRegex) {
			// the property must be set and its value must match
			def valuePattern = /$valueRegex/
			for (Element candidate : allElements) {
				def current = candidate.getProp(propName)
				if (current != null && current ==~ valuePattern) keptByProp << candidate
			}
		} else {
			// the property only has to exist
			for (Element candidate : allElements) {
				if (candidate.getProps().containsKey(propName)) keptByProp << candidate
			}
		}
		allElements = keptByProp;
	}
	if (debug >= 2) println " prop&value step result: "+allElements.size()

	return allElements;
}
450

    
451
/**
 * Builds the CQL query of the units with the default options of the
 * 4-argument getCQL(): onePosition false and limitNumberOfUnit true.
 *
 * @param name label used in the messages printed by the 4-argument getCQL()
 * @param unites the units to write in the query
 * @return the query built by the 4-argument getCQL()
 */
static def getCQL(String name, def unites) {
	boolean onePosition = false
	boolean limitNumberOfUnit = true
	return getCQL(name, unites, onePosition, limitNumberOfUnit)
}
454

    
455
/**
 * Builds a CQL query designating the given units.
 *
 * The query has the form "left::right". The left part declares, once per
 * unit size, a labelled pattern of that many tokens (for instance "a:[]" or
 * "c:[][][]"), the label being the letter at rank size in the alphabet used
 * here. The right part lists, for each unit, "letter=start position". The
 * alternatives of each part are separated by "|". Units are sorted by start
 * position first (the given list is sorted in place).
 *
 * The query is cut as soon as adding a unit would reach the maximum query
 * size; a message telling how many units were written is then printed.
 * Units with inverted bounds are reported and skipped.
 *
 * @param name label used in the printed messages
 * @param unites the units to write in the query
 * @param onePosition true to declare a single token per unit (only the first position is used)
 * @param limitNumberOfUnit not used by this method
 * @return the CQL query
 */
static def getCQL(String name, def unites, boolean onePosition, boolean limitNumberOfUnit) {
	def letters = "abcdefghijklmnopqrstu"//vwxyz0123456789"
	def MAXCQLQUERYSIZE = 1200 // 1150 // 1200 in fact

	int n = 0 // number of units written in the query so far

	String totalleftquery = ""
	String totalrightquery = ""
	unites.sort() { it.getDeb() }
	def declaredsizes = []
	for (Unite unite : unites) {
		int size = unite.getFin() - unite.getDeb() + 1
		// a size of 0 has no label either (it would read the letter at index -1),
		// unless onePosition forces the size to 1 below
		if (size < 0 || (size == 0 && !onePosition)) {
			println sprintf("** Warning: incoherent unit %s [%d, %d], size = "+size, unite.getProps(),unite.getDeb(), unite.getFin())
			continue
		}
		if (onePosition) size = 1 // hack only the 1st position is needed for the Progression
		// units longer than the number of letters share one clamped size
		if (size > letters.length()) size = letters.length()-1
		String letter = ""+letters.charAt(size-1)
		String rightquery = letter+"="+unite.getDeb()

		// the pattern of a size is declared only the first time the size is met
		String leftquery = ""
		if (!declaredsizes.contains(size)) {
			declaredsizes << size

			if (size == 1)
				leftquery = letter+":[]"
			else if (size == 2)
				leftquery = letter+":[][]"
			else if (size == 3)
				leftquery = letter+":[][][]"             // [][][][]
			else
				leftquery = letter+":[][]{"+(size-1)+"}" // [][]{4}
		}

		if ((totalleftquery.length() + totalrightquery.length() + 2
		+ leftquery.length() + rightquery.length()) >= MAXCQLQUERYSIZE) {
			System.out.println("** $name : trop d'éléments pour la requête. Seuls les "+n+" premiers éléments sur ${unites.size()} seront affichés dans le graphique de progression.")
			break
		}

		if (n > 0) {
			if (leftquery.length() > 0) totalleftquery += "|"
			totalrightquery += "|"
		}
		if (leftquery.length() > 0) totalleftquery += leftquery
		totalrightquery += rightquery

		n += 1
	}
	String query = totalleftquery+"::"+totalrightquery
	return query
}