Revision 2167

tmp/org.txm.analec.rcp/src/org/txm/macro/urs/AnalecUtils.groovy (revision 2167)
11 11

  
12 12

  
13 13
/**
 * Overload of isPropertyDefined that takes one URSQL string instead of separate
 * type and property expressions.
 *
 * Returns an empty HashSet when ursql is null or empty. Otherwise the string is
 * split with getFilterParameters and the call is forwarded to the four-argument
 * isPropertyDefined, using params[0] as the type regexp and params[1] as the
 * property regexp.
 *
 * NOTE(review): this listing looks like a changeset diff view; the rows that hold
 * only numbers appear to be old/new line-number columns, which would explain why
 * the null/empty guard is listed twice -- TODO confirm against the real file.
 *
 * @param clazz element class forwarded unchanged to the four-argument overload
 * @param analecCorpus corpus forwarded unchanged to the four-argument overload
 * @param ursql URSQL expression to split; may be null or empty
 * @return an empty HashSet for a null/empty ursql, else whatever the four-argument overload returns
 */
static def isPropertyDefined(Class clazz, Corpus analecCorpus, String ursql) {
14
	if (ursql == null || ursql.length() == 0) return new HashSet() 
14
	if (ursql == null || ursql.length() == 0) return new HashSet()
15 15
	// params[0] and params[1] are the only entries of the parsed result read below
	def params = getFilterParameters(ursql)
16 16
	def typeRegexp = params[0]
17 17
	def propRegexp = params[1]
18
	// trace of the parsed parameter list on standard output
	println "params=$params"
19 18
	return isPropertyDefined(clazz, analecCorpus, typeRegexp, propRegexp)
20 19
}
21 20

  
......
56 55
	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE;
57 56
	if (minimum_schema_size < 0) minimum_schema_size = 0;
58 57
	def allSchemas = []
59
	
58

  
60 59
	if (schema_ursql != null && schema_ursql.length() > 0) allSchemas = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
61 60
	else allSchemas = analecCorpus.getTousSchemas()
62 61

  
63 62
	if (debug >= 2) println "allSchemas=${allSchemas.size()}"
64 63
	allSchemas = AnalecUtils.filterBySize(allSchemas, minimum_schema_size, maximum_schema_size);
65
	
64

  
66 65
	return allSchemas
67 66
}
68 67

  
69 68
/**
 * Selects the schemas whose number of units retained by filterUniteByInclusion,
 * against the matches of the given CQP corpus, lies between minimum_schema_size
 * and maximum_schema_size (both bounds inclusive).
 *
 * Candidate schemas come from AnalecUtils.findAllInCorpus(debug, analecCorpus,
 * Schema.class, schema_ursql) when schema_ursql is non-empty, otherwise from
 * analecCorpus.getTousSchemas().
 *
 * NOTE(review): this listing looks like a changeset diff view; the rows that hold
 * only numbers appear to be old/new line-number columns, which would explain why
 * the signature continuation is listed twice -- TODO confirm against the real file.
 *
 * @param debug debug level, forwarded to the AnalecUtils helpers
 * @param analecCorpus corpus the schemas are read from
 * @param corpus CQP corpus whose getMatches() result is used for the inclusion test
 * @param schema_ursql schema selection expression; null or empty means all schemas
 * @param minimum_schema_size lower bound on retained units; a negative value is replaced by 0
 * @param maximum_schema_size upper bound on retained units; a value <= 0 is replaced by Integer.MAX_VALUE
 * @param strictInclusion forwarded to filterUniteByInclusion
 * @return the list of schemas that passed the size test
 */
static def selectSchemasInCorpus(def debug, Corpus analecCorpus, org.txm.searchengine.cqp.corpus.CQPCorpus corpus,
70
	String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size, boolean strictInclusion) {
71
		
69
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size, boolean strictInclusion) {
70

  
72 71
	// normalise the bounds: a non-positive maximum means "no upper limit", a negative minimum means 0
	if (maximum_schema_size <= 0) maximum_schema_size = Integer.MAX_VALUE;
73 72
	if (minimum_schema_size < 0) minimum_schema_size = 0;
74
		
73

  
75 74
	def allSchemas = []
76 75
	if (schema_ursql != null && schema_ursql.length() > 0) allSchemas = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
77 76
	else allSchemas = analecCorpus.getTousSchemas()
78
	
77

  
79 78
	def selectedSchemas = []
80 79
	for (Schema schema : allSchemas) {
81 80
		// the size test below counts the units retained by filterUniteByInclusion, not all units of the schema;
		// the last argument is a literal 0 (presumably the position_in_matches slot -- TODO confirm)
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, schema.getUnitesSousjacentes(), corpus.getMatches(), strictInclusion, 0)
82
		
81

  
83 82
		if (minimum_schema_size <= selectedUnits.size() && selectedUnits.size() <= maximum_schema_size ) {
84 83
			selectedSchemas << schema
85 84
		}
86 85
	}
87
	
86

  
88 87
	return selectedSchemas
89 88
}
90 89

  
......
107 106
		String schema_ursql, Integer minimum_schema_size, Integer maximum_schema_size,
108 107
		String unit_ursql, Integer position_in_schema, CQLQuery cql_limit, Boolean strict_inclusion, int position_in_matches) {
109 108
	def groupedUnits = []
110
	if (schema_ursql != null && schema_ursql.length() > 0 || minimum_schema_size > 1) {
109
	if (schema_ursql != null && schema_ursql.length() > 0) {
111 110
		def allSchema = null;
112 111

  
113 112
		if (schema_ursql != null && schema_ursql.length() > 0) allSchema = AnalecUtils.findAllInCorpus(debug, analecCorpus, Schema.class, schema_ursql)
114 113
		else allSchema = analecCorpus.getTousSchemas()
115 114
		if (debug >= 2) println "allSchema=${allSchema.size()}"
116 115

  
117
		allSchema = AnalecUtils.filterBySize(allSchema, minimum_schema_size, maximum_schema_size);
118
		if (debug >= 2) println "allSchema=${allSchema.size()}"
119

  
120 116
		groupedUnits = AnalecUtils.groupAllUnitesInElements(debug, allSchema, unit_ursql)
117
		if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"
121 118
		
122
		if (position_in_schema >= 0) groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, position_in_schema)
119
		groupedUnits = AnalecUtils.filterUniteByInclusionInSchema(debug, groupedUnits, position_in_schema)
120
		if (debug >= 2) println "groupedUnits=${groupedUnits.size()}"
123 121

  
122

  
124 123
	} else {
125 124
		groupedUnits = ["all":AnalecUtils.findAllInCorpus(debug, analecCorpus, Unite.class, unit_ursql)]
126 125
	}
......
139 138
	def allUnits = []
140 139
	for (def k : groupedUnits.keySet()) {
141 140
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, groupedUnits[k], matches, strict_inclusion, position_in_matches)
142
		allUnits.addAll(selectedUnits)
141

  
142
		if (minimum_schema_size <= selectedUnits.size() && selectedUnits.size() <= maximum_schema_size ) {
143
			allUnits.addAll(selectedUnits)
144
		} else {
145

  
146
		}
143 147
	}
144 148
	if (debug >= 2) println "selectedUnits=${allUnits.size()}"
145 149

  
146 150
	Collections.sort(allUnits)
147
	
151

  
148 152
	return allUnits
149 153
}
150 154
/**
......
156 160
 * @return
157 161
 */
158 162
static def filterUniteByInclusionInSchema(def debug, def groups, Integer distance) {
163
	println "dist=$distance"
159 164
	if (distance == 0) return groups;
160 165
	if (distance > 0) distance = distance-1;
161 166
	def newGroups = [:]
......
166 171
			continue;
167 172
		}
168 173
		def indexes = null
169
		if (distance > 0) {
174
		if (distance >= 0) {
170 175
			indexes = 0..Math.min(distance, group.size())
171 176
		} else {
172 177
			indexes = Math.max(distance, -group.size())..-1
173 178
		}
179

  
174 180
		newGroups[k] = group[indexes];
175 181
	}
176 182
	return newGroups
......
253 259

  
254 260
	def filteredElements = []
255 261
	for (Element e : elements) {
256
		Unite[] units = e.getUnitesSousjacentes();
257
		int size = units.length;
258
		if (size < minimum_schema_size) continue;
259
		if (size > maximum_schema_size) continue;
260
		filteredElements << e;
262
		Unite[] selectedUnits = e.getUnitesSousjacentes();
263
		int size = selectedUnits.length;
264
		if (minimum_schema_size <= selectedUnits.size() && selectedUnits.size() <= maximum_schema_size ) {
265
			filteredElements << e
266
		}
261 267
	}
262 268
	return filteredElements
263 269
}
......
288 294
	selectedUnitsPerMatch[iCurrentMatch] = selectedUnits
289 295

  
290 296
	while (iCurrentMatch < matchesSize && iCurrentUnit < unitsSize) {
291
		if (debug >= 2) println "** M $iCurrentMatch < $matchesSize && U $iCurrentUnit < $unitsSize"
297
		if (debug >= 3) println "** M $iCurrentMatch < $matchesSize && U $iCurrentUnit < $unitsSize"
292 298

  
293 299
		Unite unit = allUnites[iCurrentUnit]
294 300
		Match match = matches[iCurrentMatch]
295 301
		if (debug >= 3) println ""+unit.getDeb()+"->"+unit.getFin()+"	"+match.getStart()+"->"+match.getEnd()
296 302
		if (unit.getFin() < match.getStart()) {
297 303
			if (debug >= 3) "println next unit"
298
			
299
			iCurrentUnit++
304

  
305
				iCurrentUnit++
300 306
		} else if (unit.getDeb() > match.getEnd()) {
301 307
			if (debug >= 3) "println next match"
302
			
303
			iCurrentMatch++
308

  
309
				iCurrentMatch++
304 310
			selectedUnits = []
305 311
			selectedUnitsPerMatch[iCurrentMatch] = selectedUnits
306 312
		} else {
......
407 413
	if (!eq) {
408 414
		equal_start_idx--
409 415
	}
410
	 
416

  
411 417
	if (atidx >= 0 && equal_start_idx >= 0 && atidx < equal_start_idx) { // TYPE@PROP=VALUE
412 418
		type = URSQL.substring(0, atidx)
413 419
		prop = URSQL.substring(atidx+1, equal_start_idx)
......
432 438
}
433 439

  
434 440
static def filterElements(def debug, def allElements, String typeRegex, String propName, boolean eq, String valueRegex) {
435
	if (debug >= 2) println "filtering "+allElements.size()+" elements with typeRegex='$typeRegex' propName='$propName' and valueRegex='$valueRegex'"
441
	if (debug >= 3) println "filtering "+allElements.size()+" elements with typeRegex='$typeRegex' propName='$propName' and valueRegex='$valueRegex'"
436 442
	if (typeRegex != null && typeRegex.length() > 0) {
437 443
		def filteredElements = []
438 444
		def matcher = /$typeRegex/
......
444 450

  
445 451
		allElements = filteredElements;
446 452
	}
447
	if (debug >= 2) println " type step result: "+allElements.size()
453
	if (debug >= 3) println " type step result: "+allElements.size()
448 454

  
449 455
	if (propName != null && propName.length() > 0) {
450 456
		def filteredElements = []
......
468 474

  
469 475
		allElements = filteredElements;
470 476
	}
471
	if (debug >= 2) println " prop&value step result: "+allElements.size()
477
	if (debug >= 3) println " prop&value step result: "+allElements.size()
472 478
	return allElements;
473 479
}
474 480

  
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/BuildSectionsMacro.groovy (revision 2167)
11 11
// Création des 4 sous-corpus de chapitres
12 12

  
13 13
[1, 2, 3, 4].each { chapitre ->
14
	res = gse.run(SubCorpusMacro, ["args":[ "corpus":"DIDEROTESSAIS", "name":"Chapitre "+chapitre, "query":"[_.div_n1=\""+chapitre+"\"] expand to div1" ],
14
	res = gse.run(SubCorpusMacro, ["args":["name":"Chapitre "+chapitre, "query":"[_.div_n1=\""+chapitre+"\"] expand to div1" ],
15 15
                                "selection":selection,
16 16
                                "selections":selections,
17 17
                                "corpusViewSelection":corpusViewSelection,
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemasSummaryMacro.groovy (revision 2167)
73 73

  
74 74
// tabulate summary
75 75
def keys = allresults.keySet().sort();
76
println "Statistiques des schémas de "+corpusViewSelections.join(",")
76
println "Statistiques des schémas de "+corpusViewSelections.join(", ")
77 77
println "\t"+keys.join("\t")
78 78

  
79 79
// number of schemas
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemaLengthsMacro.groovy (revision 2167)
74 74
		return;
75 75
	}
76 76

  
77
	//println([debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion])
77 78
	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);
78 79

  
79 80
	int nSchemas = 0;
......
97 98

  
98 99
		lens[nUnites] = lens[nUnites] + 1;
99 100
		String value = schema.getProp(schema_property_display);
100
		if (value.contains(",")) value = "\"$value\""
101
		//if (value.contains(",")) value = "\"$value\""
101 102
		lensnames[nUnites] << value
102 103
		nSchemas++;
103 104
	}
......
136 137
	println "Index:\nlen\tfreq\tcfreq"
137 138
	int nShown = 0;
138 139
	for( def it : slens) {
139
		println it.key+"	"+it.value+"	"+(cfreq+=it.value)+"	"+lensnames[it.key].join(", ")
140
		def names = lensnames[it.key]
141
		names = names.sort()
142
		println it.key+"	"+it.value+"	"+(cfreq+=it.value)+"	\""+names.join("\", \"")+"\""
140 143
		nShown++
141 144
		if (max_lines > 0 && nShown >= max_lines) {
142 145
			println "... (${slens.size() - max_lines})"
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/UnitsIndexMacro.groovy (revision 2167)
173 173
else println "Index de la propriété $unit_property_display des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
174 174
println "$unit_property_display\t"+selection.join("\t")
175 175

  
176
int total_freq = 0
177 176
keys.eachWithIndex { prop_val, i ->
178 177
	String line = ""
179 178
	if (prop_val.size() > 0) {
......
189 188

  
190 189
		matrix.set(i, j, freq)
191 190
		line += "\t"+freq
192
		total_freq += freq
193 191
		if (min < freq) min = freq
194 192
	}
195 193
	if (min >= output_fmin)
196 194
		println "$line"
197 195
}
198
println "\nTOTAL=$total_freq"
199 196
props = keys
200 197

  
201 198
def r = RWorkspace.getRWorkspaceInstance()
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemasIndexMacro.groovy (revision 2167)
1
// Copyright © 2019 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5

  
6
// STANDARD DECLARATIONS
7
package org.txm.macro.urs.exploit
8

  
9
import groovy.transform.Field
10

  
11
import org.apache.commons.lang.StringUtils
12
import org.kohsuke.args4j.*
13
import org.txm.Toolbox
14
import org.txm.annotation.urs.*
15
import org.txm.lexicaltable.core.functions.LexicalTable
16
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl
17
import org.txm.macro.urs.*
18
import org.txm.rcp.commands.*
19
import org.txm.rcp.swt.widget.parameters.*
20
import org.txm.rcp.views.corpora.CorporaView
21
import org.txm.searchengine.cqp.corpus.*
22
import org.txm.searchengine.cqp.CQPSearchEngine
23
import org.txm.statsengine.r.core.RWorkspace
24

  
25
import visuAnalec.donnees.*
26
import visuAnalec.elements.*
27
import cern.colt.matrix.DoubleFactory2D
28
import cern.colt.matrix.DoubleMatrix2D
29

  
30
// Build the list of corpora to process from the Corpus view selection:
// a CQPCorpus is kept as is, a Partition contributes all of its parts (getParts());
// any other kind of selected object is ignored.
def selection = []
31
for (def s : corpusViewSelections) {
32
	if (s instanceof CQPCorpus) selection << s
33
	else if (s instanceof Partition) selection.addAll(s.getParts())
34
}
35

  
36
// Nothing usable selected: print a hint and end the script with false.
// Otherwise call compute(false) on every selected corpus before going on.
if (selection.size() == 0) {
37
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
38
	return false
39
} else {
40
	for (def c : selection) c.compute(false)
41
}
42

  
43
// BEGINNING OF PARAMETERS
// Each script field below is declared with an args4j-style @Option annotation giving its
// name, usage text, widget kind and default value; the fields are read after
// ParametersDialog.open(this) further down.
44
// schema selection expression, usage "TYPE@PROP=VALUE", default "CHAINE"
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=true, def="CHAINE")
45
String schema_ursql
46
// minimal schema size, default 3
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
47
int minimum_schema_size
48
// maximal schema size, default 9999999
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
49
int maximum_schema_size
50
// schema property whose values are counted and displayed, default "REF"
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
51
String schema_property_display
52
// unit selection expression, usage "TYPE@PROP=VALUE", optional, default "MENTION"
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
53
String unit_ursql
54
// whether units must be strictly included in the corpus matches, default true
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
55
boolean strict_inclusion
56
// minimal frequency displayed, optional, default 0 (field declared without an explicit type)
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=false, def="0")
57
output_fmin
58
// debug verbosity chosen among the tab-separated labels OFF, ON, ALL, REALLY ALL (field declared without an explicit type)
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
59
debug
60

  
61
// stop the script when ParametersDialog.open returns false (presumably the dialog was cancelled -- TODO confirm)
if (!ParametersDialog.open(this)) return;
62
// replace the debug label by a numeric level: OFF=0, ON=1, ALL=2, REALLY ALL=3
if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
63

  
64
// allresults maps each processed corpus to its counts map
// (displayed property value -> number of selected schemas).
def allresults = [:]
65
for (def corpus : selection) {
66
	def analecCorpus = URSCorpora.getCorpus(corpus)
67
	
68
	// Validation: each isPropertyDefined call returns a collection that is treated as a list of
	// problems; when it is not empty a message is printed and the bare `return` ends the whole
	// script, not only this loop iteration.
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
69
	if (errors.size() > 0) {
70
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
71
		return;
72
	}
73
	
74
	// NOTE(review): a bare property name is passed here where the other two calls pass a URSQL
	// expression; how getFilterParameters interprets it is not visible in this script -- TODO confirm
	errors=AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_property_display)
75
	if (errors.size() > 0) {
76
		println "** $schema_property_display schema property not found in the corpus with types: $errors."
77
		return;
78
	}
79
	
80
	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
81
	if (errors.size() > 0) {
82
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
83
		return;
84
	}
85
	
86
	// NOTE(review): CQI is not read anywhere else in the visible part of this script
	def CQI = CQPSearchEngine.getCqiClient()
87
		
88
	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);
89
	// order the schemas using getProps() as the sort key
	schemas.sort() {it.getProps()}
90
	
91
	// count the schemas per value of the displayed property; a null value is counted under "<null>"
	def counts = schemas.countBy { if (it.getProp(schema_property_display) == null) "<null>" else it.getProp(schema_property_display) }
92
	
93
	// move the count of the empty-string value under the "<empty>" key
	if (counts.containsKey("")) counts["<empty>"] = counts[""];
94
	counts.remove("")
95
	
96
	// debug is numeric after the label mapping, so this runs for any non-zero level
	if (debug) {
97
		println "\n*** Statistics: "
98
		for (def k : counts.keySet()) {
99
			// NOTE(review): the "" key was removed just above, so the N/A branch looks unreachable for String keys
			if (k.size() > 0) { println "$k\t"+counts[k] }else{ println "N/A\t"+counts[k] }
100
		}
101
	}
102

  
103
	allresults[corpus] = counts
104
}
105

  
106
// Tabulation step: gather every property value seen in any corpus, then print one row per
// value with one frequency column per corpus, filling `matrix` with the same numbers.
if (debug) println "RESULTS=$allresults"
107
// union of the property values found in all per-corpus counts maps
def keys = new HashSet();
108
for (def corpus : allresults.keySet()) keys.addAll(allresults[corpus].keySet())
109
// no value at all: report it and end the script
if (keys.size() == 0) {
110
	println "No values for $schema_property_display property"
111
	return
112
}
113
// sorted key sequence; its order is the row order of both the printed table and the matrix
keys = keys.sort()
114
// dense matrix with one row per property value and one column per selected corpus;
// mFactory and matrix are assigned without `def`
mFactory = DoubleFactory2D.dense
115
matrix = mFactory.make(keys.size(), selection.size())
116

  
117
if (debug) println "KEYS=$keys"
118

  
119

  
120

  
121
// NOTE(review): title is built from unit_ursql and schema_property_display but is not read
// again in the visible part of this script
def title = "${selection}\n${unit_ursql}"
122
title += " ${schema_property_display} frequencies"
123

  
124
// header lines: a caption, then the property name followed by one tab-separated column per corpus
println "Index de la propriété $schema_property_display des schemas $schema_ursql de ${selection}"
125
println "$schema_property_display\t"+selection.join("\t")
126

  
127
keys.eachWithIndex { prop_val, i ->
128
	// row label: the property value, or "N/A" when it is empty
	String line = ""
129
	if (prop_val.size() > 0) {
130
		line += prop_val
131
	} else {
132
		line += "N/A"
133
	}
134

  
135
	// despite its name, `min` starts at 0 and is raised to each larger frequency,
	// so it ends up holding the largest frequency of the row
	int min = 0;
136
	allresults.keySet().eachWithIndex { c, j ->
137
		// a value absent from corpus c counts as 0
		def freq = allresults[c][prop_val]
138
		if (freq == null) freq = 0

  
140
		// row i = property value, column j = position of the corpus in allresults.keySet()
		matrix.set(i, j, freq)
141
		line += "\t"+freq
142
		if (min < freq) min = freq
143
	}
144
	// the row is printed only when its largest frequency reaches output_fmin
	if (min >= output_fmin)
145
		println "$line"
146
}
147
// props is assigned without `def`; nothing in the visible part of the script reads it afterwards
props = keys
148

  
149

  
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/UnitsSummaryMacro.groovy (revision 2167)
72 72
	def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, 
73 73
	unit_ursql, position_in_schema, cql_limit, strict_inclusion, position_in_matches);
74 74

  
75
	if (debug) println "found ${selectedUnits.size()} units in $corpus}"
75 76
	allresults[corpus] = selectedUnits;
76 77
}
77 78

  
......
91 92
		for (def u : allresults[k]) {
92 93
			propsvalues.add(u.getProp(p))
93 94
		}
94
		if (debug) println "k @p values="+propsvalues
95 95
		print "\t"+propsvalues.size();
96
		def values = []
97
		values.addAll(propsvalues)
98
		values = values.sort()
99
		if (debug) print "\t\""+values.join("\", \"")+"\"";
96 100
	}
97 101
	println ""
98 102
}

Also available in: Unified diff