Révision 2147

tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemasRatioMacro.groovy (revision 2147)
1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5

  
6
// STANDARD DECLARATIONS
7
package org.txm.macro.urs.exploit
8

  
9
import org.kohsuke.args4j.*
10
import groovy.transform.Field
11
import org.txm.rcp.swt.widget.parameters.*
12
import org.txm.annotation.urs.*
13
import org.txm.macro.urs.AnalecUtils
14
import visuAnalec.elements.*
15
import org.txm.searchengine.cqp.corpus.*
16
import org.txm.Toolbox
17
import org.txm.rcp.commands.*
18
import org.txm.statsengine.r.core.RWorkspace
19

  
20
def selection = []
21
for (def s : corpusViewSelections) {
22
	if (s instanceof CQPCorpus) selection << s
23
	else if (s instanceof Partition) selection.addAll(s.getParts())
24
}
25

  
26
if (selection.size() == 0) {
27
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
28
	return false
29
} else {
30
	for (def c : selection) c.compute(false)
31
}
32

  
33
@Field @Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
34
		String schema_ursql
35
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
36
		int minimum_schema_size
37
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
38
		int maximum_schema_size
39
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
40
		String schema_property_display
41
		
42
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
43
		String unit_ursql
44
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
45
		boolean strict_inclusion
46
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
47
		int position_in_matches
48
		
49
@Field @Option(name="units_margin_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
50
		String units_margin_ursql
51
@Field @Option(name="margin_strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
52
		boolean margin_strict_inclusion
53
@Field @Option(name="margin_position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
54
		int margin_position_in_matches
55
		
56
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq	len", required=false, def="")
57
		def sep
58
// Column used to order the printed lines: "score" sorts by descending score,
// any other value sorts by the displayed schema property (see the sort step below).
@Field @Option(name="sort_column", usage="choose between the score or name columns", widget="StringArray", metaVar="score	name", required=true, def="score")
		String sort_column
60
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
61
		int max_lines
62
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
63
		output_graph = false
64
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
65
		debug
66

  
67
if (!ParametersDialog.open(this)) return;
68
if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
69

  
70
for (def corpus : selection) {
71
	def analecCorpus = URSCorpora.getCorpus(corpus)
72

  
73
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
74
	if (errors.size() > 0) {
75
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
76
		return;
77
	}
78

  
79
	errors=AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
80
	if (errors.size() > 0) {
81
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
82
		return;
83
	}
84

  
85
	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);
86

  
87
	int nSchemas = 0;
88

  
89
	def scores = [:]
90
	def fs = [:]
91
	def fs_margin = [:]
92
	for (def schema : schemas) {
93

  
94
		def allUnites = schema.getUnitesSousjacentesNonTriees()
95
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)
96
		selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)
97
		
98
		def units_margin = AnalecUtils.filterElements(debug, allUnites, units_margin_ursql)
99
		selectedUnits_margin = AnalecUtils.filterUniteByInclusion(debug, units_margin, corpus.getMatches(), margin_strict_inclusion, margin_position_in_matches)
100
		
101
		// Default score is 0; it is replaced below only when the margin selection
		// is non-empty, which also avoids a division by zero.
		// (The former containsKey() guard was dead code: the value was reset to 0 right after it.)
		scores[schema] = 0;
106
		fs[schema] = selectedUnits.size();
107
		fs_margin[schema] = selectedUnits_margin.size();
108
		if (selectedUnits_margin.size() > 0) {
109
			scores[schema] = (float)selectedUnits.size() / (float)selectedUnits_margin.size()
110
		}
111
		
112
		nSchemas++;
113
	}
114

  
115
	if ("score".equals(sort_column)) {
116
		scores = scores.sort { a, b -> -a.value <=> -b.value ?: -fs[a.key]<=> -fs[b.key] ?: a.key.getProps()[schema_property_display].compareTo(b.key.getProps()[schema_property_display]) }
117
	} else {
118
		scores = scores.sort { a, b -> a.key.getProps()[schema_property_display].compareTo(b.key.getProps()[schema_property_display]) ?: -a.value <=> -b.value ?: -fs[a.key]<=> -fs[b.key]}
119
	}
120
	
121
	println "Schema scores in '$corpus'."
122
	println "${schema_property_display}\tF\tF margin\tscore"
123
	int nShown = 0;
124
	for( def it : scores) {
125
		println ""+it.key.getProps()[schema_property_display]+"\t"+fs[it.key]+"\t"+fs_margin[it.key]+"\t"+String.format( "%.2f", it.value * 100.0d )+"%"
126
		nShown++
127
		// Stop after max_lines lines (0 = no limit) and report how many lines were left out.
		// The extra size check avoids printing a misleading "... (0)" when the number of
		// results is exactly max_lines; in that case the loop simply ends on its own.
		if (max_lines > 0 && nShown >= max_lines && nShown < scores.size()) {
			println "... (${scores.size() - max_lines})"
			break;
		}
131
	}
132

  
133
	//return ["result":coef, "data":lens]
134
}
135

  
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemaLengthsMacro.groovy (revision 2147)
45 45
		boolean strict_inclusion
46 46
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
47 47
		int position_in_matches
48
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq	len", required=true, def="")
48
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq	len", required=false, def="")
49 49
		def sep
50 50
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq	len", required=true, def="freq")
51 51
		String sort_column
tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/exploit/SchemaAccessibilityScoresMacro.groovy (revision 2147)
1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5

  
6
// STANDARD DECLARATIONS
7
package org.txm.macroprototypes.urs.exploit
8

  
9
import org.kohsuke.args4j.*
10
import groovy.transform.Field
11
import org.txm.rcp.swt.widget.parameters.*
12
import org.txm.annotation.urs.*
13
import org.txm.macro.urs.AnalecUtils
14
import visuAnalec.elements.*
15
import org.txm.searchengine.cqp.corpus.*
16
import org.txm.Toolbox
17
import org.txm.rcp.commands.*
18
import org.txm.statsengine.r.core.RWorkspace
19

  
20
def selection = []
21
for (def s : corpusViewSelections) {
22
	if (s instanceof CQPCorpus) selection << s
23
	else if (s instanceof Partition) selection.addAll(s.getParts())
24
}
25

  
26
if (selection.size() == 0) {
27
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
28
	return false
29
} else {
30
	for (def c : selection) c.compute(false)
31
}
32

  
33
@Field @Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
34
		String schema_ursql
35

  
36
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
37
		int minimum_schema_size
38
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
39
		int maximum_schema_size
40
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
41
		String schema_property_display
42
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
43
		String unit_ursql
44
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
45
		boolean strict_inclusion
46
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
47
		int position_in_matches
48
// Separator entry of the parameters dialog. It is not read anywhere in the script below,
// so it is declared optional, as the same option is in the sibling URS macros.
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq	len", required=false, def="")
		def sep
50
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq	len", required=true, def="freq")
51
		String sort_column
52
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
53
		int max_lines
54
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
55
		output_graph = false
56
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
57
		debug
58

  
59
if (!ParametersDialog.open(this)) return;
60
if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
61

  
62
for (def corpus : selection) {
63
	def analecCorpus = URSCorpora.getCorpus(corpus)
64

  
65
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
66
	if (errors.size() > 0) {
67
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
68
		return;
69
	}
70

  
71
	errors=AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
72
	if (errors.size() > 0) {
73
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
74
		return;
75
	}
76

  
77
	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);
78

  
79
	int nSchemas = 0;
80

  
81
	def scores = [:]
82
	def lensnames = [:]
83
	for (def schema : schemas) {
84

  
85
		def allUnites = schema.getUnitesSousjacentesNonTriees()
86
		
87
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)
88
		
89
		selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)
90
		
91
		int nUnites = selectedUnits.size();
92

  
93
		if (!scores.containsKey(schema)) {
94
			scores[schema] = 0;
95
		}
96

  
97
		// NOTE(review): `units.su` looks like an unfinished expression (possibly a truncated
		// method name) — TODO confirm the intended score formula before using this prototype.
		scores[schema] = units.su
98
		
99
		String value = schema.getProp(schema_property_display);
100
		nSchemas++;
101
	}
102

  
103
	//println "nSchemas=$nSchemas"
104
	// NOTE(review): `lens` is not declared in the visible part of this script (only `scores`
	// and `lensnames` are), so this lookup presumably fails at runtime unless the script
	// binding provides it — TODO confirm whether `scores` was meant here.
	def freqs = lens.keySet();
105
	freqs.sort();
106
	int t = 0;
107
	int n = 0;
108
	//println "Fréquences ("+freqs.size()+")"
109
	for (def f : freqs) {
110
		t += f * lens[f]
111
		n += lens[f]
112
	}
113

  
114
	if (n == 0) {
115
		println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
116
		return false;
117
	}
118
	
119
	coef = (t/n)
120
	
121
	def slens = null
122
	if ("freq".equals(sort_column)) {
123
		slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
124
	} else {
125
		slens = lens.sort { a, b -> -a.key <=> -b.key ?: -a.value <=> -b.value }
126
	}
127
	
128
	def flens = []
129
	slens.each { key, value -> value.times { flens << key } }
130
	def nbins = flens.size()*2
131

  
132
	println "Schema scores in '$corpus'. The average schema score is $t/$n = "+coef
133
	println "Index:\nlen\tfreq\tcfreq"
134
	int nShown = 0;
135
	for( def it : slens) {
136
		// Prints one tab-separated line: key, value, running cumulative value, joined names.
		// NOTE(review): `cfreq` is not declared in the visible script, and `lensnames` is created
		// empty and never filled above, so `lensnames[it.key]` is presumably null here — TODO confirm.
		println it.key+"	"+it.value+"	"+(cfreq+=it.value)+"	"+lensnames[it.key].join(", ")
137
		nShown++
138
		if (max_lines > 0 && nShown >= max_lines) {
139
			println "... (${slens.size() - max_lines})"
140
			break;
141
		}
142
	}
143

  
144
	
145
	//return ["result":coef, "data":lens]
146
}
147

  
148

  

Formats disponibles : Unified diff