Révision 2147
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemasRatioMacro.groovy (revision 2147) | ||
---|---|---|
1 |
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté |
|
2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses) |
|
3 |
// @author mdecorde |
|
4 |
// @author sheiden |
|
5 |
|
|
6 |
// STANDARD DECLARATIONS |
|
7 |
package org.txm.macro.urs.exploit |
|
8 |
|
|
9 |
import org.kohsuke.args4j.* |
|
10 |
import groovy.transform.Field |
|
11 |
import org.txm.rcp.swt.widget.parameters.* |
|
12 |
import org.txm.annotation.urs.* |
|
13 |
import org.txm.macro.urs.AnalecUtils |
|
14 |
import visuAnalec.elements.* |
|
15 |
import org.txm.searchengine.cqp.corpus.* |
|
16 |
import org.txm.Toolbox |
|
17 |
import org.txm.rcp.commands.* |
|
18 |
import org.txm.statsengine.r.core.RWorkspace |
|
19 |
|
|
20 |
// Build the list of corpora to process from the Corpus view selection:
// a CQPCorpus is used directly, a Partition contributes each of its parts.
def selection = []
for (def selected : corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection.add(selected)
	} else if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable selected: tell the user and stop the macro.
if (selection.size() == 0) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// Otherwise call compute(false) on every selected corpus before using it.
for (def selectedCorpus : selection) {
	selectedCorpus.compute(false)
}
|
32 |
|
|
33 |
// MACRO PARAMETERS
// Each @Field/@Option pair declares a script field exposed as a macro option;
// the values are set through ParametersDialog.open(this) further down.

// -- Schema selection --
@Field @Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
String schema_ursql
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
int maximum_schema_size
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
String schema_property_display

// -- Units counted in the numerator of the ratio --
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
String unit_ursql
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
boolean strict_inclusion
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
int position_in_matches

// -- Units counted in the denominator (the "margin") of the ratio --
@Field @Option(name="units_margin_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
String units_margin_ursql
@Field @Option(name="margin_strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
boolean margin_strict_inclusion
@Field @Option(name="margin_position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
int margin_position_in_matches

// -- Display options --
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq len", required=false, def="")
def sep
// The offered choices are "score" and "name" (see metaVar); the usage text now says so
// instead of the "len or freq" wording copied from another macro.
@Field @Option(name="sort_column", usage="choose between the score or name columns", widget="StringArray", metaVar="score name", required=true, def="score")
String sort_column
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
int max_lines
// Chart output is disabled: the option is commented out and the flag is forced to false.
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
output_graph = false
// Holds the debug label chosen in the dialog; it is converted to a numeric level after the dialog closes.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
debug
|
66 |
|
|
67 |
// Open the parameters dialog; stop the macro when it does not return true.
if (!ParametersDialog.open(this)) {
	return
}

// Convert the debug label chosen in the dialog into its numeric level.
// Any other value is left untouched.
switch (debug) {
	case "OFF":
		debug = 0
		break
	case "ON":
		debug = 1
		break
	case "ALL":
		debug = 2
		break
	case "REALLY ALL":
		debug = 3
		break
}
|
69 |
|
|
70 |
// For every selected corpus: select the schemas matching schema_ursql, count for each
// schema the units matching unit_ursql (F) and those matching units_margin_ursql
// (F margin), then print the F / F margin ratio per schema as a percentage.
for (def corpus : selection) {
	def analecCorpus = URSCorpora.getCorpus(corpus)

	// Check that the schema query can be evaluated against the corpus annotation structure.
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
	if (errors.size() > 0) {
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	// Same check for the unit query.
	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
	if (errors.size() > 0) {
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);

	def scores = [:]     // schema -> F / F margin (0 when F margin is 0)
	def fs = [:]         // schema -> number of selected units
	def fs_margin = [:]  // schema -> number of selected margin units
	for (def schema : schemas) {

		def allUnites = schema.getUnitesSousjacentesNonTriees()

		// Declared with 'def' so the per-schema results stay local instead of
		// leaking into the script binding.
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)

		def units_margin = AnalecUtils.filterElements(debug, allUnites, units_margin_ursql)
		def selectedUnits_margin = AnalecUtils.filterUniteByInclusion(debug, units_margin, corpus.getMatches(), margin_strict_inclusion, margin_position_in_matches)

		fs[schema] = selectedUnits.size();
		fs_margin[schema] = selectedUnits_margin.size();

		// The score defaults to 0 and is only a ratio when the margin is not empty
		// (avoids a division by zero).
		scores[schema] = 0;
		if (selectedUnits_margin.size() > 0) {
			scores[schema] = (float)selectedUnits.size() / (float)selectedUnits_margin.size()
		}
	}

	// Value of the displayed property for a schema; may be null when the schema lacks it.
	def label = { s -> s.getProps()[schema_property_display] }

	// '<=>' is null-safe, unlike compareTo(), so schemas without the displayed
	// property no longer make the sort fail.
	if ("score".equals(sort_column)) {
		// decreasing score, then decreasing F, then increasing label
		scores = scores.sort { a, b -> (b.value <=> a.value) ?: (fs[b.key] <=> fs[a.key]) ?: (label(a.key) <=> label(b.key)) }
	} else {
		// increasing label, then decreasing score, then decreasing F
		scores = scores.sort { a, b -> (label(a.key) <=> label(b.key)) ?: (b.value <=> a.value) ?: (fs[b.key] <=> fs[a.key]) }
	}

	println "Schema scores in '$corpus'."
	println "${schema_property_display}\tF\tF margin\tscore"
	int nShown = 0;
	for (def entry : scores) {
		println ""+label(entry.key)+"\t"+fs[entry.key]+"\t"+fs_margin[entry.key]+"\t"+String.format( "%.2f", entry.value * 100.0d )+"%"
		nShown++
		// max_lines == 0 means "no cut"; otherwise stop and report how many lines were hidden.
		if (max_lines > 0 && nShown >= max_lines) {
			println "... (${scores.size() - max_lines})"
			break;
		}
	}
}
|
135 |
|
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemaLengthsMacro.groovy (revision 2147) | ||
---|---|---|
45 | 45 |
boolean strict_inclusion |
46 | 46 |
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0") |
47 | 47 |
int position_in_matches |
48 |
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq len", required=true, def="")
|
|
48 |
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq len", required=false, def="")
|
|
49 | 49 |
def sep |
50 | 50 |
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq len", required=true, def="freq") |
51 | 51 |
String sort_column |
tmp/org.txm.analec.rcp/src/org/txm/macroprototypes/urs/exploit/SchemaAccessibilityScoresMacro.groovy (revision 2147) | ||
---|---|---|
1 |
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté |
|
2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses) |
|
3 |
// @author mdecorde |
|
4 |
// @author sheiden |
|
5 |
|
|
6 |
// STANDARD DECLARATIONS |
|
7 |
package org.txm.macroprototypes.urs.exploit |
|
8 |
|
|
9 |
import org.kohsuke.args4j.* |
|
10 |
import groovy.transform.Field |
|
11 |
import org.txm.rcp.swt.widget.parameters.* |
|
12 |
import org.txm.annotation.urs.* |
|
13 |
import org.txm.macro.urs.AnalecUtils |
|
14 |
import visuAnalec.elements.* |
|
15 |
import org.txm.searchengine.cqp.corpus.* |
|
16 |
import org.txm.Toolbox |
|
17 |
import org.txm.rcp.commands.* |
|
18 |
import org.txm.statsengine.r.core.RWorkspace |
|
19 |
|
|
20 |
// Build the list of corpora to process from the Corpus view selection:
// a CQPCorpus is used directly, a Partition contributes each of its parts.
def selection = []
for (def selected : corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection.add(selected)
	} else if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable selected: tell the user and stop the macro.
if (selection.size() == 0) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// Otherwise call compute(false) on every selected corpus before using it.
for (def selectedCorpus : selection) {
	selectedCorpus.compute(false)
}
|
32 |
|
|
33 |
// MACRO PARAMETERS
// Each @Field/@Option pair declares a script field exposed as a macro option;
// the values are set through ParametersDialog.open(this) further down.

// -- Schema selection --
@Field @Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
String schema_ursql

@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
int maximum_schema_size
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
String schema_property_display

// -- Unit selection --
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
String unit_ursql
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
boolean strict_inclusion
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
int position_in_matches

// -- Display options --
// The separator carries no value (its default is the empty string), so it is declared
// optional, consistently with the same option in the other schema macros.
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq len", required=false, def="")
def sep
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq len", required=true, def="freq")
String sort_column
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
int max_lines
// Chart output is disabled: the option is commented out and the flag is forced to false.
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
output_graph = false
// Holds the debug label chosen in the dialog; it is converted to a numeric level after the dialog closes.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
debug
|
58 |
|
|
59 |
// Open the parameters dialog; stop the macro when it does not return true.
if (!ParametersDialog.open(this)) {
	return
}

// Convert the debug label chosen in the dialog into its numeric level.
// Any other value is left untouched.
switch (debug) {
	case "OFF":
		debug = 0
		break
	case "ON":
		debug = 1
		break
	case "ALL":
		debug = 2
		break
	case "REALLY ALL":
		debug = 3
		break
}
|
61 |
|
|
62 |
// For every selected corpus: select the schemas matching schema_ursql, count the selected
// units of each schema, and print an index of unit counts ("len") with the number of
// schemas having that count ("freq"), the cumulated frequency and the schema labels.
//
// NOTE(review): this prototype originally assigned a truncated expression
// ('units.su') to a per-schema 'scores' map that nothing read, while the reporting code
// relied on 'lens', 'lensnames' and 'cfreq', which were never defined or filled.
// The bookkeeping the report needs is restored below; the intended accessibility
// score itself could not be recovered from the source - TODO define it.
for (def corpus : selection) {
	def analecCorpus = URSCorpora.getCorpus(corpus)

	// Check that the schema query can be evaluated against the corpus annotation structure.
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
	if (errors.size() > 0) {
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	// Same check for the unit query.
	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
	if (errors.size() > 0) {
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);

	def lens = [:]       // number of selected units -> number of schemas with that many units
	def lensnames = [:]  // number of selected units -> displayed property value of those schemas
	for (def schema : schemas) {

		def allUnites = schema.getUnitesSousjacentesNonTriees()
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)

		int nUnites = selectedUnits.size();

		if (!lens.containsKey(nUnites)) {
			lens[nUnites] = 0;
			lensnames[nUnites] = []
		}
		lens[nUnites]++

		String value = schema.getProp(schema_property_display);
		lensnames[nUnites] << value
	}

	// t = total number of selected units, n = number of schemas.
	int t = 0;
	int n = 0;
	for (def entry : lens) {
		t += entry.key * entry.value
		n += entry.value
	}

	if (n == 0) {
		println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
		return false;
	}

	def coef = (t/n)

	// Sort the index by decreasing frequency or by decreasing length,
	// using the other column to break ties.
	def slens = null
	if ("freq".equals(sort_column)) {
		slens = lens.sort { a, b -> (b.value <=> a.value) ?: (b.key <=> a.key) }
	} else {
		slens = lens.sort { a, b -> (b.key <=> a.key) ?: (b.value <=> a.value) }
	}

	println "Schema scores in '$corpus'. The average schema score is $t/$n = "+coef
	println "Index:\nlen\tfreq\tcfreq"
	int nShown = 0;
	int cfreq = 0; // cumulated frequency over the lines printed so far
	for (def entry : slens) {
		cfreq += entry.value
		println entry.key+" "+entry.value+" "+cfreq+" "+lensnames[entry.key].join(", ")
		nShown++
		// max_lines == 0 means "no cut"; otherwise stop and report how many lines were hidden.
		if (max_lines > 0 && nShown >= max_lines) {
			println "... (${slens.size() - max_lines})"
			break;
		}
	}
}
|
147 |
|
|
148 |
|
Formats disponibles : Unified diff