root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / SchemaLengthsMacro.groovy @ 2167
History | View | Annotate | Download (6.8 kB)
1 |
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
|
---|---|
2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
|
3 |
// @author mdecorde
|
4 |
// @author sheiden
|
5 |
|
6 |
// STANDARD DECLARATIONS
|
7 |
package org.txm.macro.urs.exploit
|
8 |
|
9 |
import org.kohsuke.args4j.* |
10 |
import groovy.transform.Field |
11 |
import org.txm.rcp.swt.widget.parameters.* |
12 |
import org.txm.annotation.urs.* |
13 |
import org.txm.macro.urs.AnalecUtils |
14 |
import visuAnalec.elements.* |
15 |
import org.txm.searchengine.cqp.corpus.* |
16 |
import org.txm.Toolbox |
17 |
import org.txm.rcp.commands.* |
18 |
import org.txm.statsengine.r.core.RWorkspace |
19 |
|
20 |
// Build the list of corpora to process from the Corpus view selection:
// a corpus is taken as is, a partition contributes each of its parts,
// any other kind of selected object is ignored.
def selection = []
for (def selected : corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection.add(selected)
	} else if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable selected: tell the user and stop the macro
if (selection.isEmpty()) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// Call compute(false) on every selected corpus before it is used below
selection.each { it.compute(false) }
32 |
|
33 |
// MACRO PARAMETERS
// Each @Option field below is a user parameter; the values are requested
// through ParametersDialog.open(this) further down in this script.

// URSQL expression selecting the schemas
@Field @Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE") String schema_ursql

// Lower and upper bounds on schema size, passed to AnalecUtils.selectSchemasInCorpus
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3") int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999") int maximum_schema_size

// Schema property whose value is listed next to each length in the printed index
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF") String schema_property_display

// URSQL expression selecting the units counted in each schema
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION") String unit_ursql

// Inclusion and position filters, passed to the AnalecUtils selection helpers
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") boolean strict_inclusion
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0") int position_in_matches

// Separator widget; the value is not read anywhere in this script
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq len", required=false, def="") def sep

// Column used to order the printed index ("freq" or "len")
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq len", required=true, def="freq") String sort_column

// Maximum number of index lines printed; 0 disables the limit
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0") int max_lines

// Chart output is disabled: the option is commented out and the flag forced to false
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
output_graph = false

// Debug level label; converted to a number after the dialog is closed.
// NOTE(review): the labels in metaVar are separated by whitespace here although
// "REALLY ALL" itself contains a space - confirm the separator the dialog expects.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") def debug
58 |
|
59 |
// Open the parameters dialog; stop the macro when it does not report success
if (!ParametersDialog.open(this)) return;

// Replace the debug label chosen by the user with its numeric level.
// Any other value is left untouched.
def debugLevels = ["OFF": 0, "ON": 1, "ALL": 2, "REALLY ALL": 3]
if (debugLevels.containsKey(debug)) {
	debug = debugLevels[debug]
}
61 |
|
62 |
// For each selected corpus: select the schemas matching schema_ursql, count the
// units of each schema that pass the unit_ursql/inclusion filters, then print the
// average schema length and an index of lengths (len, freq, cumulative freq, names).
// NOTE(review): every early 'return' below stops the whole macro, so the corpora
// that follow in the selection are not processed - confirm this is intended.
for (def corpus : selection) {
	def analecCorpus = URSCorpora.getCorpus(corpus)

	// Stop when the schema URSQL is reported as not computable on this corpus
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
	if (errors.size() > 0) {
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	// Same check for the unit URSQL
	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
	if (errors.size() > 0) {
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	//println([debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion])
	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);

	def lens = [:]      // schema length (number of selected units) -> number of schemas
	def lensnames = [:] // schema length -> display property values of those schemas
	for (def schema : schemas) {
		def allUnites = schema.getUnitesSousjacentesNonTriees()
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)

		// left undeclared as in the original: the variable lives in the script binding
		selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)

		int nUnites = selectedUnits.size();
		if (!lens.containsKey(nUnites)) {
			lens[nUnites] = 0;
			lensnames[nUnites] = [];
		}

		lens[nUnites] = lens[nUnites] + 1;
		String value = schema.getProp(schema_property_display);
		//if (value.contains(",")) value = "\"$value\""
		lensnames[nUnites] << value
	}

	// t = total number of selected units, n = number of schemas counted
	int t = 0;
	int n = 0;
	for (def lenEntry : lens) {
		t += lenEntry.key * lenEntry.value
		n += lenEntry.value
	}

	if (n == 0) {
		println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
		return false;
	}

	// average schema length; left undeclared as in the original (script binding variable)
	coef = (t/n)

	// Order the index by decreasing frequency or by decreasing length,
	// using the other column to break ties
	def slens = null
	if ("freq".equals(sort_column)) {
		slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
	} else {
		slens = lens.sort { a, b -> -a.key <=> -b.key ?: -a.value <=> -b.value }
	}

	def cfreq = 0
	println "Schema lengths in '$corpus'. The average schema length is $t/$n = "+coef
	println "Index:\nlen\tfreq\tcfreq"
	// NOTE(review): the header above is tab separated while the rows below are
	// joined with single spaces - confirm which separator is intended.
	int nShown = 0;
	for (def entry : slens) {
		def names = lensnames[entry.key]
		names = names.sort()
		println entry.key+" "+entry.value+" "+(cfreq+=entry.value)+" \""+names.join("\", \"")+"\""
		nShown++
		if (max_lines > 0 && nShown >= max_lines) {
			// only announce a cut when some lines are actually left out
			int remaining = slens.size() - nShown
			if (remaining > 0) {
				println "... ($remaining)"
			}
			break;
		}
	}

	if (output_graph) {
		// The chart data is only needed here, so it is built inside this branch.
		// flens repeats each length once per schema of that length (histogram input).
		def flens = []
		slens.each { key, value -> value.times { flens << key } }
		def nbins = flens.size()*2
		def slens2 = slens.sort { a, b -> -a.key <=> -b.key }

		def r = RWorkspace.getRWorkspaceInstance()

		r.addVectorToWorkspace("len", slens2.keySet() as int[])
		r.addVectorToWorkspace("freq", slens2.values() as int[])
		r.addVectorToWorkspace("flen", flens as int[])

		def corpusName = corpus.getName()
		println "corpusName=$corpusName"
		def PNGFile = File.createTempFile("txm_"+corpus.getID()+"_", ".png", new File(Toolbox.getTxmHomePath(), "results"))
		def PNGFilePath = PNGFile.getAbsolutePath()
		println "PNG file: "+PNGFilePath

		def SVGFile = File.createTempFile("txm_"+corpus.getID()+"_", ".svg", new File(Toolbox.getTxmHomePath(), "results"))
		def SVGFilePath = SVGFile.getAbsolutePath()
		println "SVG file: "+SVGFilePath

		/// BEGINNING OF R SCRIPT
		def script ="""
hist(flen, xaxt='n', col="gray", xlab="Length", breaks=$nbins, main="$corpusName Longueur des chaînes ($nbins bins)")
axis(side=1, at=len)
dev.off()
"""
		/// END OF R SCRIPT

		// execute R script
		r.plot(PNGFile, "png(file = \"${PNGFilePath}\"); "+script)
		r.plot(SVGFile, "svg(file = \"${SVGFilePath}\"); "+script)

		//display the SVG results graphic
		monitor.syncExec(new Runnable() {
			@Override
			public void run() { OpenBrowser.openfile(SVGFile.getAbsolutePath(), corpusName+" Longueur des chaînes") }
		})
	}
	//return ["result":coef, "data":lens]
}
188 |
|
189 |
|