root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / UnitsIndexMacro.groovy @ 2167
History | View | Annotate | Download (11.2 kB)
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
// @author mdecorde
// @author sheiden
// STANDARD DECLARATIONS
6 |
package org.txm.macro.urs.exploit
|
7 |
|
8 |
import groovy.transform.Field |
9 |
|
10 |
import org.apache.commons.lang.StringUtils |
11 |
import org.kohsuke.args4j.* |
12 |
import org.txm.Toolbox |
13 |
import org.txm.annotation.urs.* |
14 |
import org.txm.lexicaltable.core.functions.LexicalTable |
15 |
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl |
16 |
import org.txm.macro.urs.* |
17 |
import org.txm.rcp.commands.* |
18 |
import org.txm.rcp.swt.widget.parameters.* |
19 |
import org.txm.rcp.views.corpora.CorporaView |
20 |
import org.txm.searchengine.cqp.corpus.* |
21 |
import org.txm.searchengine.cqp.CQPSearchEngine |
22 |
import org.txm.statsengine.r.core.RWorkspace |
23 |
|
24 |
import visuAnalec.donnees.* |
25 |
import visuAnalec.elements.* |
26 |
import cern.colt.matrix.DoubleFactory2D |
27 |
import cern.colt.matrix.DoubleMatrix2D |
28 |
|
// Simple class name of this script, used as a prefix in console messages.
def scriptName = this.class.getSimpleName()

// Flatten the Corpus view selection: a CQPCorpus is kept as is, a Partition
// contributes each of its parts, any other kind of object is ignored.
def selection = []
for (def selected in corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection.add(selected)
	} else if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable was selected: tell the user and stop the macro.
if (selection.isEmpty()) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// Call compute(false) on every selected corpus before going further.
for (def selectedCorpus in selection) {
	selectedCorpus.compute(false)
}
43 |
|
// BEGINNING OF PARAMETERS
// The @Option fields below are the parameters of the macro. ParametersDialog.open(this)
// is called right after them and the macro stops when it returns false.

// URSQL selecting the schemas to consider
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="CHAINE")
String schema_ursql
// Size bounds passed to AnalecUtils.selectUnitsInSchema together with the schema URSQL
@Field @Option(name="minimum_schema_size", usage="Minimum size needed to consider a schema", widget="Integer", required=false, def="3")
int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size allowed to consider a schema", widget="Integer", required=false, def="9999999")
int maximum_schema_size
// URSQL selecting the units to count
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
String unit_ursql
// Unit property whose values are counted when no word property is given
@Field @Option(name="unit_property_display", usage="Unit property to count", widget="String", required=false, def="CATEGORIE")
String unit_property_display
// Word property whose values are counted instead of the unit property when it is not empty
@Field @Option(name="word_property_display", usage="Word property to display instead of the unit property", widget="String", required=false, def="")
String word_property_display
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
int position_in_schema
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
cql_limit
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=false, def="true")
strict_inclusion
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
position_in_matches
// The 2D/3D choice is not offered in the dialog: the 2D barplot is always used.
//@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true")
output_2D = true
// The usage text used to be a copy of the output_2D one; it now describes the legend option.
@Field @Option(name="output_showlegend", usage="show or not the legend of the histogram", widget="Boolean", required=false, def="true")
output_showlegend
// Rows whose highest frequency is lower than this value are not printed
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=false, def="0")
output_fmin
@Field @Option(name="output_histogram", usage="show or not a histogram of the result", widget="Boolean", required=false, def="true")
output_histogram
@Field @Option(name="output_lexicaltable", usage="create or not a lexical table with the result", widget="Boolean", required=false, def="false")
output_lexicaltable
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=false, def="OFF")
debug
if (!ParametersDialog.open(this)) return

// Turn the textual debug level into the integer tested by the rest of the macro.
// Any other value is left untouched.
switch (debug) {
	case "OFF":
		debug = 0
		break
	case "ON":
		debug = 1
		break
	case "ALL":
		debug = 2
		break
	case "REALLY ALL":
		debug = 3
		break
}
79 |
|
80 |
|
// Treat a missing (null) property name like an empty one, so that the length()
// calls made on both parameters here and further down the macro cannot fail on null.
if (word_property_display == null) word_property_display = ""
if (unit_property_display == null) unit_property_display = ""

// Both properties given: the word property is the one used, warn the user about it.
if (word_property_display.length() > 0 && unit_property_display.length() > 0) {
	println "Warning: both unit_property_display=$unit_property_display and word_property_display=$word_property_display are set. The index will be computed with the $word_property_display CQP property."
}

// No property at all: there is nothing to count.
if (word_property_display.length() == 0 && unit_property_display.length() == 0) {
	println "Error: no analysis property specified in unit_property_display or word_property_display. Aborting."
	return
}
89 |
|
// CQi client used below to read word property values at corpus positions.
def CQI = CQPSearchEngine.getCqiClient()

// For each selected corpus: map of displayed value -> number of selected units.
def allresults = [:]
for (def corpus : selection) {

	def analecCorpus = URSCorpora.getCorpus(corpus)

	// Stop the whole macro as soon as one corpus cannot answer the schema URSQL...
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
	if (errors.size() > 0) {
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	// ... or the unit URSQL...
	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
	if (errors.size() > 0) {
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	// ... or does not define the unit property to count.
	if (unit_property_display.length() > 0) {
		// Keep the result itself and not only its size, so that the message shows
		// the reported problems instead of a bare number.
		errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, AnalecUtils.getFilterParameters(unit_ursql)[0], unit_property_display)
		if (errors.size() > 0) {
			println "Error: some Unit types don't contain the $unit_property_display property: $errors"
			return
		}
	}

	def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size,
			unit_ursql, position_in_schema, cql_limit, strict_inclusion, position_in_matches);
	// A set, so that a unit returned several times is counted once.
	selectedUnits = new HashSet(selectedUnits)

	def counts = null
	if (word_property_display != null && word_property_display.length() > 0) { // use the CQP word property
		Property p = corpus.getProperty(word_property_display)
		if (p == null) {
			println "Error: no $word_property_display word property found in the $corpus corpus. Aborting."
			return
		}
		counts = [:]
		for (def unit : selectedUnits) {
			// Corpus positions covered by the unit, from getDeb() to getFin() inclusive.
			int[] pos = (unit.getDeb()..unit.getFin()) as int[]

			// The counted value is the space-separated word property values of the unit.
			String s = StringUtils.join(CQI.cpos2Str(p.getQualifiedName(), pos), " ")
			counts[s] = (counts[s] ?: 0) + 1
		}
	} else { // use Analec unit property
		counts = selectedUnits.countBy {
			def value = it.getProp(unit_property_display)
			value == null ? "<null>" : value
		}
	}

	// Empty values are reported under an explicit "<empty>" label; the count is added
	// to the one of a genuine "<empty>" value if there is one, instead of replacing it.
	if (counts.containsKey("")) {
		counts["<empty>"] = (counts["<empty>"] ?: 0) + counts.remove("")
	}

	if (debug) {
		println "\n*** Statistics: "
		for (def k : counts.keySet()) {
			if (k.size() > 0) { println "$k\t"+counts[k] } else { println "N/A\t"+counts[k] }
		}
	}

	allresults[corpus] = counts
}
if (debug) println "RESULTS=$allresults"

// Union of the values found in all the corpora: one row of the table per value.
def keys = new HashSet();
for (def corpusCounts in allresults.values()) {
	keys.addAll(corpusCounts.keySet())
}
if (keys.size() == 0) {
	// Name the property that was really indexed: the word property takes precedence when it is set.
	println "No values for ${word_property_display.length() > 0 ? word_property_display : unit_property_display} property"
	return
}
keys = keys.sort()

// Dense matrix with one row per value and one column per selected corpus.
mFactory = DoubleFactory2D.dense
matrix = mFactory.make(keys.size(), selection.size())

if (debug) println "KEYS=$keys"
161 |
|
// Corpus behind the Corpus view selection: the corpus of the partition, or the selection itself.
def corpus = (corpusViewSelection instanceof Partition) ? corpusViewSelection.getCorpus() : corpusViewSelection

// Title of the plot: selection, unit URSQL, position, counted property and, when set, the CQL limit.
def title = "${corpus.getMainCorpus()}.${corpusViewSelection}\n${unit_ursql}[${position_in_matches}]."
if (word_property_display.length() > 0) {
	title += "${word_property_display} frequencies"
} else if (unit_property_display.length() > 0) {
	title += "${unit_property_display} frequencies"
}
if (cql_limit != null) {
	if (!cql_limit.getQueryString().equals('""')) {
		title += "\n(${cql_limit} limits)"
	}
}

// One-line description of the index, printed before the table.
if (word_property_display.length() > 0) {
	println "Index de la propriété $word_property_display des mots des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
} else {
	println "Index de la propriété $unit_property_display des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
}
// Header row: the indexed property (the word property takes precedence when it is set),
// then one column per selected corpus.
println "${word_property_display.length() > 0 ? word_property_display : unit_property_display}\t"+selection.join("\t")

// One row per value: fill the matrix and print the row when it is frequent enough.
keys.eachWithIndex { prop_val, i ->
	String line = (prop_val.size() > 0) ? prop_val.toString() : "N/A"

	// Highest frequency of the value over all the corpora.
	int maxFreq = 0;
	allresults.keySet().eachWithIndex { c, j ->
		def freq = allresults[c][prop_val]
		if (freq == null) freq = 0 // value absent from this corpus

		matrix.set(i, j, freq)
		line += "\t"+freq
		if (maxFreq < freq) maxFreq = freq
	}

	// Rows are hidden from the console only; the matrix keeps every value.
	if (maxFreq >= output_fmin) {
		println "$line"
	}
}
props = keys
197 |
|
// Send the table to R as the "textpropfreqs" matrix, with the values as row names
// and the selected corpora as column names.
def r = RWorkspace.getRWorkspaceInstance()
def colnames = selection as String[]
r.addVectorToWorkspace("texts", colnames)
def rownames = props as String[]
r.addVectorToWorkspace("props", rownames)
r.addMatrixToWorkspace("textpropfreqs", matrix as DoubleMatrix2D)
r.eval("rownames(textpropfreqs) = props")
r.eval("colnames(textpropfreqs) = texts")

// File receiving the plot. File.createTempFile throws an IOException when the target
// directory is missing, so make sure the "results" directory exists first.
def resultsDir = new File(Toolbox.getTxmHomePath(), "results")
resultsDir.mkdirs()
def file = File.createTempFile("txm", ".svg", resultsDir)
209 |
|
210 |
|
//if (corpusViewSelection.getClass() == org.txm.searchengine.cqp.corpus.Partition)  { corpus = corpusViewSelection.getParent() } else { corpus = corpusViewSelection }

// Draw the "textpropfreqs" matrix into the SVG file with R.
if (output_histogram) {
	def plotScript
	String legend = output_showlegend ? "rownames(textpropfreqs)" : "NULL"

	if (output_2D) {
		// The title ends up inside a double-quoted R string literal. It can contain double
		// quotes or backslashes (a CQL limit usually does), so escape them to keep the R
		// script valid.
		String rTitle = title.toString().replace("\\", "\\\\").replace("\"", "\\\"")
		plotScript = """library(RColorBrewer);
barplot(height=textpropfreqs, main="${rTitle}", xlab="Parties", legend=$legend, beside=TRUE, col=rainbow(length(rownames(textpropfreqs))))"""
	} else {
		// NOTE(review): the "..." argument below looks like an unfinished placeholder;
		// this branch is not reachable while output_2D is set to true.
		plotScript = """
library(latticeExtra)
library(lattice)
library(RColorBrewer)
cloud(value~col+row, ..., panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
"""
	}

	// execute R script
	r.plot(file, plotScript)

	//println "s1: "+r.eval("s1").asString()
}
237 |
|
// Lexical table built from the matrix; stays null when it is not requested.
def lt = null;
if (output_lexicaltable) {
	// The table is attached to the first selected corpus or, when that one is
	// a part, to the object returned by its getParent().
	def parent = selection[0]
	println "parent="+parent.getClass()
	if (parent instanceof Part) {
		println "part="+parent
		parent = parent.getParent()
		println "partition="+parent
	}

	lt = new LexicalTable(parent);
	lt.setData(new LexicalTableImpl(matrix as DoubleMatrix2D, rownames, colnames))
	lt.setUserName(schema_ursql+">"+unit_ursql)
}
255 |
|
// The histogram can be shown only if it was requested and the SVG file is not empty.
// File.length() is the size of the file; File.getTotalSpace() is the size of the
// partition holding it, which says nothing about the file content.
final boolean histogramReady = output_histogram && file.exists() && file.length() > 0
if (histogramReady) {
	println "SVG file: "+file.getAbsolutePath()
} else if (output_histogram) {
	println "Error: cannot show histogram result with $file"
}

// Open the results through monitor.syncExec. The lexical table does not depend on the
// SVG file, so it is shown even when no histogram is available.
if (histogramReady || output_lexicaltable) {
	monitor.syncExec(new Runnable() {
		@Override
		public void run() { try {
			if (histogramReady) {
				//OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), selection.toString()+" Units")
				OpenBrowser.openfile(file.getAbsolutePath())
			}
			if (UnitsIndexMacro.this.output_lexicaltable) {
				CorporaView.refreshObject(corpus)
				CorporaView.expand(lt)
			}
		} catch (e) { println "** ERROR: "+e }}
	})
}
275 |
|
276 |
|
/*
R example of a 3D bar plot drawn with latticeExtra:

d <- read.table(text=' x y z
t1 5 high
t1 2 low
t1 4 med
t2 8 high
t2 1 low
t2 3 med
t3 50 high
t3 12 med
t3 35 low', header=TRUE)
library(latticeExtra)
cloud(y~x+z, d, panel.3d.cloud=panel.3dbars, col.facet='grey',
xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1),
par.settings = list(axis.line = list(col = "transparent")))
*/