root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / UnitsIndexMacro.groovy @ 2167
History | View | Annotate | Download (11.2 kB)
1 | 671 | mdecorde | // Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
|
---|---|---|---|
2 | 671 | mdecorde | // Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
|
3 | 671 | mdecorde | // @author mdecorde
|
4 | 671 | mdecorde | // @author sheiden
|
5 | 671 | mdecorde | // STANDARD DECLARATIONS
|
6 | 1217 | mdecorde | package org.txm.macro.urs.exploit
|
7 | 671 | mdecorde | |
8 | 671 | mdecorde | import groovy.transform.Field |
9 | 671 | mdecorde | |
10 | 671 | mdecorde | import org.apache.commons.lang.StringUtils |
11 | 671 | mdecorde | import org.kohsuke.args4j.* |
12 | 671 | mdecorde | import org.txm.Toolbox |
13 | 1217 | mdecorde | import org.txm.annotation.urs.* |
14 | 2105 | mdecorde | import org.txm.lexicaltable.core.functions.LexicalTable |
15 | 671 | mdecorde | import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl |
16 | 1979 | mdecorde | import org.txm.macro.urs.* |
17 | 671 | mdecorde | import org.txm.rcp.commands.* |
18 | 671 | mdecorde | import org.txm.rcp.swt.widget.parameters.* |
19 | 671 | mdecorde | import org.txm.rcp.views.corpora.CorporaView |
20 | 671 | mdecorde | import org.txm.searchengine.cqp.corpus.* |
21 | 1979 | mdecorde | import org.txm.searchengine.cqp.CQPSearchEngine |
22 | 671 | mdecorde | import org.txm.statsengine.r.core.RWorkspace |
23 | 671 | mdecorde | |
24 | 671 | mdecorde | import visuAnalec.donnees.* |
25 | 671 | mdecorde | import visuAnalec.elements.* |
26 | 671 | mdecorde | import cern.colt.matrix.DoubleFactory2D |
27 | 671 | mdecorde | import cern.colt.matrix.DoubleMatrix2D |
28 | 671 | mdecorde | |
// Simple class name of this script, used to prefix console messages
def scriptName = this.class.getSimpleName()

// Build the list of corpora to process from the Corpus view selection:
// a corpus is kept as is, a partition contributes all of its parts,
// any other kind of selected object is ignored.
def selection = []
for (def selected : corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection << selected
	} else if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable selected: tell the user and stop the macro
if (selection.isEmpty()) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// Call compute(false) on every retained corpus before using it
selection.each { it.compute(false) }
43 | 671 | mdecorde | |
// BEGINNING OF PARAMETERS
// Each @Field/@Option pair declares one macro parameter; the values are filled in
// by ParametersDialog.open(this) below.

// URSQL selector (TYPE@PROP=VALUE) of the schemas to consider
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="CHAINE")
String schema_ursql

@Field @Option(name="minimum_schema_size", usage="Minimum size needed to consider a schema", widget="Integer", required=false, def="3")
int minimum_schema_size

@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=false, def="9999999")
int maximum_schema_size

// URSQL selector (TYPE@PROP=VALUE) of the units to consider
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
String unit_ursql

@Field @Option(name="unit_property_display", usage="Unit property to count", widget="String", required=false, def="CATEGORIE")
String unit_property_display

@Field @Option(name="word_property_display", usage="Word property to display instead of the unit property", widget="String", required=false, def="")
String word_property_display

@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
int position_in_schema

@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
cql_limit

@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=false, def="true")
strict_inclusion

@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
position_in_matches

// The 2D/3D switch is not exposed in the dialog: the 2D barplot is always used
//@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true")
output_2D = true

// The usage text used to be a copy of the output_2D one; it now describes the legend switch
@Field @Option(name="output_showlegend", usage="show or not the legend of the histogram", widget="Boolean", required=false, def="true")
output_showlegend

@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=false, def="0")
output_fmin

@Field @Option(name="output_histogram", usage="show or not a histogram of the result", widget="Boolean", required=false, def="true")
output_histogram

@Field @Option(name="output_lexicaltable", usage="create or not a lexical table with the result", widget="Boolean", required=false, def="false")
output_lexicaltable

@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=false, def="OFF")
debug

// Open the parameters dialog; stop the macro if it does not return true
if (!ParametersDialog.open(this)) return

// Turn the textual debug level into a number (0 = off ... 3 = most verbose).
// Any other value is left untouched, as before.
switch (debug) {
	case "OFF": debug = 0; break
	case "ON": debug = 1; break
	case "ALL": debug = 2; break
	case "REALLY ALL": debug = 3; break
}
79 | 671 | mdecorde | |
80 | 671 | mdecorde | |
// Treat an unset (null) display property as an empty one, so that the length
// checks below and further down the script cannot fail with a NullPointerException.
word_property_display = word_property_display ?: ""
unit_property_display = unit_property_display ?: ""

// Both properties given: the word property wins, warn the user about it
if (word_property_display.length() > 0 && unit_property_display.length() > 0) {
	println "Warning: both unit_property_display=$unit_property_display and word_property_display=$word_property_display are set. The index will be computed with the $word_property_display CQP property."
}

// No property given at all: nothing can be counted
if (word_property_display.length() == 0 && unit_property_display.length() == 0) {
	println "Error: no analysis property specified in unit_property_display or word_property_display. Aborting."
	return
}
89 | 671 | mdecorde | |
def CQI = CQPSearchEngine.getCqiClient()

// One frequency table per selected corpus: corpus -> (displayed value -> number of units)
def allresults = [:]
for (def corpus : selection) {

	def analecCorpus = URSCorpora.getCorpus(corpus)

	// Check the schema and unit URSQL selectors against this corpus; stop the macro on failure
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
	if (errors.size() > 0) {
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
	if (errors.size() > 0) {
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	if (unit_property_display) {
		// Keep the returned collection itself (not only its size) so that the message
		// lists the offending unit types instead of a bare count
		def missing = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, AnalecUtils.getFilterParameters(unit_ursql)[0], unit_property_display)
		if (missing.size() > 0) {
			println "Error: some Unit types don't contain the $unit_property_display property: $missing"
			return
		}
	}

	def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size,
			unit_ursql, position_in_schema, cql_limit, strict_inclusion, position_in_matches);
	// Copy into a set so that a unit returned several times is counted once
	selectedUnits = new HashSet(selectedUnits)

	def counts = null
	if (word_property_display != null && word_property_display.length() > 0) {
		// Count the units by the word property values of the positions they cover
		Property p = corpus.getProperty(word_property_display)
		counts = [:]
		for (def unit : selectedUnits) {
			// all corpus positions from the unit start to the unit end (a single one when both are equal)
			int[] pos = (unit.getDeb()..unit.getFin())

			String s = StringUtils.join(CQI.cpos2Str(p.getQualifiedName(), pos), " ")
			counts[s] = (counts[s] ?: 0) + 1
		}
	} else { // use Analec unit property
		counts = selectedUnits.countBy { if (it.getProp(unit_property_display) == null) "<null>" else it.getProp(unit_property_display) }
	}

	// Make the empty value visible in the outputs
	if (counts.containsKey("")) counts["<empty>"] = counts[""];
	counts.remove("")

	if (debug) {
		println "\n*** Statistics: "
		for (def k : counts.keySet()) {
			if (k.size() > 0) { println "$k\t"+counts[k] } else { println "N/A\t"+counts[k] }
		}
	}

	allresults[corpus] = counts
}
if (debug) println "RESULTS=$allresults"

// Property actually indexed: the word property takes precedence over the unit property.
// It labels the console outputs, which used to always name the unit property.
def displayedProperty = word_property_display ? word_property_display : unit_property_display

// Union of the values found in all the corpora = rows of the table
def keys = new HashSet();
for (def c : allresults.keySet()) keys.addAll(allresults[c].keySet())
if (keys.size() == 0) {
	println "No values for $displayedProperty property"
	return
}
keys = keys.sort()

// values x corpora frequency matrix, shared with the rest of the script
mFactory = DoubleFactory2D.dense
matrix = mFactory.make(keys.size(), selection.size())

if (debug) println "KEYS=$keys"

// Corpus used in the titles: the selected corpus itself, or the corpus of the selected partition
def corpus = null
if (corpusViewSelection instanceof Partition) { corpus = corpusViewSelection.getCorpus() }
else { corpus = corpusViewSelection }

def title = "${corpus.getMainCorpus()}.${corpusViewSelection}\n${unit_ursql}"
title += "[${position_in_matches}]."
if (word_property_display) title += "${word_property_display} frequencies"
else if (unit_property_display) title += "${unit_property_display} frequencies"
if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) title += "\n(${cql_limit} limits)"

if (word_property_display) println "Index de la propriété $word_property_display des mots des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
else println "Index de la propriété $unit_property_display des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"

// Header of the console table: indexed property, then one column per corpus
println "$displayedProperty\t"+selection.join("\t")

keys.eachWithIndex { prop_val, i ->
	String line = ""
	if (prop_val.size() > 0) {
		line += prop_val
	} else {
		line += "N/A"
	}

	// highest frequency of this value over all the corpora
	int maxFreq = 0;
	allresults.keySet().eachWithIndex { c, j ->
		def freq = allresults[c][prop_val]
		if (freq == null) freq = 0

		matrix.set(i, j, freq)
		line += "\t"+freq
		if (maxFreq < freq) maxFreq = freq
	}

	// the row is printed only if at least one corpus reaches output_fmin;
	// the matrix keeps every row whatever the threshold
	if (maxFreq >= output_fmin)
		println "$line"
}
props = keys
197 | 671 | mdecorde | |
// Send the frequency matrix and its row/column names to the R workspace
def r = RWorkspace.getRWorkspaceInstance()
def colnames = selection as String[]
r.addVectorToWorkspace("texts", colnames)
def rownames = props as String[]
r.addVectorToWorkspace("props", rownames)
r.addMatrixToWorkspace("textpropfreqs", matrix as DoubleMatrix2D)
r.eval("rownames(textpropfreqs) = props")
r.eval("colnames(textpropfreqs) = texts")

// SVG output file, created in the "results" directory of the TXM home
def resultsDir = new File(Toolbox.getTxmHomePath(), "results")
resultsDir.mkdirs() // createTempFile fails if the directory is missing
def file = File.createTempFile("txm", ".svg", resultsDir)

if (output_histogram) {
	def plotScript
	String legend = "rownames(textpropfreqs)"
	if (!output_showlegend) legend = "NULL"

	// The title goes into a double-quoted R string literal: escape backslashes and
	// double quotes (a CQL limit shown in the title would typically contain quotes),
	// otherwise the generated R script is not valid.
	def rTitle = title.toString().replace("\\", "\\\\").replace("\"", "\\\"")

	if (output_2D) {
		plotScript = """library(RColorBrewer);
barplot(height=textpropfreqs, main="${rTitle}", xlab="Parties", legend=$legend, beside=TRUE, col=rainbow(length(rownames(textpropfreqs))))"""
	} else {
		// NOTE(review): output_2D is set to true at the top of the script, so this
		// 3D variant looks unreachable for now
		plotScript = """
library(latticeExtra)
library(lattice)
library(RColorBrewer)
cloud(value~col+row, ..., panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
"""
	}

	// execute R script
	r.plot(file, plotScript)
}
237 | 671 | mdecorde | |
// Optional lexical table built from the frequency matrix; stays null when not requested
def lt = null;
if (output_lexicaltable) {
	// The table is attached to the first selected corpus or,
	// when that corpus is a part, to the parent of that part
	def parent = selection[0]
	println "parent="+parent.getClass()
	if (parent instanceof Part) {
		println "part="+parent
		parent = parent.getParent()
		println "partition="+parent
	}

	lt = new LexicalTable(parent);
	lt.setData(new LexicalTableImpl(matrix as DoubleMatrix2D, rownames, colnames))
	lt.setUserName(schema_ursql+">"+unit_ursql)
}
255 | 671 | mdecorde | |
// The histogram can be shown only if it was requested and the SVG file is not empty.
// File.length() is the size of the file itself; the former File.getTotalSpace() test
// measured the whole partition and was therefore always true.
def histogramWritten = output_histogram && file.exists() && file.length() > 0

if (histogramWritten) {
	println "SVG file: "+file.getAbsolutePath()
} else if (output_histogram) {
	println "Error: cannot show histogram result with $file"
}

// Something to show: do it through monitor.syncExec, as the views are updated here
if (histogramWritten || output_lexicaltable) {
	monitor.syncExec(new Runnable() {
		@Override
		public void run() {
			try {
				if (histogramWritten) {
					//OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), selection.toString()+" Units")
					OpenBrowser.openfile(file.getAbsolutePath())
				}
				if (UnitsIndexMacro.this.output_lexicaltable) {
					CorporaView.refreshObject(corpus)
					CorporaView.expand(lt)
				}
			} catch (e) { println "** ERROR: "+e }
		}
	})
}
275 | 671 | mdecorde | |
276 | 671 | mdecorde | |
277 | 671 | mdecorde | /*
|
278 | 671 | mdecorde | d <- read.table(text=' x y z
|
279 | 671 | mdecorde | t1 5 high
|
280 | 671 | mdecorde | t1 2 low
|
281 | 671 | mdecorde | t1 4 med
|
282 | 671 | mdecorde | t2 8 high
|
283 | 671 | mdecorde | t2 1 low
|
284 | 671 | mdecorde | t2 3 med
|
285 | 671 | mdecorde | t3 50 high
|
286 | 671 | mdecorde | t3 12 med
|
287 | 671 | mdecorde | t3 35 low', header=TRUE)
|
288 | 671 | mdecorde | library(latticeExtra)
|
289 | 671 | mdecorde | cloud(y~x+z, d, panel.3d.cloud=panel.3dbars, col.facet='grey',
|
290 | 671 | mdecorde | xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1),
|
291 | 671 | mdecorde | par.settings = list(axis.line = list(col = "transparent")))
|
292 | 671 | mdecorde | */ |