Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / UnitsIndexMacro.groovy @ 2167

History | View | Annotate | Download (11.2 kB)

1 671 mdecorde
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2 671 mdecorde
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3 671 mdecorde
// @author mdecorde
4 671 mdecorde
// @author sheiden
5 671 mdecorde
// STANDARD DECLARATIONS
6 1217 mdecorde
package org.txm.macro.urs.exploit
7 671 mdecorde
8 671 mdecorde
import groovy.transform.Field
9 671 mdecorde
10 671 mdecorde
import org.apache.commons.lang.StringUtils
11 671 mdecorde
import org.kohsuke.args4j.*
12 671 mdecorde
import org.txm.Toolbox
13 1217 mdecorde
import org.txm.annotation.urs.*
14 2105 mdecorde
import org.txm.lexicaltable.core.functions.LexicalTable
15 671 mdecorde
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl
16 1979 mdecorde
import org.txm.macro.urs.*
17 671 mdecorde
import org.txm.rcp.commands.*
18 671 mdecorde
import org.txm.rcp.swt.widget.parameters.*
19 671 mdecorde
import org.txm.rcp.views.corpora.CorporaView
20 671 mdecorde
import org.txm.searchengine.cqp.corpus.*
21 1979 mdecorde
import org.txm.searchengine.cqp.CQPSearchEngine
22 671 mdecorde
import org.txm.statsengine.r.core.RWorkspace
23 671 mdecorde
24 671 mdecorde
import visuAnalec.donnees.*
25 671 mdecorde
import visuAnalec.elements.*
26 671 mdecorde
import cern.colt.matrix.DoubleFactory2D
27 671 mdecorde
import cern.colt.matrix.DoubleMatrix2D
28 671 mdecorde
29 671 mdecorde
// Simple class name of this script, used as a prefix in user messages.
def scriptName = this.class.getSimpleName()

// Flatten the Corpus view selection: corpora are kept as they are,
// partitions are replaced by their parts, anything else is ignored.
def selection = []
for (selected in corpusViewSelections) {
        switch (selected) {
                case CQPCorpus:
                        selection.add(selected)
                        break
                case Partition:
                        selection.addAll(selected.getParts())
                        break
        }
}

// Nothing usable selected: tell the user and stop the macro.
if (selection.isEmpty()) {
        println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
        return false
}

// Call compute(false) on every selected corpus before querying it.
selection.each { it.compute(false) }
43 671 mdecorde
44 671 mdecorde
// BEGINNING OF PARAMETERS
45 2105 mdecorde
// Macro parameters. Each declaration is a script field annotated with its dialog
// description (name, help text, widget and default value); the values are expected
// to be set by the ParametersDialog.open(this) call that follows this section.

// URSQL query (TYPE@PROP=VALUE) selecting the schemas
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="CHAINE")
                String schema_ursql
// Size bounds of the schemas to consider
@Field @Option(name="minimum_schema_size", usage="Minimum size needed to consider a schema", widget="Integer", required=false, def="3")
                int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=false, def="9999999")
                int maximum_schema_size
// URSQL query (TYPE@PROP=VALUE) selecting the units
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
                String unit_ursql
// Unit property whose values are counted
@Field @Option(name="unit_property_display", usage="Unit property to count", widget="String", required=false, def="CATEGORIE")
                String unit_property_display
// Word property counted instead of the unit property when it is set
@Field @Option(name="word_property_display", usage="Word property to display instead of the unit property", widget="String", required=false, def="")
                String word_property_display
// Position of the unit in its schema (0 = no selection)
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
                int position_in_schema
// CQL query building the structure limits
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
                cql_limit
// Whether the units must be strictly included in the corpus matches
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=false, def="true")
                strict_inclusion
// Position of the unit relative to the structure limit (0 = no selection)
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
                position_in_matches
// The 2D/3D switch is not offered in the dialog: the 2D barplot is always used.
//@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true")
                output_2D = true
// Whether the histogram shows a legend (the usage text previously duplicated the output_2D one)
@Field @Option(name="output_showlegend", usage="show or not the legend of the histogram", widget="Boolean", required=false, def="true")
                output_showlegend
// Minimal frequency a value must reach to be printed in the console table
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=false, def="0")
                output_fmin
// Whether a histogram of the result is produced
@Field @Option(name="output_histogram", usage="show or not a histogram of the result", widget="Boolean", required=false, def="true")
                output_histogram
// Whether a lexical table is created from the result
@Field @Option(name="output_lexicaltable", usage="create or not a lexical table with the result", widget="Boolean", required=false, def="false")
                output_lexicaltable
// Verbosity of the debug output.
// NOTE(review): the whitespace separating the metaVar values may originally be tab characters - confirm against the repository file.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=false, def="OFF")
                debug
77 671 mdecorde
// Stop here when the parameters dialog does not report success.
if (!ParametersDialog.open(this)) {
        return
}

// Replace the debug label by its numeric level; any other value is left untouched.
def debugLevels = ["OFF": 0, "ON": 1, "ALL": 2, "REALLY ALL": 3]
if (debugLevels.containsKey(debug)) {
        debug = debugLevels[debug]
}
79 671 mdecorde
80 671 mdecorde
81 2099 mdecorde
// Normalize unset (null) property names to "" first: the tests below and the
// later .length() calls of this script are then safe from NullPointerException.
if (word_property_display == null) word_property_display = ""
if (unit_property_display == null) unit_property_display = ""

// Both properties set: warn that the word property takes precedence.
if (word_property_display.length() > 0 && unit_property_display.length() > 0) {
        println "Warning: both unit_property_display=$unit_property_display and word_property_display=$word_property_display are set. The index will be computed with the $word_property_display CQP property."
}

// No property at all: nothing can be counted, stop the macro.
if (word_property_display.length() == 0 && unit_property_display.length() == 0) {
        println "Error: no analysis property specified in unit_property_display or word_property_display. Aborting."
        return
}
89 671 mdecorde
90 788 mdecorde
// CQi client used below to read word property values at corpus positions
def CQI = CQPSearchEngine.getCqiClient()

// For each selected corpus: map of displayed value -> number of selected units
def allresults = [:]
for (def corpus : selection) {

        def analecCorpus = URSCorpora.getCorpus(corpus)

        // A non-empty result of isPropertyDefined is reported and stops the macro.
        def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
        if (errors.size() > 0) {
                println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
                return;
        }

        errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
        if (errors.size() > 0) {
                println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
                return;
        }

        if (unit_property_display.length() > 0) {
                // Keep the result itself (not its size) so that the message lists the
                // offending types, like the two checks above.
                errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, AnalecUtils.getFilterParameters(unit_ursql)[0], unit_property_display)
                if (errors.size() > 0) {
                        println "Error: some Unit types don't contain the $unit_property_display property: $errors"
                        return
                }
        }

        def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size,
                        unit_ursql, position_in_schema, cql_limit, strict_inclusion, position_in_matches);
        // Drop duplicated units before counting
        selectedUnits = new HashSet(selectedUnits)

        def counts = null
        if (word_property_display != null && word_property_display.length() > 0) {
                // Count the units by the word property values of the positions they cover
                Property p = corpus.getProperty(word_property_display)
                if (p == null) {
                        // Avoid a NullPointerException on an unknown property name
                        println "Error: the $corpus corpus has no '$word_property_display' word property. Aborting."
                        return
                }
                counts = [:]
                for (def unit : selectedUnits) {
                        // The range is the single position [deb] when deb == fin
                        int[] pos = (unit.getDeb()..unit.getFin()) as int[]

                        String s = StringUtils.join(CQI.cpos2Str(p.getQualifiedName(), pos), " ")
                        counts[s] = (counts[s] ?: 0) + 1
                }
        } else { // use Analec unit property
                counts = selectedUnits.countBy { unit ->
                        def value = unit.getProp(unit_property_display)
                        value == null ? "<null>" : value
                }
        }

        // Empty values are shown under an explicit "<empty>" label
        if (counts.containsKey("")) counts["<empty>"] = counts[""];
        counts.remove("")

        if (debug) {
                println "\n*** Statistics: "
                for (def k : counts.keySet()) {
                        if (k.size() > 0) { println "$k\t"+counts[k] } else { println "N/A\t"+counts[k] }
                }
        }

        allresults[corpus] = counts
}
149 671 mdecorde
if (debug) {
        println "RESULTS=$allresults"
}

// Union of the values counted in all the selected corpora
def keys = new HashSet()
allresults.values().each { corpusCounts -> keys.addAll(corpusCounts.keySet()) }
if (keys.isEmpty()) {
        println "No values for $unit_property_display property"
        return
}
keys = keys.sort()

// Frequency matrix: one row per value, one column per selected corpus
mFactory = DoubleFactory2D.dense
matrix = mFactory.make(keys.size(), selection.size())

if (debug) {
        println "KEYS=$keys"
}
161 671 mdecorde
162 671 mdecorde
// Corpus of the Corpus view selection: the partition's corpus when a Partition is selected
def corpus = (corpusViewSelection instanceof Partition) ? corpusViewSelection.getCorpus() : corpusViewSelection

// The word property, when set, takes precedence over the unit property
boolean useWordProperty = word_property_display.length() > 0

// Title of the histogram
def title = "${corpus.getMainCorpus()}.${corpusViewSelection}\n${unit_ursql}"
title += "[${position_in_matches}]."
if (useWordProperty) {
        title += "${word_property_display} frequencies"
} else if (unit_property_display.length() > 0) {
        title += "${unit_property_display} frequencies"
}
if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) {
        title += "\n(${cql_limit} limits)"
}

// Console header of the index, followed by the column names
if (useWordProperty) {
        println "Index de la propriété $word_property_display des mots des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
} else {
        println "Index de la propriété $unit_property_display des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
}
println "$unit_property_display\t"+selection.join("\t")
175 671 mdecorde
176 671 mdecorde
// Fill the frequency matrix (row = value, column = corpus) and print one
// tab-separated line per value whose highest frequency reaches output_fmin.
for (int row = 0; row < keys.size(); row++) {
        def value = keys[row]
        String line = (value.size() > 0) ? "" + value : "N/A"

        int rowMax = 0 // highest frequency of the value over the corpora
        int col = 0
        for (def corpusCounts : allresults.values()) {
                def freq = corpusCounts[value]
                if (freq == null) {
                        freq = 0
                }

                matrix.set(row, col, freq)
                line += "\t"+freq
                if (rowMax < freq) {
                        rowMax = freq
                }
                col++
        }
        if (rowMax >= output_fmin) {
                println "$line"
        }
}
props = keys
197 671 mdecorde
198 671 mdecorde
// Send the frequency matrix and its row/column names to the R workspace
def r = RWorkspace.getRWorkspaceInstance()
def colnames = selection as String[]
r.addVectorToWorkspace("texts", colnames)
def rownames = props as String[]
r.addVectorToWorkspace("props", rownames)
r.addMatrixToWorkspace("textpropfreqs", matrix as DoubleMatrix2D)
r.eval("rownames(textpropfreqs) = props")
r.eval("colnames(textpropfreqs) = texts")

// SVG file receiving the histogram. File.createTempFile throws an IOException
// when its directory does not exist, so make sure it is there first.
def resultsDir = new File(Toolbox.getTxmHomePath(), "results")
resultsDir.mkdirs()
def file = File.createTempFile("txm", ".svg", resultsDir)
209 671 mdecorde
210 671 mdecorde
211 671 mdecorde
//if (corpusViewSelection.getClass() == org.txm.searchengine.cqp.corpus.Partition) { corpus = corpusViewSelection.getParent() } else { corpus = corpusViewSelection }
212 671 mdecorde
// Build the R plotting script and render it into the SVG file
if (output_histogram) {
        def plotScript
        String legend = output_showlegend ? "rownames(textpropfreqs)" : "NULL"

        // The title is inserted in a double-quoted R string literal and may contain
        // the CQL limit query: escape backslashes and double quotes so that it
        // cannot break the R script.
        String rTitle = title.toString().replace("\\", "\\\\").replace("\"", "\\\"")

        if (output_2D) {
                plotScript = """library(RColorBrewer);
barplot(height=textpropfreqs, main="${rTitle}", xlab="Parties", legend=$legend, beside=TRUE, col=rainbow(length(rownames(textpropfreqs))))"""
        } else {
                // Not reachable while output_2D is forced to true in the parameters section
                plotScript = """
                   library(latticeExtra)
                   library(lattice)
                   library(RColorBrewer)
                   cloud(value~col+row, ..., panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
                   """
        }

        // execute R script
        r.plot(file, plotScript)
}
237 671 mdecorde
238 671 mdecorde
// Optional lexical table built from the frequency matrix
def lt = null;
if (output_lexicaltable) {
        def parent = selection[0]
        println "parent="+parent.getClass()

        // A part is replaced by its parent, which then owns the table
        if (parent instanceof Part) {
                println "part="+parent
                parent = parent.getParent()
                println "partition="+parent
        }

        lt = new LexicalTable(parent);
        lt.setData(new LexicalTableImpl(matrix as DoubleMatrix2D, rownames, colnames))
        lt.setUserName(schema_ursql+">"+unit_ursql)
}
255 671 mdecorde
256 671 mdecorde
// The histogram can be shown only if it was requested and the SVG file was written.
// file.length() is the size of the file itself; File.getTotalSpace() is the size
// of the whole partition and is therefore positive for any existing file.
final boolean histogramReady = output_histogram && file.exists() && file.length() > 0
if (histogramReady) {
        println "SVG file: "+file.getAbsolutePath()
} else if (output_histogram) {
        println "Error: cannot show histogram result with $file"
}

// Update the user interface through monitor.syncExec; the lexical table is
// refreshed even when no histogram is available.
if (histogramReady || output_lexicaltable) {
        monitor.syncExec(new Runnable() {
                @Override
                public void run() {
                        try {
                                if (histogramReady) {
                                        OpenBrowser.openfile(file.getAbsolutePath())
                                }
                                if (UnitsIndexMacro.this.output_lexicaltable) {
                                        CorporaView.refreshObject(corpus)
                                        CorporaView.expand(lt)
                                }
                        } catch (e) { println "** ERROR: "+e }
                }
        })
}
275 671 mdecorde
276 671 mdecorde
277 671 mdecorde
/*
278 671 mdecorde
 d <- read.table(text=' x   y     z
279 671 mdecorde
 t1   5   high
280 671 mdecorde
 t1   2   low
281 671 mdecorde
 t1   4   med
282 671 mdecorde
 t2   8   high
283 671 mdecorde
 t2   1   low
284 671 mdecorde
 t2   3   med
285 671 mdecorde
 t3  50   high
286 671 mdecorde
 t3  12   med
287 671 mdecorde
 t3  35   low', header=TRUE)
288 671 mdecorde
 library(latticeExtra)
289 671 mdecorde
 cloud(y~x+z, d, panel.3d.cloud=panel.3dbars, col.facet='grey',
290 671 mdecorde
 xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1),
291 671 mdecorde
 par.settings = list(axis.line = list(col = "transparent")))
292 671 mdecorde
 */