Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / analec / exploit / UnitsIndexMacro.groovy @ 1094

History | View | Annotate | Download (10.6 kB)

1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5
// STANDARD DECLARATIONS
6
package org.txm.macro.analec.exploit
7

    
8
import groovy.transform.Field
9

    
10
import org.apache.commons.lang.StringUtils
11
import org.kohsuke.args4j.*
12
import org.txm.Toolbox
13
import org.txm.analec.*
14
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl
15
import org.txm.macro.analec.*
16
import org.txm.rcp.commands.*
17
import org.txm.rcp.swt.widget.parameters.*
18
import org.txm.rcp.views.corpora.CorporaView
19
import org.txm.searchengine.cqp.corpus.*
20
import org.txm.statsengine.r.core.RWorkspace
21

    
22
import visuAnalec.donnees.*
23
import visuAnalec.elements.*
24
import cern.colt.matrix.DoubleFactory2D
25
import cern.colt.matrix.DoubleMatrix2D
26

    
27
// Simple class name of this macro, used to prefix console messages.
def scriptName = this.class.getSimpleName()

// Flatten the Corpus view selection into a list of corpora:
// a corpus is kept as is, a partition contributes each of its parts.
// Any other kind of selected object is ignored.
def selection = []
for (selected in corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection.add(selected)
	} else if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable was selected: tell the user and stop the macro.
if (selection.isEmpty()) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}
39

    
40
// BEGINNING OF PARAMETERS
// Each @Field below is a macro parameter; the @Option annotation carries the label,
// help text, widget type and default value passed to ParametersDialog.open(this) below.

// Schema selection (URSQL pattern TYPE@PROP=VALUE) and schema size bounds.
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=true, def="CHAINE")
		String schema_ursql
@Field @Option(name="minimum_schema_size", usage="Minimum size needed to consider a schema", widget="Integer", required=true, def="3")
		int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size allowed to consider a schema", widget="Integer", required=true, def="9999999")
		int maximum_schema_size

// Unit selection (URSQL pattern) and the property whose values are counted.
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
		String unit_ursql
@Field @Option(name="unit_property_display", usage="Unit property to count", widget="String", required=true, def="CATEGORIE")
		String unit_property_display
@Field @Option(name="cqp_property_display", usage="Word property to display instead of the unit property", widget="String", required=false, def="")
		String cqp_property_display

// Positional filters, forwarded unchanged to AnalecUtils.selectUnitsInSchema.
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
		int limit_distance_in_schema
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div")
		limit_cql
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
		strict_inclusion
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
		limit_distance

// Output options.
@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true")
		output_2D
@Field @Option(name="output_showlegend", usage="show or not the legend of the barplot", widget="Boolean", required=true, def="true")
		output_showlegend
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=true, def="0")
		output_fmin
@Field @Option(name="output_histogram", usage="show or not a histogram of the result", widget="Boolean", required=true, def="true")
		output_histogram
@Field @Option(name="output_lexicaltable", usage="create or not a lexical table with the result", widget="Boolean", required=true, def="false")
		output_lexicaltable
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=true, def="OFF")
		debug

// Stop here when the parameters dialog does not return true.
if (!ParametersDialog.open(this)) return

// Turn the debug label into a numeric verbosity level (0..3).
// Any other value is deliberately left untouched.
if (debug == "OFF") {
	debug = 0
} else if (debug == "ON") {
	debug = 1
} else if (debug == "ALL") {
	debug = 2
} else if (debug == "REALLY ALL") {
	debug = 3
}
75

    
76

    
77
// Treat an unset (null) display property as an empty one, so that the checks below
// and the later .length() calls on these parameters cannot fail on null.
if (cqp_property_display == null) cqp_property_display = ""
if (unit_property_display == null) unit_property_display = ""

// Both properties set: the CQP word property takes precedence, warn the user.
if (cqp_property_display.length() > 0 && unit_property_display.length() > 0) {
	println "Warning: both unit_property_display=$unit_property_display and cqp_property_display=$cqp_property_display are set. The index will be computed with the $cqp_property_display CQP property."
}

// No property at all: there is nothing to index.
if (cqp_property_display.length() == 0 && unit_property_display.length() == 0) {
	println "Error: no analysis property specified in unit_property_display or cqp_property_display. Aborting."
	return
}
85

    
86
// CQi client used to read word property values at corpus positions.
def CQI = CQPSearchEngine.getCqiClient()

// One frequency table (value -> count) per selected corpus, filled in selection order.
def allresults = [:]
for (def corpus : selection) {

	mainCorpus = corpus.getMainCorpus()
	def analecCorpus = AnalecCorpora.getCorpus(mainCorpus.getName())

	// When a unit property is given, AnalecUtils.isPropertyDefined must report no error for it.
	if (unit_property_display) {
		def errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql, unit_property_display).size()
		if (errors > 0) {
			println "Error: some Unit types don't contain the $unit_property_display property: $errors"
			return
		}
	}

	// Select the units matching the schema/unit/limit parameters, without duplicates.
	def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size,
			unit_ursql, limit_distance_in_schema, limit_cql, strict_inclusion, limit_distance);
	selectedUnits = new HashSet(selectedUnits)

	def counts = null
	if (cqp_property_display) { // count the word property values covered by each unit
		Property p = corpus.getProperty(cqp_property_display)
		if (p == null) {
			// Without this guard the macro would fail below with a NullPointerException.
			println "Error: no '$cqp_property_display' word property found in $corpus. Aborting."
			return
		}
		String qualifiedName = p.getQualifiedName() // same for every unit, resolved once
		counts = [:]
		for (def unit : selectedUnits) {
			// All positions from the unit start to the unit end, both included.
			int[] pos = (unit.getDeb()..unit.getFin()) as int[]

			// The counted value is the space-separated sequence of property values.
			String s = StringUtils.join(CQI.cpos2Str(qualifiedName, pos), " ")
			counts[s] = (counts[s] ?: 0) + 1
		}
	} else { // use Analec unit property
		counts = selectedUnits.countBy { unit ->
			def value = unit.getProp(unit_property_display)
			value == null ? "<null>" : value
		}
	}

	// Report the empty value under a visible label, adding to an existing "<empty>" entry if any.
	if (counts.containsKey("")) {
		counts["<empty>"] = (counts["<empty>"] ?: 0) + counts.remove("")
	}

	if (debug) {
		println "\n*** Statistics: "
		for (def k : counts.keySet()) {
			if (k.size() > 0) {
				println "$k\t"+counts[k]
			} else {
				println "N/A\t"+counts[k]
			}
		}
	}

	allresults[corpus] = counts
}
135
if (debug) {
	println "RESULTS=$allresults"
}

// Union of the values observed in all the per-corpus frequency tables.
def keys = new HashSet()
allresults.values().each { table ->
	keys.addAll(table.keySet())
}

if (keys.isEmpty()) {
	println "No values for $unit_property_display property"
	return
}

// Sorted values give the matrix rows; each selected corpus gives one column.
keys = keys.sort()
mFactory = DoubleFactory2D.dense
matrix = mFactory.make(keys.size(), selection.size())

if (debug) {
	println "KEYS=$keys"
}
147

    
148
// Corpus the result is attached to: the partition's corpus, or the selected corpus itself.
def corpus = (corpusViewSelection instanceof Partition) ? corpusViewSelection.getCorpus() : corpusViewSelection

// The CQP word property, when set, takes precedence over the unit property.
boolean useCqpProperty = cqp_property_display != null && cqp_property_display.length() > 0
def indexedProperty = useCqpProperty ? cqp_property_display : unit_property_display
def scope = "${corpus.getMainCorpus()}.${corpusViewSelection}"

// Chart title: scope, unit pattern with limit distance, indexed property, optional limits.
def title = "${scope}\n${unit_ursql}"
title += "[${limit_distance}]."
if (indexedProperty) title += "${indexedProperty} frequencies"
if (limit_cql != null && !limit_cql.getQueryString().equals("\"\"")) title += "\n(${limit_cql} limits)"

// Console header of the index.
if (useCqpProperty) {
	println "Index de la propriété ${cqp_property_display} des mots des unités ${unit_ursql}[${limit_distance}] de ${scope}"
} else {
	println "Index de la propriété ${unit_property_display} des unités ${unit_ursql}[${limit_distance}] de ${scope}"
}

// Column header: name of the property actually indexed, then one column per selected corpus.
println "$indexedProperty\t"+selection.join("\t")
161

    
162
// One row per value: fill the frequency matrix and print the row to the console.
for (int i = 0; i < keys.size(); i++) {
	def propVal = keys[i]
	String line = (propVal.size() > 0) ? String.valueOf(propVal) : "N/A"

	// Highest frequency of this value over all the corpora.
	int highest = 0
	int j = 0
	for (def table : allresults.values()) {
		def freq = table[propVal]
		if (freq == null) {
			freq = 0 // value absent from this corpus
		}

		matrix.set(i, j, freq)
		line += "\t"+freq

		if (highest < freq) {
			highest = freq
		}
		j++
	}

	// The row is printed only when at least one corpus reaches output_fmin;
	// the matrix keeps every row regardless.
	if (highest >= output_fmin) {
		println "$line"
	}
}
183

    
184
props = keys

// Send the frequency matrix to R with its row (values) and column (corpora) names.
def r = RWorkspace.getRWorkspaceInstance()
def colnames = selection as String[]
r.addVectorToWorkspace("texts", colnames)
def rownames = props as String[]
r.addVectorToWorkspace("props", rownames)
r.addMatrixToWorkspace("textpropfreqs", matrix as DoubleMatrix2D)
r.eval("rownames(textpropfreqs) = props")
r.eval("colnames(textpropfreqs) = texts")

// SVG file receiving the plot; make sure its directory exists before creating it.
def resultsDir = new File(Toolbox.getTxmHomePath(), "results")
resultsDir.mkdirs()
def file = File.createTempFile("txm", ".svg", resultsDir)

if (output_histogram) {
	def plotScript
	String legend = output_showlegend ? "rownames(textpropfreqs)" : "NULL"

	if (output_2D) {
		// The title is embedded in an R string literal: escape backslashes and double
		// quotes (frequent in CQL limits) so that they cannot break the R script.
		String rTitle = title.toString().replace("\\", "\\\\").replace("\"", "\\\"")
		plotScript = """library(RColorBrewer);
barplot(height=textpropfreqs, main="${rTitle}", xlab="Parties", legend=$legend, beside=TRUE, col=rainbow(length(rownames(textpropfreqs))))"""
	} else {
		// NOTE(review): this script plots an R object named 'm' that this macro never
		// creates (only 'texts', 'props' and 'textpropfreqs' are sent to R) - confirm
		// where 'm' is supposed to come from.
		plotScript = """
                   library(latticeExtra)
                   library(lattice)
                   library(RColorBrewer)
                   cloud(value~col+row, m, panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
                   """
	}

	// execute R script
	r.plot(file, plotScript)
}
225

    
226
// Lexical table built from the frequency matrix; stays null when not requested.
def lt = null
if (output_lexicaltable) {
	def frequencies = matrix as DoubleMatrix2D

	if (corpusViewSelection instanceof Partition) {
		// Partition selected: the table is built with the partition and stored on it.
		def partition = corpusViewSelection
		def partitionCorpus = partition.getCorpus()
		lt = new LexicalTableImpl(frequencies, partition, partitionCorpus.getProperty("word"),
				rownames, colnames)
		lt.setCorpus(partitionCorpus)
		partition.storeResult(lt)
	} else {
		// Corpus selected: the table is built without a partition and stored on the corpus.
		def wordProperty = corpus.getProperty("word")
		lt = new LexicalTableImpl(frequencies, wordProperty,
				rownames, colnames)
		lt.setCorpus(corpus)
		corpus.storeResult(lt)
	}
}
241

    
242

    
243
// The graphic is usable only when a histogram was requested and the SVG file is not empty.
// File.length() is the size of the file itself; File.getTotalSpace() is the size of the
// whole partition holding it and therefore says nothing about the plot.
boolean svgReady = output_histogram && file.exists() && file.length() > 0
if (svgReady) {
	println "SVG file: "+file.getAbsolutePath()
} else if (output_histogram) {
	println "Error: cannot show histogram result with $file"
}

// Open the results in the user interface; the lexical table is shown even without a graphic.
if (svgReady || output_lexicaltable) {
	monitor.syncExec(new Runnable() {
				@Override
				public void run() {
					try {
						if (svgReady) {
							OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), selection.toString()+" Units")
						}
						if (UnitsIndexMacro.this.output_lexicaltable) {
							CorporaView.refreshObject(corpus)
							CorporaView.expand(lt)
						}
					} catch (e) {
						// A display problem is reported without failing the macro.
						println "** ERROR: "+e
					}
				}
			})
}
261

    
262

    
263
/*
264
 d <- read.table(text=' x   y     z
265
 t1   5   high
266
 t1   2   low
267
 t1   4   med
268
 t2   8   high
269
 t2   1   low
270
 t2   3   med
271
 t3  50   high
272
 t3  12   med
273
 t3  35   low', header=TRUE)
274
 library(latticeExtra)
275
 cloud(y~x+z, d, panel.3d.cloud=panel.3dbars, col.facet='grey',
276
 xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1),
277
 par.settings = list(axis.line = list(col = "transparent")))
278
 */