Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / UnitsIndexMacro.groovy @ 2105

History | View | Annotate | Download (11.3 kB)

1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5
// STANDARD DECLARATIONS
6
package org.txm.macro.urs.exploit
7

    
8
import groovy.transform.Field
9

    
10
import org.apache.commons.lang.StringUtils
11
import org.kohsuke.args4j.*
12
import org.txm.Toolbox
13
import org.txm.annotation.urs.*
14
import org.txm.lexicaltable.core.functions.LexicalTable
15
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl
16
import org.txm.macro.urs.*
17
import org.txm.rcp.commands.*
18
import org.txm.rcp.swt.widget.parameters.*
19
import org.txm.rcp.views.corpora.CorporaView
20
import org.txm.searchengine.cqp.corpus.*
21
import org.txm.searchengine.cqp.CQPSearchEngine
22
import org.txm.statsengine.r.core.RWorkspace
23

    
24
import visuAnalec.donnees.*
25
import visuAnalec.elements.*
26
import cern.colt.matrix.DoubleFactory2D
27
import cern.colt.matrix.DoubleMatrix2D
28

    
29
// Simple class name of this script, used as a prefix in user messages
def scriptName = this.class.getSimpleName()

// Flatten the Corpus view selection into a list of corpora:
// a selected corpus is kept as is, a selected partition contributes its parts.
def selection = []
for (def selected : corpusViewSelections) {
        if (selected instanceof CQPCorpus) {
                selection.add(selected)
        } else if (selected instanceof Partition) {
                selection.addAll(selected.getParts())
        }
}

// Nothing usable was selected: tell the user and stop the macro
if (selection.isEmpty()) {
        println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
        return false
}

// compute(false) is called on every selected corpus before it is used below
selection.each { it.compute(false) }
43

    
44
// BEGINNING OF PARAMETERS
// Each @Field/@Option pair declares one macro parameter. ParametersDialog.open(this)
// is called right after the declarations; the macro stops when it returns false.

// -- schema selection
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="CHAINE")
String schema_ursql
@Field @Option(name="minimum_schema_size", usage="Minimum size needed to consider a schema", widget="Integer", required=false, def="3")
int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=false, def="9999999")
int maximum_schema_size

// -- unit selection and displayed property
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
String unit_ursql
@Field @Option(name="unit_property_display", usage="Unit property to count", widget="String", required=false, def="CATEGORIE")
String unit_property_display
@Field @Option(name="word_property_display", usage="Word property to display instead of the unit property", widget="String", required=false, def="")
String word_property_display
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
int position_in_schema

// -- structure limits
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
cql_limit
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=false, def="true")
strict_inclusion
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
position_in_matches

// -- outputs
// The output_2D option is disabled: the value is fixed to true (barplot).
//@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true")
output_2D = true
@Field @Option(name="output_showlegend", usage="output barplot or 3D plot", widget="Boolean", required=false, def="true")
output_showlegend
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=false, def="0")
output_fmin
@Field @Option(name="output_histogram", usage="show or not a histogram of the result", widget="Boolean", required=false, def="true")
output_histogram
@Field @Option(name="output_lexicaltable", usage="create or not a lexical table with the result", widget="Boolean", required=false, def="false")
output_lexicaltable
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=false, def="OFF")
debug

if (!ParametersDialog.open(this)) return

// Turn the debug label into a numeric level; any other value is left untouched
switch (debug) {
        case "OFF":
                debug = 0
                break
        case "ON":
                debug = 1
                break
        case "ALL":
                debug = 2
                break
        case "REALLY ALL":
                debug = 3
                break
}
79

    
80

    
81
// Validate the two display parameters.
// A null value is normalized to the empty string first: the checks below and the rest of
// the script call length() on both parameters, which would fail on null.
if (word_property_display == null) word_property_display = ""
if (unit_property_display == null) unit_property_display = ""

// Both set: only warn, the word property is the one used for the index
if (word_property_display.length() > 0 && unit_property_display.length() > 0) {
        println "Warning: both unit_property_display=$unit_property_display and word_property_display=$word_property_display are set. The index will be computed with the $word_property_display CQP property."
}

// None set: nothing to count, stop the macro
if (word_property_display.length() == 0 && unit_property_display.length() == 0) {
        println "Error: no analysis property specified in unit_property_display or word_property_display. Aborting."
        return
}
89

    
90
// CQi client used to read word property values at corpus positions
def CQI = CQPSearchEngine.getCqiClient()

// For each selected corpus: value -> number of selected units showing that value
def allresults = [:]
for (def corpus : selection) {

        mainCorpus = corpus.getMainCorpus()
        def analecCorpus = URSCorpora.getCorpus(mainCorpus)

        // Check the schema URSQL; a non-empty result is reported and stops the macro
        def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
        if (errors.size() > 0) {
                println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
                return
        }

        // Same check for the unit URSQL
        errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
        if (errors.size() > 0) {
                println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
                return
        }

        // Same check for the displayed unit property.
        // The result itself is kept (not only its size) so that the message shows its content.
        if (unit_property_display.length() > 0) {
                errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, AnalecUtils.getFilterParameters(unit_ursql)[0], unit_property_display)
                if (errors.size() > 0) {
                        println "Error: some Unit types don't contain the $unit_property_display property: $errors"
                        return
                }
        }

        // Select the units, then drop duplicates
        def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size,
                        unit_ursql, position_in_schema, cql_limit, strict_inclusion, position_in_matches)
        selectedUnits = new HashSet(selectedUnits)

        def counts = null
        if (word_property_display != null && word_property_display.length() > 0) {
                // Count the units by the word property values found from getDeb() to getFin(), joined with spaces
                Property p = corpus.getProperty(word_property_display)
                counts = [:]
                for (def unit : selectedUnits) {
                        // the range holds a single position when getDeb() == getFin()
                        int[] pos = (unit.getDeb()..unit.getFin())
                        String s = StringUtils.join(CQI.cpos2Str(p.getQualifiedName(), pos), " ")
                        counts[s] = (counts[s] ?: 0) + 1
                }
        } else { // use Analec unit property
                counts = selectedUnits.countBy {
                        def value = it.getProp(unit_property_display)
                        value == null ? "<null>" : value
                }
        }

        // The empty value is shown under the "<empty>" label
        if (counts.containsKey("")) counts["<empty>"] = counts[""]
        counts.remove("")

        if (debug) {
                println "\n*** Statistics: "
                for (def k : counts.keySet()) {
                        if (k.size() > 0) { println "$k\t"+counts[k] } else { println "N/A\t"+counts[k] }
                }
        }

        allresults[corpus] = counts
}
151
if (debug) println "RESULTS=$allresults"

// Property actually indexed: the word property is used when set, else the unit property.
// It is the one named in the messages below, so the output matches what was counted.
def displayedProperty = word_property_display ?: unit_property_display

// Union of the values found in all the corpora
def keys = new HashSet()
for (def corpusCounts : allresults.values()) keys.addAll(corpusCounts.keySet())
if (keys.isEmpty()) {
        println "No values for $displayedProperty property"
        return
}
keys = keys.sort()

// Frequency matrix: one row per value, one column per selected corpus
mFactory = DoubleFactory2D.dense
matrix = mFactory.make(keys.size(), selection.size())

if (debug) println "KEYS=$keys"

// Corpus behind the Corpus view selection (the corpus of the partition when a partition is selected)
def corpus = null
if (corpusViewSelection instanceof Partition) {
        corpus = corpusViewSelection.getCorpus()
} else {
        corpus = corpusViewSelection
}

// Plot title
def title = "${corpus.getMainCorpus()}.${corpusViewSelection}\n${unit_ursql}"
title += "[${position_in_matches}]."
if (word_property_display) title += "${word_property_display} frequencies"
else if (unit_property_display) title += "${unit_property_display} frequencies"
if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) title += "\n(${cql_limit} limits)"

// Console header of the index
if (word_property_display) println "Index de la propriété $word_property_display des mots des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
else println "Index de la propriété $unit_property_display des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
println "$displayedProperty\t"+selection.join("\t")
177

    
178
// Fill the frequency matrix (one row per value, one column per corpus) and print the index lines
keys.eachWithIndex { value, row ->
        // first cell: the value itself, or N/A when it is empty
        String line = (value.size() > 0) ? "" + value : "N/A"

        // highest frequency of this value over all the corpora
        int rowMax = 0
        allresults.eachWithIndex { c, corpusCounts, col ->
                def freq = corpusCounts[value]
                if (freq == null) freq = 0

                matrix.set(row, col, freq)
                line += "\t"+freq

                if (freq > rowMax) rowMax = freq
        }

        // the line is printed only when at least one corpus reaches output_fmin
        if (rowMax >= output_fmin) {
                println "$line"
        }
}

props = keys
201

    
202
// Send the frequency table to R as "textpropfreqs", with values as row names and corpora as column names
def r = RWorkspace.getRWorkspaceInstance()
def colnames = selection as String[]
def rownames = props as String[]
r.addVectorToWorkspace("texts", colnames)
r.addVectorToWorkspace("props", rownames)
r.addMatrixToWorkspace("textpropfreqs", matrix as DoubleMatrix2D)
r.eval("rownames(textpropfreqs) = props")
r.eval("colnames(textpropfreqs) = texts")

// SVG file receiving the plot, created in the "results" directory of the TXM home.
// The directory is created when missing: File.createTempFile fails if it does not exist.
def resultsDir = new File(Toolbox.getTxmHomePath(), "results")
resultsDir.mkdirs()
def file = File.createTempFile("txm", ".svg", resultsDir)
213

    
214

    
215
// Draw the histogram of the frequency table into the SVG file
if (output_histogram) {
        def plotScript

        // R expression given as barplot legend; NULL hides the legend
        String legend = output_showlegend ? "rownames(textpropfreqs)" : "NULL"

        // The title is written inside a double-quoted R string: backslashes and double quotes
        // (a CQL limit usually contains some) must be escaped or the R script is broken.
        String rTitle = title.toString().replace("\\", "\\\\").replace("\"", "\\\"")

        if (output_2D) {
                plotScript = """library(RColorBrewer);
barplot(height=textpropfreqs, main="${rTitle}", xlab="Parties", legend=$legend, beside=TRUE, col=rainbow(length(rownames(textpropfreqs))))"""
        } else {
                // NOTE(review): output_2D is set to true at declaration, so this branch looks unreachable; "..." is a placeholder in the R call
                plotScript = """
                   library(latticeExtra)
                   library(lattice)
                   library(RColorBrewer)
                   cloud(value~col+row, ..., panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
                   """
        }

        // execute R script
        r.plot(file, plotScript)
}
241

    
242
// Optionally store the frequency matrix as a lexical table
def lt = null
if (output_lexicaltable) {
        // The table is attached to the first selected item, or to its parent when that item is a Part
        def owner = selection[0]
        println "parent="+owner.getClass()
        if (owner instanceof Part) {
                println "part="+owner
                owner = owner.getParent()
                println "partition="+owner
        }

        lt = new LexicalTable(owner)
        lt.setData(new LexicalTableImpl(matrix as DoubleMatrix2D, rownames, colnames))
        lt.setUserName(schema_ursql+">"+unit_ursql)
}
259

    
260
// Show the results: open the SVG and/or reveal the lexical table in the Corpora view.
// The SVG is usable only when a histogram was requested and the file is not empty.
// File.length() is the size of the file; File.getTotalSpace() is the size of the whole
// partition and is therefore not a valid emptiness test.
boolean histogramReady = output_histogram && file.exists() && file.length() > 0

if (histogramReady) {
        println "SVG file: "+file.getAbsolutePath()
} else if (output_histogram) {
        println "Error: cannot show histogram result with $file"
}

// The lexical table is revealed even when no histogram is available
if (histogramReady || output_lexicaltable) {
        monitor.syncExec(new Runnable() {
                                @Override
                                public void run() {
                                        try {
                                                if (histogramReady) {
                                                        //OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), selection.toString()+" Units")
                                                        OpenBrowser.openfile(file.getAbsolutePath())
                                                }
                                                if (UnitsIndexMacro.this.output_lexicaltable) {
                                                        CorporaView.refreshObject(corpus)
                                                        CorporaView.expand(lt)
                                                }
                                        } catch (e) { println "** ERROR: "+e }
                                }
                        })
}
279

    
280

    
281
/*
282
 d <- read.table(text=' x   y     z
283
 t1   5   high
284
 t1   2   low
285
 t1   4   med
286
 t2   8   high
287
 t2   1   low
288
 t2   3   med
289
 t3  50   high
290
 t3  12   med
291
 t3  35   low', header=TRUE)
292
 library(latticeExtra)
293
 cloud(y~x+z, d, panel.3d.cloud=panel.3dbars, col.facet='grey',
294
 xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1),
295
 par.settings = list(axis.line = list(col = "transparent")))
296
 */