Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / UnitsIndexMacro.groovy @ 2144

History | View | Annotate | Download (11.2 kB)

1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5
// STANDARD DECLARATIONS
6
package org.txm.macro.urs.exploit
7

    
8
import groovy.transform.Field
9

    
10
import org.apache.commons.lang.StringUtils
11
import org.kohsuke.args4j.*
12
import org.txm.Toolbox
13
import org.txm.annotation.urs.*
14
import org.txm.lexicaltable.core.functions.LexicalTable
15
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl
16
import org.txm.macro.urs.*
17
import org.txm.rcp.commands.*
18
import org.txm.rcp.swt.widget.parameters.*
19
import org.txm.rcp.views.corpora.CorporaView
20
import org.txm.searchengine.cqp.corpus.*
21
import org.txm.searchengine.cqp.CQPSearchEngine
22
import org.txm.statsengine.r.core.RWorkspace
23

    
24
import visuAnalec.donnees.*
25
import visuAnalec.elements.*
26
import cern.colt.matrix.DoubleFactory2D
27
import cern.colt.matrix.DoubleMatrix2D
28

    
29
// Simple name of the script class, used to prefix the message shown to the user.
def scriptName = this.class.getSimpleName()

// Flatten the Corpus view selection into a list of corpora:
// a corpus is kept as is, a partition is replaced by its parts,
// anything else is ignored.
def selection = []
for (def selected : corpusViewSelections) {
    if (selected instanceof CQPCorpus) {
        selection.add(selected)
    } else if (selected instanceof Partition) {
        selection.addAll(selected.getParts())
    }
}

// Nothing usable was selected: tell the user and stop the macro.
if (selection.isEmpty()) {
    println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
    return false
}

// compute(false) is called on every selected corpus before it is queried below.
for (def selectedCorpus : selection) {
    selectedCorpus.compute(false)
}
43

    
44
// BEGINNING OF PARAMETERS
// Every @Field/@Option pair below declares one macro parameter. The values are
// filled in by ParametersDialog.open(this) further down; the "usage" text of
// each option is its description and "def" its default value.

// Schema selection
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="CHAINE")
        String schema_ursql
@Field @Option(name="minimum_schema_size", usage="Minimum size needed to consider a schema", widget="Integer", required=false, def="3")
        int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=false, def="9999999")
        int maximum_schema_size

// Unit selection and displayed property
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
        String unit_ursql
@Field @Option(name="unit_property_display", usage="Unit property to count", widget="String", required=false, def="CATEGORIE")
        String unit_property_display
@Field @Option(name="word_property_display", usage="Word property to display instead of the unit property", widget="String", required=false, def="")
        String word_property_display
@Field @Option(name="position_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
        int position_in_schema

// Structure limits
@Field @Option(name="cql_limit", usage="CQL to build structure limits", widget="Query", required=false, def="")
        cql_limit
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=false, def="true")
        strict_inclusion
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=false, def="0")
        position_in_matches

// Output options. output_2D is not exposed in the dialog: it is forced to true.
//@Field @Option(name="output_2D", usage="output barplot or 3D plot", widget="Boolean", required=true, def="true")
        output_2D = true
@Field @Option(name="output_showlegend", usage="output barplot or 3D plot", widget="Boolean", required=false, def="true")
        output_showlegend
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=false, def="0")
        output_fmin
@Field @Option(name="output_histogram", usage="show or not a histogram of the result", widget="Boolean", required=false, def="true")
        output_histogram
@Field @Option(name="output_lexicaltable", usage="create or not a lexical table with the result", widget="Boolean", required=false, def="false")
        output_lexicaltable
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=false, def="OFF")
        debug

// Stop silently when the parameters dialog is not validated.
if (!ParametersDialog.open(this)) return

// Turn the debug label into a numeric level (0 is falsy, so "if (debug)" works).
// Any other value is left untouched.
if (debug == "OFF") {
    debug = 0
} else if (debug == "ON") {
    debug = 1
} else if (debug == "ALL") {
    debug = 2
} else if (debug == "REALLY ALL") {
    debug = 3
}

// Normalize missing values to the empty string once, so that every later
// ".length()" test on these two parameters is safe.
if (word_property_display == null) word_property_display = ""
if (unit_property_display == null) unit_property_display = ""

// Both properties set: the word property takes precedence, warn the user.
if (word_property_display.length() > 0 && unit_property_display.length() > 0) {
    println "Warning: both unit_property_display=$unit_property_display and word_property_display=$word_property_display are set. The index will be computed with the $word_property_display CQP property."
}

// No property at all: there is nothing to index.
if (word_property_display.length() == 0 && unit_property_display.length() == 0) {
    println "Error: no analysis property specified in unit_property_display or word_property_display. Aborting."
    return
}
89

    
90
def CQI = CQPSearchEngine.getCqiClient()

// For each selected corpus: map of displayed value -> number of selected units.
def allresults = [:]
for (def corpus : selection) {

    def analecCorpus = URSCorpora.getCorpus(corpus)

    // Check the schema and unit URSQL expressions against the annotation
    // structure; a non-empty result lists what is missing and aborts the macro.
    def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
    if (errors.size() > 0) {
        println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
        return
    }

    errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
    if (errors.size() > 0) {
        println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
        return
    }

    if (unit_property_display != null && unit_property_display.length() > 0) {
        // Keep the collection itself (not its size) so that the message names
        // the offending unit types, like the two checks above.
        errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, AnalecUtils.getFilterParameters(unit_ursql)[0], unit_property_display)
        if (errors.size() > 0) {
            println "Error: some Unit types don't contain the $unit_property_display property: $errors"
            return
        }
    }

    def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size,
            unit_ursql, position_in_schema, cql_limit, strict_inclusion, position_in_matches)
    // A unit returned several times must be counted only once.
    selectedUnits = new HashSet(selectedUnits)

    def counts = null
    if (word_property_display != null && word_property_display.length() > 0) {
        // Count units by the word property values of the positions they cover.
        Property p = corpus.getProperty(word_property_display)
        if (p == null) {
            // Guard against a property name that the corpus does not resolve.
            println "Error: no '$word_property_display' word property found in $corpus. Aborting."
            return
        }
        counts = [:]
        for (def unit : selectedUnits) {
            // All positions from getDeb() to getFin(); a one-word unit yields
            // a single-element range.
            int[] pos = (unit.getDeb()..unit.getFin())

            String s = StringUtils.join(CQI.cpos2Str(p.getQualifiedName(), pos), " ")
            counts[s] = (counts.containsKey(s) ? counts[s] : 0) + 1
        }
    } else { // use Analec unit property
        counts = selectedUnits.countBy {
            def value = it.getProp(unit_property_display)
            value == null ? "<null>" : value
        }
    }

    // Give the empty value a readable label.
    if (counts.containsKey("")) counts["<empty>"] = counts[""]
    counts.remove("")

    if (debug) {
        println "\n*** Statistics: "
        for (def k : counts.keySet()) {
            if (k.size() > 0) { println "$k\t"+counts[k] } else { println "N/A\t"+counts[k] }
        }
    }

    allresults[corpus] = counts
}
149
if (debug) {
    println "RESULTS=$allresults"
}

// Union of the values observed in any corpus: one matrix row per value.
def keys = new HashSet()
for (def corpusCounts : allresults.values()) {
    keys.addAll(corpusCounts.keySet())
}
if (keys.isEmpty()) {
    println "No values for $unit_property_display property"
    return
}

// Sorted row labels; the matrix has one column per selected corpus.
keys = keys.sort()
mFactory = DoubleFactory2D.dense
matrix = mFactory.make(keys.size(), selection.size())

if (debug) {
    println "KEYS=$keys"
}
161

    
162
// Corpus used in the titles: the selected corpus itself, or the corpus of the
// selected partition.
def corpus = null
if (corpusViewSelection instanceof Partition) { corpus = corpusViewSelection.getCorpus() }
else { corpus = corpusViewSelection }

// Name of the property the index was really computed on: the word property
// takes precedence over the unit property.
def indexedProperty = word_property_display ?: (unit_property_display ?: "")

// Title of the plot.
def title = "${corpus.getMainCorpus()}.${corpusViewSelection}\n${unit_ursql}"
title += "[${position_in_matches}]."
if (indexedProperty.length() > 0) title += "${indexedProperty} frequencies"
if (cql_limit != null && !cql_limit.getQueryString().equals("\"\"")) title += "\n(${cql_limit} limits)"

// Console header of the index.
if (word_property_display) println "Index de la propriété $word_property_display des mots des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"
else println "Index de la propriété $unit_property_display des unités $unit_ursql[$position_in_matches] de ${corpus.getMainCorpus()}.${corpusViewSelection}"

// Column header: label the first column with the property actually indexed
// (it used to show unit_property_display even for a word property index).
println "$indexedProperty\t"+selection.join("\t")
175

    
176
// Fill the frequency matrix (rows = values, columns = corpora) and print one
// tab-separated line per value.
for (int row = 0; row < keys.size(); row++) {
    def value = keys[row]

    String line = ""
    if (value.size() > 0) {
        line += value
    } else {
        line += "N/A"
    }

    // Highest frequency of this value over all corpora.
    int rowMax = 0
    int col = 0
    for (def c : allresults.keySet()) {
        def freq = allresults[c][value]
        if (freq == null) {
            freq = 0
        }

        matrix.set(row, col, freq)
        line += "\t"+freq

        if (rowMax < freq) {
            rowMax = freq
        }
        col++
    }

    // The line is printed only when at least one corpus reaches output_fmin;
    // the matrix keeps every row regardless.
    if (rowMax >= output_fmin) {
        println "$line"
    }
}
197

    
198
props = keys

// Row and column labels of the frequency matrix.
String[] colnames = selection as String[]
String[] rownames = props as String[]

// Push the labels and the matrix to the R workspace, then name the matrix
// dimensions on the R side.
def r = RWorkspace.getRWorkspaceInstance()
r.addVectorToWorkspace("texts", colnames)
r.addVectorToWorkspace("props", rownames)
r.addMatrixToWorkspace("textpropfreqs", matrix as DoubleMatrix2D)
for (String assignment : ["rownames(textpropfreqs) = props", "colnames(textpropfreqs) = texts"]) {
    r.eval(assignment)
}

// Temporary SVG file created in the "results" directory of the TXM home.
def resultsDir = new File(Toolbox.getTxmHomePath(), "results")
def file = File.createTempFile("txm", ".svg", resultsDir)
211

    
212

    
213
//if (corpusViewSelection.getClass() == org.txm.searchengine.cqp.corpus.Partition) { corpus = corpusViewSelection.getParent() } else { corpus = corpusViewSelection }
214
// Draw the histogram of the frequency matrix into the SVG file.
if (output_histogram) {
    // The title is spliced into an R string literal: escape backslashes and
    // double quotes (a CQL limit usually contains quotes) so the script stays valid.
    String rTitle = title.toString().replace("\\", "\\\\").replace("\"", "\\\"")

    // R expression used for the legend; NULL hides it.
    String legend = output_showlegend ? "rownames(textpropfreqs)" : "NULL"

    def plotScript
    if (output_2D) {
        plotScript = """library(RColorBrewer);
barplot(height=textpropfreqs, main="${rTitle}", xlab="Parties", legend=$legend, beside=TRUE, col=rainbow(length(rownames(textpropfreqs))))"""
    } else {
        // NOTE(review): output_2D is forced to true above, so this branch looks
        // unreachable; the "..." in the cloud() call is an unfinished placeholder.
        plotScript = """
                   library(latticeExtra)
                   library(lattice)
                   library(RColorBrewer)
                   cloud(value~col+row, ..., panel.3d.cloud=panel.3dbars, col.facet='grey', xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1), par.settings = list(axis.line = list(col = "transparent")))
                   """
    }

    // execute R script
    r.plot(file, plotScript)
}
239

    
240
// Optionally wrap the frequency matrix in a LexicalTable result.
def lt = null
if (output_lexicaltable) {
    def parent = selection[0]
    println "parent="+parent.getClass()

    // A part is replaced by its parent so that the table is attached to it.
    if (parent instanceof Part) {
        println "part="+parent
        parent = parent.getParent()
        println "partition="+parent
    }

    lt = new LexicalTable(parent)
    lt.setData(new LexicalTableImpl(matrix as DoubleMatrix2D, rownames, colnames))
    lt.setUserName(schema_ursql+">"+unit_ursql)
}
257

    
258
// The histogram can be shown only if it was requested and the SVG file was
// really written. File.length() is the file size; File.getTotalSpace() is the
// size of the whole partition and was therefore always > 0.
boolean histogramReady = output_histogram && file.exists() && file.length() > 0
if (histogramReady) {
    println "SVG file: "+file.getAbsolutePath()
} else if (output_histogram) {
    println "Error: cannot show histogram result with $file"
}

// Open the results from the UI thread. The lexical table is refreshed even
// when no histogram is available.
if (histogramReady || output_lexicaltable) {
    monitor.syncExec(new Runnable() {
        @Override
        public void run() {
            try {
                if (histogramReady) {
                    //OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), selection.toString()+" Units")
                    OpenBrowser.openfile(file.getAbsolutePath())
                }
                if (UnitsIndexMacro.this.output_lexicaltable) {
                    CorporaView.refreshObject(corpus)
                    CorporaView.expand(lt)
                }
            } catch (e) {
                // Report the failure on the console instead of breaking the UI thread.
                println "** ERROR: "+e
            }
        }
    })
}
277

    
278

    
279
/*
280
 d <- read.table(text=' x   y     z
281
 t1   5   high
282
 t1   2   low
283
 t1   4   med
284
 t2   8   high
285
 t2   1   low
286
 t2   3   med
287
 t3  50   high
288
 t3  12   med
289
 t3  35   low', header=TRUE)
290
 library(latticeExtra)
291
 cloud(y~x+z, d, panel.3d.cloud=panel.3dbars, col.facet='grey',
292
 xbase=0.4, ybase=0.4, scales=list(arrows=FALSE, col=1),
293
 par.settings = list(axis.line = list(col = "transparent")))
294
 */