root / tmp / org.txm.analec.rcp / src / org / txm / macro / analec / misc / UnitsCorrelationMacro.groovy @ 875
History | View | Annotate | Download (7.3 kB)
1 |
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
|
---|---|
2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
|
3 |
// @author mdecorde
|
4 |
// @author sheiden
|
5 |
// STANDARD DECLARATIONS
|
6 |
package org.txm.macro.analec.misc
|
7 |
|
8 |
import org.apache.commons.lang.StringUtils |
9 |
import org.txm.rcp.views.corpora.CorporaView |
10 |
import groovy.transform.Field |
11 |
|
12 |
import org.kohsuke.args4j.* |
13 |
import org.txm.Toolbox |
14 |
import org.txm.analec.* |
15 |
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl |
16 |
import org.txm.macro.analec.* |
17 |
import org.txm.rcp.commands.* |
18 |
import org.txm.rcp.swt.widget.parameters.* |
19 |
import org.txm.searchengine.cqp.corpus.* |
20 |
import org.txm.searchengine.cqp.corpus.query.Query; |
21 |
|
22 |
import visuAnalec.donnees.* |
23 |
import visuAnalec.elements.* |
24 |
import cern.colt.matrix.DoubleFactory2D |
25 |
import cern.colt.matrix.DoubleMatrix2D |
26 |
|
27 |
def scriptName = this.class.getSimpleName() |
28 |
def parent
|
29 |
def selection = [] |
30 |
if (!(corpusViewSelection instanceof Corpus)) { |
31 |
println "** $scriptName please select a Corpus to run the macro"
|
32 |
} |
33 |
|
34 |
|
35 |
// BEGINNING OF PARAMETERS
|
36 |
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="CHAINE") |
37 |
String schema_ursql
|
38 |
@Field @Option(name="minimum_schema_size", usage="Minimum size needed to consider a schema", widget="Integer", required=true, def="3") |
39 |
int minimum_schema_size
|
40 |
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999") |
41 |
int maximum_schema_size
|
42 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION") |
43 |
String unit_ursql
|
44 |
@Field @Option(name="limit_distance_in_schema", usage="Unit distance in schema (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0") |
45 |
int limit_distance_in_schema
|
46 |
@Field @Option(name="limit_cql", usage="CQL to build structure limits", widget="Query", required=true, def="<div> [] expand to div") |
47 |
limit_cql |
48 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true") |
49 |
boolean strict_inclusion
|
50 |
@Field @Option(name="limit_distance", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0") |
51 |
int limit_distance
|
52 |
@Field @Option(name="unit_prop1", usage="PROP1", widget="String", required=false, def="PROP1") |
53 |
String unit_prop1
|
54 |
@Field @Option(name="unit_prop2", usage="PROP2", widget="String", required=false, def="PROP2") |
55 |
String unit_prop2
|
56 |
@Field @Option(name="corr_method", usage="try them all", widget="StringArray", metaVar="pearson spearman kendall", required=false, def="pearson") |
57 |
String corr_method
|
58 |
@Field @Option(name="corr_style", usage="try them all", widget="StringArray", metaVar="circle square ellipse number shade color pie", required=false, def="number") |
59 |
String corr_style
|
60 |
@Field @Option(name="corr_layout", usage="try them all", widget="StringArray", metaVar="full lower upper", required=false, def="upper") |
61 |
String corr_layout
|
62 |
@Field @Option(name="corr_order", usage="try them all", widget="StringArray", metaVar="AOE FPC hclust alphabet", required=false, def="hclust") |
63 |
String corr_order
|
64 |
@Field @Option(name="output_lexicaltable", usage="create or not a lexical table with the result", widget="Boolean", required=true, def="false") |
65 |
output_lexicaltable |
66 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
67 |
debug |
68 |
if (!ParametersDialog.open(this)) return |
69 |
if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3 |
70 |
|
71 |
|
72 |
def CQI = CQPSearchEngine.getCqiClient()
|
73 |
|
74 |
def correlations = [:]
|
75 |
def values1 = new HashSet() |
76 |
def values2 = new HashSet() |
77 |
def corpus = corpusViewSelection
|
78 |
|
79 |
mainCorpus = corpus.getMainCorpus() |
80 |
|
81 |
def word = mainCorpus.getWordProperty()
|
82 |
def analecCorpus = AnalecCorpora.getCorpus(mainCorpus.getName())
|
83 |
|
84 |
|
85 |
|
86 |
def selectedUnits = AnalecUtils.selectUnitsInSchema(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size,
|
87 |
unit_ursql, limit_distance_in_schema, limit_cql, strict_inclusion, limit_distance); |
88 |
|
89 |
for (def unit : selectedUnits) { |
90 |
def value1 = unit.getProp(unit_prop1)
|
91 |
if (value1 == null) value1 = "<null>" |
92 |
if (value1.length() == 0) value1 = "<empty>" |
93 |
def value2 = unit.getProp(unit_prop2)
|
94 |
if (value2 == null) value2 = "<null>" |
95 |
if (value2.length() == 0) value2 = "<empty>" |
96 |
|
97 |
values1 << value1 |
98 |
values2 << value2 |
99 |
|
100 |
if (!correlations.containsKey(value1)) correlations[value1] = [:]
|
101 |
def line = correlations[value1]
|
102 |
if (!line.containsKey(value2)) line[value2] = 0 |
103 |
line[value2] += 1
|
104 |
} |
105 |
|
106 |
def matrix = new int[values1.size()][values2.size()]; |
107 |
println "\t"+values2.join("\t") |
108 |
int i = 0; |
109 |
for (def value1 : values1) { |
110 |
print value1 |
111 |
int j = 0; |
112 |
for (def value2 : values2) { |
113 |
if (correlations[value1][value2] == null) correlations[value1][value2] = 0; |
114 |
print "\t"+correlations[value1][value2]
|
115 |
|
116 |
matrix[i][j] = correlations[value1][value2] |
117 |
j++ |
118 |
} |
119 |
println ""
|
120 |
i++ |
121 |
} |
122 |
|
123 |
def r = RWorkspace.getRWorkspaceInstance()
|
124 |
r.addVectorToWorkspace("corrlines", values1 as String[]) |
125 |
r.addVectorToWorkspace("corrcols", values2 as String[]) |
126 |
r.addMatrixToWorkspace("corrmatrix", matrix)
|
127 |
r.eval("rownames(corrmatrix) = corrlines")
|
128 |
r.eval("colnames(corrmatrix) = corrcols")
|
129 |
|
130 |
def resultsDir = new File(Toolbox.getTxmHomePath(), "results") |
131 |
resultsDir.mkdirs() |
132 |
file = File.createTempFile("txm_corr_pairs_", ".svg", resultsDir) |
133 |
|
134 |
|
135 |
|
136 |
def title = "${corpus.getMainCorpus()}.${corpus}\n${unit_ursql}" |
137 |
if (limit_distance > 1) title += "[${limit_distance}]." |
138 |
if (limit_cql != null && !limit_cql.getQueryString().equals("\"\"")) title += "\n(${limit_cql} limits)" |
139 |
title += "\t P1=$unit_prop1 P2=unit_prop2"
|
140 |
|
141 |
def plotScript = """ |
142 |
|
143 |
r1 = cor(corrmatrix, use="complete.obs", method="$corr_method");
|
144 |
r2 = cov(corrmatrix, use="complete.obs") ;
|
145 |
|
146 |
library(corrplot)
|
147 |
corrplot(r1, type="$corr_layout", order="$corr_order", method="$corr_style")
|
148 |
"""
|
149 |
|
150 |
|
151 |
// execute R script
|
152 |
if (!output_lexicaltable) {
|
153 |
r.plot(file, plotScript) |
154 |
} |
155 |
title = "$unit_prop1 $corr_method correlations"
|
156 |
|
157 |
|
158 |
def lt = null; |
159 |
if (output_lexicaltable) {
|
160 |
mFactory = DoubleFactory2D.dense |
161 |
dmatrix = mFactory.make(values1.size(), values2.size()) |
162 |
for (int ii = 0 ; ii < values1.size() ; ii++) { |
163 |
for (int jj = 0 ; jj < values2.size() ; jj++) { |
164 |
dmatrix.set(ii, jj, matrix[ii][jj]) |
165 |
} |
166 |
} |
167 |
if (corpusViewSelection instanceof Partition) { |
168 |
lt = new LexicalTableImpl(dmatrix, corpusViewSelection, corpusViewSelection.getCorpus().getProperty("word"), |
169 |
values1 as String[], values2 as String[]) |
170 |
lt.setCorpus(corpusViewSelection.getCorpus()); |
171 |
corpusViewSelection.storeResult(lt) |
172 |
} else {
|
173 |
lt = new LexicalTableImpl(dmatrix, corpus.getProperty("word"), |
174 |
values1 as String[], values2 as String[]) |
175 |
lt.setCorpus(corpus); |
176 |
corpus.storeResult(lt) |
177 |
} |
178 |
} |
179 |
|
180 |
|
181 |
|
182 |
|
183 |
monitor.syncExec(new Runnable() { |
184 |
@Override
|
185 |
public void run() { try { |
186 |
|
187 |
|
188 |
|
189 |
if (UnitsCorrelationMacro.this.output_lexicaltable) {
|
190 |
CorporaView.refreshObject(corpus) |
191 |
CorporaView.expand(lt) |
192 |
} else {
|
193 |
OpenSVGGraph.OpenSVGFile(UnitsCorrelationMacro.this.file.getAbsolutePath(), "Correlations Units")
|
194 |
} |
195 |
} catch (e) { e.printStackTrace() }}
|
196 |
}) |
197 |
|
198 |
|
199 |
return correlations
|