Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / SchemasIndexMacro.groovy @ 2167

History | View | Annotate | Download (5 kB)

1 2167 mdecorde
// Copyright © 2019 ENS de Lyon, CNRS, University of Franche-Comté
2 2167 mdecorde
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3 2167 mdecorde
// @author mdecorde
4 2167 mdecorde
// @author sheiden
5 2167 mdecorde
6 2167 mdecorde
// STANDARD DECLARATIONS
7 2167 mdecorde
package org.txm.macro.urs.exploit
8 2167 mdecorde
9 2167 mdecorde
import groovy.transform.Field
10 2167 mdecorde
11 2167 mdecorde
import org.apache.commons.lang.StringUtils
12 2167 mdecorde
import org.kohsuke.args4j.*
13 2167 mdecorde
import org.txm.Toolbox
14 2167 mdecorde
import org.txm.annotation.urs.*
15 2167 mdecorde
import org.txm.lexicaltable.core.functions.LexicalTable
16 2167 mdecorde
import org.txm.lexicaltable.core.statsengine.r.data.LexicalTableImpl
17 2167 mdecorde
import org.txm.macro.urs.*
18 2167 mdecorde
import org.txm.rcp.commands.*
19 2167 mdecorde
import org.txm.rcp.swt.widget.parameters.*
20 2167 mdecorde
import org.txm.rcp.views.corpora.CorporaView
21 2167 mdecorde
import org.txm.searchengine.cqp.corpus.*
22 2167 mdecorde
import org.txm.searchengine.cqp.CQPSearchEngine
23 2167 mdecorde
import org.txm.statsengine.r.core.RWorkspace
24 2167 mdecorde
25 2167 mdecorde
import visuAnalec.donnees.*
26 2167 mdecorde
import visuAnalec.elements.*
27 2167 mdecorde
import cern.colt.matrix.DoubleFactory2D
28 2167 mdecorde
import cern.colt.matrix.DoubleMatrix2D
29 2167 mdecorde
30 2167 mdecorde
// Build the list of corpora to process from the Corpus view selection:
// a corpus is taken as is, a partition contributes each of its parts.
def selection = []
for (def selected : corpusViewSelections) {
        if (selected instanceof CQPCorpus) {
                selection << selected
        } else if (selected instanceof Partition) {
                selection.addAll(selected.getParts())
        }
}

// Nothing usable was selected: report it and stop the macro.
if (selection.isEmpty()) {
        println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
        return false
}

// Call compute(false) on every retained corpus before it is queried below.
for (def selectedCorpus : selection) {
        selectedCorpus.compute(false)
}
42 2167 mdecorde
43 2167 mdecorde
// BEGINNING OF PARAMETERS
44 2167 mdecorde
// URSQL expression (TYPE@PROP=VALUE) selecting the schemas to index.
@Field @Option(name="schema_ursql", usage="TYPE@PROP=VALUE", widget="String", required=true, def="CHAINE")
String schema_ursql

// Lower bound passed to the schema selection.
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
int minimum_schema_size

// Upper bound passed to the schema selection.
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
int maximum_schema_size

// Name of the schema property whose values are counted and displayed.
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
String schema_property_display

// URSQL expression (TYPE@PROP=VALUE) for units; validated against each corpus below.
@Field @Option(name="unit_ursql", usage="TYPE@PROP=VALUE", widget="String", required=false, def="MENTION")
String unit_ursql

// Forwarded to the schema selection together with the corpus.
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
boolean strict_inclusion

// Threshold compared with the frequencies of a row before the row is printed.
@Field @Option(name="output_fmin", usage="minimal frequency displayed", widget="Integer", required=false, def="0")
def output_fmin

// Debug label chosen in the dialog; converted to a numeric level after the dialog closes.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=true, def="OFF")
def debug
60 2167 mdecorde
61 2167 mdecorde
// Open the parameters dialog; stop the macro when open() does not return true.
if (!ParametersDialog.open(this)) {
        return
}

// Replace the debug label by its numeric level (OFF=0 ... REALLY ALL=3).
// Any other value is left untouched.
switch (debug) {
        case "OFF":
                debug = 0
                break
        case "ON":
                debug = 1
                break
        case "ALL":
                debug = 2
                break
        case "REALLY ALL":
                debug = 3
                break
}
63 2167 mdecorde
64 2167 mdecorde
// Per-corpus results: corpus -> (displayed property value -> number of schemas).
def allresults = [:]
for (def corpus : selection) {
        def analecCorpus = URSCorpora.getCorpus(corpus)

        // Validate the three user-supplied expressions against this corpus.
        // The first failure stops the whole macro, not only this corpus.
        def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
        if (errors.size() > 0) {
                println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
                return
        }

        errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_property_display)
        if (errors.size() > 0) {
                println "** $schema_property_display schema property not found in the corpus with types: $errors."
                return
        }

        errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
        if (errors.size() > 0) {
                println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
                return
        }

        def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion)
        schemas.sort() { it.getProps() }

        // Count the schemas per value of the displayed property.
        // A missing property is counted under "<null>". The null test is explicit
        // on purpose: an Elvis operator would also catch the empty string, which
        // must keep its own entry (renamed just below).
        def counts = schemas.countBy { schema ->
                def value = schema.getProp(schema_property_display)
                value == null ? "<null>" : value
        }

        // Move the count of the empty value under the "<empty>" label.
        if (counts.containsKey("")) {
                counts["<empty>"] = counts.remove("")
        }

        if (debug) {
                println "\n*** Statistics: "
                for (def k : counts.keySet()) {
                        if (k.size() > 0) {
                                println "$k\t"+counts[k]
                        } else {
                                println "N/A\t"+counts[k]
                        }
                }
        }

        allresults[corpus] = counts
}
105 2167 mdecorde
106 2167 mdecorde
if (debug) println "RESULTS=$allresults"

// Union of the property values found in at least one corpus.
def keys = new HashSet()
for (def counts : allresults.values()) {
        keys.addAll(counts.keySet())
}
if (keys.size() == 0) {
        println "No values for $schema_property_display property"
        return
}
keys = keys.sort()

// One row per property value, one column per selected corpus.
// mFactory and matrix are assigned without 'def', as in the original script.
mFactory = DoubleFactory2D.dense
matrix = mFactory.make(keys.size(), selection.size())

if (debug) println "KEYS=$keys"

// NOTE(review): title is not used in the visible part of the script.
def title = "${selection}\n${unit_ursql}"
title += " ${schema_property_display} frequencies"

println "Index de la propriété $schema_property_display des schemas $schema_ursql de ${selection}"
println "$schema_property_display\t"+selection.join("\t")

keys.eachWithIndex { prop_val, i ->
        String line = ""
        if (prop_val.size() > 0) {
                line += prop_val
        } else {
                line += "N/A"
        }

        // Highest frequency of this value over all corpora (the original called it 'min').
        int maxFreq = 0

        // Iterate over 'selection', not over the keys of 'allresults', so that the
        // data columns always line up with the header printed above and with the
        // matrix width, both of which are derived from 'selection'.
        selection.eachWithIndex { c, j ->
                def freq = allresults[c][prop_val]
                if (freq == null) freq = 0

                matrix.set(i, j, freq)
                line += "\t"+freq
                if (maxFreq < freq) maxFreq = freq
        }

        // The row is printed when at least one corpus reaches the output_fmin threshold.
        if (maxFreq >= output_fmin) {
                println "$line"
        }
}
props = keys
148 2167 mdecorde