Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / SchemaAccessibilityScoresMacro.groovy @ 2144

History | View | Annotate | Download (5.2 kB)

1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5

    
6
// STANDARD DECLARATIONS
7
package org.txm.macro.urs.exploit
8

    
9
import org.kohsuke.args4j.*
10
import groovy.transform.Field
11
import org.txm.rcp.swt.widget.parameters.*
12
import org.txm.annotation.urs.*
13
import org.txm.macro.urs.AnalecUtils
14
import visuAnalec.elements.*
15
import org.txm.searchengine.cqp.corpus.*
16
import org.txm.Toolbox
17
import org.txm.rcp.commands.*
18
import org.txm.statsengine.r.core.RWorkspace
19

    
20
// Build the list of corpora to process from the Corpus view selection:
// a selected corpus is taken as is, a selected partition contributes all of its parts.
// Any other kind of selected object is ignored.
def selection = []
for (def selected : corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection.add(selected)
	} else if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable selected: report it and stop the macro.
if (selection.isEmpty()) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// Call compute(false) on every selected corpus before it is queried below.
for (def c : selection) {
	c.compute(false)
}
32

    
33
// MACRO PARAMETERS
// Each @Field/@Option pair declares one script field; the fields are filled in
// when ParametersDialog.open(this) is called further down.

// URSQL expression handed to AnalecUtils.isPropertyDefined and AnalecUtils.selectSchemasInCorpus to pick the schemas.
@Field @Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
	String schema_ursql

// Size bounds handed to AnalecUtils.selectSchemasInCorpus.
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
	int minimum_schema_size
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
	int maximum_schema_size

// Name of the schema property read with schema.getProp(...) for display.
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
	String schema_property_display

// URSQL expression handed to AnalecUtils.isPropertyDefined and AnalecUtils.filterElements to pick the units.
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
	String unit_ursql

// Both values are forwarded to AnalecUtils.filterUniteByInclusion together with the corpus matches;
// strict_inclusion is also forwarded to AnalecUtils.selectSchemasInCorpus.
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
	boolean strict_inclusion
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
	int position_in_matches

// Separator widget entry; the field is not read anywhere else in this script.
// NOTE(review): its usage/metaVar texts are the same as sort_column's - confirm this is intended.
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq        len", required=true, def="")
	def sep

// "freq" sorts the result index by frequency first; any other value sorts it by key first.
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq        len", required=true, def="freq")
	String sort_column

// Maximum number of index lines printed; values <= 0 disable the cut.
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
	int max_lines

// The chart option is disabled: its annotation is commented out and the value is forced to false.
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
	output_graph = false

// Debug level chosen as a label; it is converted to a number right after the dialog is closed.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=true, def="OFF")
	debug
58

    
59
// Ask for the parameter values; stop the macro when the dialog does not report success.
if (!ParametersDialog.open(this)) return;

// Translate the debug label into the numeric level handed to the AnalecUtils calls below.
// A value that is not one of the four labels is left untouched.
def debugLevels = ["OFF": 0, "ON": 1, "ALL": 2, "REALLY ALL": 3]
if (debug instanceof CharSequence && debugLevels.containsKey(debug.toString())) {
	debug = debugLevels[debug.toString()]
}
61

    
62
// For every selected corpus: select the schemas, score each of them, then print
// the average score and an index of "score -> number of schemas" with the
// display property of the schemas sharing that score.
for (def corpus : selection) {
	def analecCorpus = URSCorpora.getCorpus(corpus)

	// A non-empty result from isPropertyDefined is reported and stops the whole macro.
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
	if (errors.size() > 0) {
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
	if (errors.size() > 0) {
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
		return;
	}

	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);

	def scores = [:]    // schema -> score (filled for a future per-schema result, not read below)
	def lens = [:]      // score -> number of schemas having that score
	def lensnames = [:] // score -> display property values of those schemas

	for (def schema : schemas) {
		def allUnites = schema.getUnitesSousjacentesNonTriees()
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)
		def selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)

		// NOTE(review): the original score expression was cut off ("units.su").
		// Until the intended accessibility formula is confirmed, the score is the
		// number of units of the schema kept by the two filters above.
		int score = selectedUnits.size()
		scores[schema] = score

		if (!lens.containsKey(score)) {
			lens[score] = 0
			lensnames[score] = []
		}
		lens[score]++
		lensnames[score] << schema.getProp(schema_property_display)
	}

	// t = sum of all schema scores, n = number of scored schemas.
	int t = 0;
	int n = 0;
	for (def entry : lens) {
		t += entry.key * entry.value
		n += entry.value
	}

	if (n == 0) {
		println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
		return false;
	}

	def coef = (t/n)

	// Descending order: by frequency then score when sort_column is "freq", by score then frequency otherwise.
	def slens = null
	if ("freq".equals(sort_column)) {
		slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
	} else {
		slens = lens.sort { a, b -> -a.key <=> -b.key ?: -a.value <=> -b.value }
	}

	println "Schema scores in '$corpus'. The average schema score is $t/$n = "+coef
	println "Index:\nlen\tfreq\tcfreq"
	int nShown = 0;
	def cfreq = 0 // running total of the frequencies printed so far
	for (def entry : slens) {
		println entry.key+"        "+entry.value+"        "+(cfreq+=entry.value)+"        "+lensnames[entry.key].join(", ")
		nShown++
		// Stop after max_lines lines and print how many index lines were left out.
		if (max_lines > 0 && nShown >= max_lines) {
			println "... (${slens.size() - max_lines})"
			break;
		}
	}

	//return ["result":coef, "data":lens]
}
147

    
148