Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / analec / LongueurMoyenneMacro.groovy @ 1166

History | View | Annotate | Download (4.9 kB)

1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// @author sheiden
5

    
6
// STANDARD DECLARATIONS
7
package org.txm.macro.analec
8

    
9
import org.kohsuke.args4j.*
10

    
11
import groovy.transform.Field
12

    
13
import org.txm.rcp.swt.widget.parameters.*
14
import org.txm.analec.*
15
import org.txm.searchengine.cqp.corpus.*
16
import org.txm.statsengine.r.core.RWorkspace;
17
import org.txm.Toolbox
18
import org.txm.rcp.commands.*
19

    
20
// BEGINNING OF PARAMETERS
// Macro parameters. Each one is a script field carrying @Option metadata
// (presumably what ParametersDialog.open(this) uses to build its form -- confirm against TXM).

// Type of the schemas whose lengths are measured.
@Field
@Option(name="schema_type", usage="", widget="String", required=true, def="Coréférence")
String schema_type

// Schemas with fewer counted units than this are left out of the statistics.
@Field
@Option(name="minimum_schema_size", usage="", widget="Integer", required=true, def="3")
int minimum_schema_size

// Optional schema filter: property name...
@Field
@Option(name="schema_property_name", usage="", widget="String", required=false, def="")
String schema_property_name

// ...and the regular expression its value must match.
@Field
@Option(name="schema_property_value", usage="", widget="String", required=false, def=".*")
String schema_property_value

// Only units of this type are counted inside a schema.
@Field
@Option(name="unit_type", usage="", widget="String", required=false, def="Maillon")
String unit_type

// Optional unit filter: property name...
@Field
@Option(name="unit_property_name", usage="", widget="String", required=false, def="")
String unit_property_name

// ...and the regular expression its value must match.
@Field
@Option(name="unit_property_value", usage="", widget="String", required=false, def=".*")
String unit_property_value

// The macro only runs when the current selection is a MainCorpus.
boolean selectionIsMainCorpus = corpusViewSelection instanceof MainCorpus
if (!selectionIsMainCorpus) {
    println "Corpora selection is not a Corpus"
    return
}

// Show the parameters dialog; stop when open() does not return true.
boolean parametersAccepted = ParametersDialog.open(this)
if (!parametersAccepted) {
    return
}
// END OF PARAMETERS
49

    
50
// Resolve the Analec annotation corpus attached to the selected TXM corpus.
MainCorpus corpus = corpusViewSelection
def analecCorpus = AnalecCorpora.getCorpus(corpus)
def structure = analecCorpus.getStructure()

// check Schema parameters: the schema type must be declared in the annotation structure
if (!structure.getSchemas().contains(schema_type)) {
    println "No schema with name=$schema_type"
    return
}
// the schema property filter is only active when both its name and its pattern are set
if (schema_property_name.length() > 0 && schema_property_value.length() > 0) {
    // test property existence
    if (!structure.getSchemaProperties(schema_type).contains(schema_property_name)) {
        println "Schema $schema_type has no property named $schema_property_name"
        return
    }
}

// check unit parameters
// unit_type is optional: the counting loop further down only filters on the unit type
// when it is non-empty, so an empty value must not be rejected here. Without a unit type
// there is no property list to check the unit property name against, so that check is
// skipped as well.
if (unit_type.length() > 0) {
    if (!structure.getUnites().contains(unit_type)) {
        println "No unit with name=$unit_type"
        return
    }
    // the unit property filter is only active when both its name and its pattern are set
    if (unit_property_name.length() > 0 && unit_property_value.length() > 0) {
        // test property existence
        if (!structure.getUniteProperties(unit_type).contains(unit_property_name)) {
            println "Unit $unit_type has no property named $unit_property_name"
            return
        }
    }
}
82

    
83
// All schemas of the requested type.
def schemas = analecCorpus.getSchemas(schema_type)

// Number of schemas retained in the statistics.
int nSchemas = 0

// lens: schema length (number of counted units) -> number of schemas having that length
def lens = [:]

// A property filter applies only when both its name and its pattern are non-empty;
// the unit type filter applies only when unit_type is non-empty.
boolean schemaFilterOn = schema_property_name.length() > 0 && schema_property_value.length() > 0
boolean unitFilterOn = unit_property_name.length() > 0 && unit_property_value.length() > 0
boolean unitTypeFilterOn = unit_type.length() > 0

for (def schema : schemas) {

    // skip schemas whose property value does not match the pattern
    if (schemaFilterOn && !schema.getProp(schema_property_name).matches(schema_property_value)) {
        continue
    }

    // count the underlying units that pass the type filter, then the property filter
    int nUnites = 0
    for (def unit : schema.getUnitesSousjacentesNonTriees()) {
        boolean typeAccepted = !unitTypeFilterOn || unit.getType().equals(unit_type)
        // the property is only read for units that passed the type filter
        boolean unitAccepted = typeAccepted &&
                (!unitFilterOn || unit.getProp(unit_property_name).matches(unit_property_value))
        if (unitAccepted) {
            nUnites++
        }
    }

    // schemas below the minimum size are not part of the distribution
    if (nUnites >= minimum_schema_size) {
        lens[nUnites] = (lens[nUnites] ?: 0) + 1
        nSchemas++
    }
}
122

    
123
//println "nSchemas=$nSchemas"

// t = total number of counted units, n = number of retained schemas,
// both accumulated over the length -> frequency map.
int t = 0
int n = 0
for (def entry : lens.entrySet()) {
    t += entry.key * entry.value
    n += entry.value
}

// No schema passed the filters: the mean is undefined (t/n would throw an
// ArithmeticException) and there is nothing to plot, so stop here.
if (n == 0) {
    println "No schema matches the parameters: the mean length cannot be computed"
    return ["result":null, "data":lens]
}

// mean schema length; coef is left undeclared on purpose (script binding variable),
// it is read again by the final return of the macro
coef = (t/n)

// lengths ordered by decreasing frequency, ties broken by decreasing length
def slens = lens.sort { a, b -> b.value <=> a.value ?: b.key <=> a.key }

// flens repeats each length as many times as it was observed (input of the histogram)
def flens = []
slens.each { key, value -> value.times { flens << key } }
def nbins = flens.size()*2

// running total for the cumulative frequency column
def cfreq = 0
println "longueur moyenne des chaînes de référence : $t/$n = "+coef
println "index hiérarchique des longueurs de chaînes :\nlen\tfreq\tcfreq"
// rows are tab-separated like the header line above
slens.each { println "${it.key}\t${it.value}\t${cfreq += it.value}" }
144

    
145
// Same distribution, ordered by decreasing length, for the R vectors "len" and "freq".
def slens2 = slens.sort { a, b -> -a.key <=> -b.key }

def r = RWorkspace.getRWorkspaceInstance()

// Push the three vectors used by the R script into the R workspace.
int[] lenVector = slens2.keySet() as int[]
int[] freqVector = slens2.values() as int[]
int[] flenVector = flens as int[]
r.addVectorToWorkspace("len", lenVector)
r.addVectorToWorkspace("freq", freqVector)
r.addVectorToWorkspace("flen", flenVector)

def corpusName = corpus.getName()

// The graphic is written to a fresh temporary SVG file in the "results" directory of the TXM home.
def resultsDirectory = new File(Toolbox.getTxmHomePath(), "results")
def file = File.createTempFile("txm", ".svg", resultsDirectory)
println "SVG file: ${file.getAbsolutePath()}"
/// BEGINNING OF R SCRIPT
def script ="""
hist(flen, xaxt='n', col="gray", xlab="Length", breaks=$nbins, main="$corpusName Longueur des chaînes ($nbins bins)")
axis(side=1, at=len)
"""
/// END OF R SCRIPT

// execute R script
r.plot(file, script)

// display the SVG results graphic (handed to monitor.syncExec as a Runnable)
monitor.syncExec({
    OpenSVGGraph.OpenSVGFile(file.getAbsolutePath(), corpusName+" Longueur des chaînes")
} as Runnable)

// macro result: the mean length and the length -> frequency map
return [result: coef, data: lens]