Statistics
| Revision:

root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / SchemaLengthsMacro.groovy @ 2167

History | View | Annotate | Download (6.8 kB)

1 671 mdecorde
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2 671 mdecorde
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3 671 mdecorde
// @author mdecorde
4 671 mdecorde
// @author sheiden
5 671 mdecorde
6 671 mdecorde
// STANDARD DECLARATIONS
7 2082 mdecorde
package org.txm.macro.urs.exploit
8 671 mdecorde
9 671 mdecorde
import org.kohsuke.args4j.*
10 671 mdecorde
import groovy.transform.Field
11 671 mdecorde
import org.txm.rcp.swt.widget.parameters.*
12 1217 mdecorde
import org.txm.annotation.urs.*
13 1968 mdecorde
import org.txm.macro.urs.AnalecUtils
14 1968 mdecorde
import visuAnalec.elements.*
15 671 mdecorde
import org.txm.searchengine.cqp.corpus.*
16 671 mdecorde
import org.txm.Toolbox
17 671 mdecorde
import org.txm.rcp.commands.*
18 1968 mdecorde
import org.txm.statsengine.r.core.RWorkspace
19 671 mdecorde
20 2105 mdecorde
// Build the list of corpora to process from the Corpus view selection:
// a selected CQPCorpus is taken as is, a selected Partition contributes all
// of its parts, and any other kind of selected object is ignored.
def selection = []
for (def selected in corpusViewSelections) {
        if (selected instanceof CQPCorpus) {
                selection.add(selected)
        } else if (selected instanceof Partition) {
                selection.addAll(selected.getParts())
        }
}

// Nothing usable was selected: report it and stop the macro.
if (selection.isEmpty()) {
        println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
        return false
}

// compute(false) is invoked on every retained corpus before any analysis starts.
for (def retained in selection) {
        retained.compute(false)
}
32 2105 mdecorde
33 1962 mdecorde
// MACRO PARAMETERS
// Every @Field/@Option declaration below is a script-level parameter; the values are
// expected to be set by the ParametersDialog that this script opens after the declarations.
// NOTE(review): the multi-choice metaVar strings below separate their choices with runs of
// spaces in this copy of the file; the separator was presumably a tab upstream - confirm.

// URSQL query selecting the schemas to measure.
@Field
@Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
String schema_ursql

// Lower size bound handed to the schema selection.
@Field
@Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
int minimum_schema_size

// Upper size bound handed to the schema selection.
@Field
@Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
int maximum_schema_size

// Schema property whose value is listed next to each length in the report.
@Field
@Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
String schema_property_display

// URSQL query filtering the units counted inside each schema.
@Field
@Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
String unit_ursql

// Passed to both the schema selection and the unit inclusion filter.
@Field
@Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
boolean strict_inclusion

// Passed to the unit inclusion filter; see the usage text for the meaning of the values.
@Field
@Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
int position_in_matches

// Separator widget; its value is not read by the script.
@Field
@Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq        len", required=false, def="")
def sep

// Report ordering: "freq" sorts by frequency first, any other value sorts by length first.
@Field
@Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq        len", required=true, def="freq")
String sort_column

// Maximum number of report rows printed; 0 disables the limit.
@Field
@Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
int max_lines

// The chart option is currently disabled: its annotation is commented out and the flag is
// forced to false, so the R chart code further down is not executed.
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
output_graph = false

// Verbosity choice; it is converted to a number (0..3) once the dialog has been validated.
@Field
@Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=true, def="OFF")
debug
58 671 mdecorde
59 671 mdecorde
// Open the parameters dialog; leave the macro when it returns a false value.
if (!ParametersDialog.open(this)) {
        return;
}

// Replace the textual debug choice by its numeric level; any other value is left untouched.
if (debug == "OFF") {
        debug = 0
} else if (debug == "ON") {
        debug = 1
} else if (debug == "ALL") {
        debug = 2
} else if (debug == "REALLY ALL") {
        debug = 3
}
61 671 mdecorde
62 2105 mdecorde
// Main report, run once per selected corpus (or partition part).
// For each corpus: select the schemas matching schema_ursql, count the units kept in each
// schema, then print a "length -> number of schemas" index together with the
// schema_property_display value of every schema having that length.
// Note that the `return` statements below leave the whole macro, not just the current
// iteration, so the remaining corpora of the selection are not processed.
for (def corpus : selection) {
        def analecCorpus = URSCorpora.getCorpus(corpus)

        // Both URSQL queries are checked first; a non-empty error list stops the macro.
        def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
        if (errors.size() > 0) {
                println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
                return;
        }

        errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
        if (errors.size() > 0) {
                println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
                return;
        }

        def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion);

        // lens:      number of kept units -> number of schemas having that many units
        // lensnames: number of kept units -> schema_property_display value of each such schema
        def lens = [:]
        def lensnames = [:]
        for (def schema : schemas) {
                def allUnites = schema.getUnitesSousjacentesNonTriees()
                def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)

                // Assigned without a declaration, as before, so the value stays a script-binding variable.
                selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)

                int nUnites = selectedUnits.size();
                if (!lens.containsKey(nUnites)) {
                        lens[nUnites] = 0;
                        lensnames[nUnites] = [];
                }
                lens[nUnites] = lens[nUnites] + 1;

                String value = schema.getProp(schema_property_display);
                lensnames[nUnites] << value
        }

        // t: total number of kept units over all schemas; n: number of schemas counted.
        int t = 0;
        int n = 0;
        for (def entry : lens) {
                t += entry.key * entry.value
                n += entry.value
        }

        // n is 0 only when no schema at all was counted; the average would divide by zero.
        if (n == 0) {
                println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
                return false;
        }

        // Average schema length; assigned without a declaration, so it is a script-binding variable.
        coef = (t/n)

        // Descending order on the chosen column, ties broken by the other column (also descending).
        // Map.sort(Closure) returns a new sorted map and leaves `lens` untouched.
        def slens = null
        if ("freq".equals(sort_column)) {
                slens = lens.sort { a, b -> b.value <=> a.value ?: b.key <=> a.key }
        } else {
                slens = lens.sort { a, b -> b.key <=> a.key ?: b.value <=> a.value }
        }

        println "Schema lengths in '$corpus'. The average schema length is $t/$n = "+coef
        println "Index:\nlen\tfreq\tcfreq"

        // One tab-separated row per length: length, frequency, cumulative frequency and the
        // sorted, quoted list of displayed schema property values.
        def cfreq = 0
        int nShown = 0;
        for (def row : slens) {
                def names = lensnames[row.key].sort()
                cfreq += row.value
                println row.key+"\t"+row.value+"\t"+cfreq+"\t\""+names.join("\", \"")+"\""
                nShown++
                if (max_lines > 0 && nShown >= max_lines) {
                        // Only announce a cut when some rows are really left out.
                        int hidden = slens.size() - nShown
                        if (hidden > 0) {
                                println "... ($hidden)"
                        }
                        break;
                }
        }

        // Optional histogram drawn with R; output_graph is currently forced to false in the
        // parameter section, so this branch does not run.
        if (output_graph) {
                // flens repeats each length once per schema having it: the raw sample given to hist().
                def flens = []
                slens.each { key, value -> value.times { flens << key } }
                def nbins = flens.size()*2

                // Same data ordered by decreasing length, used for the axis ticks and frequencies.
                def slens2 = slens.sort { a, b -> b.key <=> a.key }

                def r = RWorkspace.getRWorkspaceInstance()

                r.addVectorToWorkspace("len", slens2.keySet() as int[])
                r.addVectorToWorkspace("freq", slens2.values() as int[])
                r.addVectorToWorkspace("flen", flens as int[])

                def corpusName = corpus.getName()
                println "corpusName=$corpusName"
                def PNGFile = File.createTempFile("txm_"+corpus.getID()+"_", ".png", new File(Toolbox.getTxmHomePath(), "results"))
                def PNGFilePath = PNGFile.getAbsolutePath()
                println "PNG file: "+PNGFilePath

                def SVGFile = File.createTempFile("txm_"+corpus.getID()+"_", ".svg", new File(Toolbox.getTxmHomePath(), "results"))
                def SVGFilePath = SVGFile.getAbsolutePath()
                println "SVG file: "+SVGFilePath

                /// BEGINNING OF R SCRIPT
                def script ="""
hist(flen, xaxt='n', col="gray", xlab="Length", breaks=$nbins, main="$corpusName Longueur des chaînes ($nbins bins)")
axis(side=1, at=len)
dev.off()
"""
                /// END OF R SCRIPT

                // Run the same R script twice, once per output device.
                r.plot(PNGFile, "png(file = \"${PNGFilePath}\"); "+script)
                r.plot(SVGFile, "svg(file = \"${SVGFilePath}\"); "+script)

                // Display the SVG chart through monitor.syncExec.
                monitor.syncExec(new Runnable() {
                                        @Override
                                        public void run() { OpenBrowser.openfile(SVGFile.getAbsolutePath(), corpusName+" Longueur des chaînes") }
                                })
        }
}
188 671 mdecorde