root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / exploit / SchemaLengthsMacro.groovy @ 2167
History | View | Annotate | Download (6.8 kB)
1 | 671 | mdecorde | // Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
|
---|---|---|---|
2 | 671 | mdecorde | // Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
|
3 | 671 | mdecorde | // @author mdecorde
|
4 | 671 | mdecorde | // @author sheiden
|
5 | 671 | mdecorde | |
6 | 671 | mdecorde | // STANDARD DECLARATIONS
|
7 | 2082 | mdecorde | package org.txm.macro.urs.exploit
|
8 | 671 | mdecorde | |
9 | 671 | mdecorde | import org.kohsuke.args4j.* |
10 | 671 | mdecorde | import groovy.transform.Field |
11 | 671 | mdecorde | import org.txm.rcp.swt.widget.parameters.* |
12 | 1217 | mdecorde | import org.txm.annotation.urs.* |
13 | 1968 | mdecorde | import org.txm.macro.urs.AnalecUtils |
14 | 1968 | mdecorde | import visuAnalec.elements.* |
15 | 671 | mdecorde | import org.txm.searchengine.cqp.corpus.* |
16 | 671 | mdecorde | import org.txm.Toolbox |
17 | 671 | mdecorde | import org.txm.rcp.commands.* |
18 | 1968 | mdecorde | import org.txm.statsengine.r.core.RWorkspace |
19 | 671 | mdecorde | |
// Build the list of corpora to process from the Corpus view selection:
// a CQPCorpus is taken as is, a Partition contributes each of its parts,
// any other kind of selected object is ignored.
def selection = []
for (def selected : corpusViewSelections) {
	switch (selected) {
		case CQPCorpus:
			selection << selected
			break
		case Partition:
			selection.addAll(selected.getParts())
			break
	}
}

// Nothing usable was selected: tell the user and stop the macro.
if (selection.isEmpty()) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// Call compute(false) on every retained corpus before it is queried below.
for (def selectedCorpus : selection) {
	selectedCorpus.compute(false)
}
32 | 2105 | mdecorde | |
// MACRO PARAMETERS
// Each @Field/@Option pair declares one value requested through the parameters dialog.

// URSQL expression (TYPE@PROP=REGEX) selecting the schemas to measure
@Field @Option(name="schema_ursql", usage="TYPE@PROP=REGEX", widget="String", required=true, def="CHAINE")
String schema_ursql

// lower bound passed to the schema selection
@Field @Option(name="minimum_schema_size", usage="minimal schema size", widget="Integer", required=true, def="3")
int minimum_schema_size

// upper bound passed to the schema selection
@Field @Option(name="maximum_schema_size", usage="Maximum size needed to consider a schema", widget="Integer", required=true, def="9999999")
int maximum_schema_size

// name of the schema property printed next to each length
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
String schema_property_display

// URSQL expression (TYPE@PROP=REGEX) selecting the units counted inside each schema
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
String unit_ursql

// forwarded to the schema selection and to the unit inclusion filter
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
boolean strict_inclusion

// forwarded to the unit inclusion filter
@Field @Option(name="position_in_matches", usage="Unit distance to structure limit (0 = no selection, 1 = first after limit, -1 = last before limit, etc.)", widget="Integer", required=true, def="0")
int position_in_matches

// "Separator" widget entry; its value is not read by this script
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq len", required=false, def="")
def sep

// column used to order the printed index: "freq" or "len"
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq len", required=true, def="freq")
String sort_column

// maximum number of index lines printed; 0 disables the cut
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
int max_lines

// The chart option is currently disabled: the flag is forced to false.
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
output_graph = false

// verbosity label chosen in the dialog; converted to a number below
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
debug

// Open the parameters dialog; stop the macro when it does not return true.
if (!ParametersDialog.open(this)) return

// Translate the debug label into the numeric level handed to AnalecUtils.
// Any other value is left untouched.
if (debug == "OFF") {
	debug = 0
} else if (debug == "ON") {
	debug = 1
} else if (debug == "ALL") {
	debug = 2
} else if (debug == "REALLY ALL") {
	debug = 3
}
61 | 671 | mdecorde | |
// For each selected corpus: select the schemas matching schema_ursql, count
// the units of each schema that pass the unit_ursql and inclusion filters,
// then print the distribution of these counts (len, freq, cumulated freq)
// together with the schema_property_display values of the schemas.
// Any early "return" below ends the whole macro, not just the current corpus.
for (def corpus : selection) {
	def analecCorpus = URSCorpora.getCorpus(corpus)

	// Check both URSQL expressions against the annotation structure of this corpus.
	def errors = AnalecUtils.isPropertyDefined(Schema.class, analecCorpus, schema_ursql)
	if (errors.size() > 0) {
		println "** The $schema_ursql schema URSQL cannot be computed in the corpus with types: $errors."
		return
	}

	errors = AnalecUtils.isPropertyDefined(Unite.class, analecCorpus, unit_ursql)
	if (errors.size() > 0) {
		println "** $unit_ursql unit URSQL cannot be computed in the corpus with types: $errors."
		return
	}

	def schemas = AnalecUtils.selectSchemasInCorpus(debug, analecCorpus, corpus, schema_ursql, minimum_schema_size, maximum_schema_size, strict_inclusion)

	// lens:      number of selected units -> number of schemas having that many units
	// lensnames: number of selected units -> displayed property values of those schemas
	def lens = [:]
	def lensnames = [:]
	for (def schema : schemas) {
		def allUnites = schema.getUnitesSousjacentesNonTriees()
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)

		// selectedUnits is deliberately left undeclared, as in the original script,
		// so that it remains a script binding variable.
		selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, position_in_matches)
		int nUnites = selectedUnits.size()

		if (!lens.containsKey(nUnites)) {
			lens[nUnites] = 0
			lensnames[nUnites] = []
		}
		lens[nUnites] = lens[nUnites] + 1

		String value = schema.getProp(schema_property_display)
		//if (value.contains(",")) value = "\"$value\""
		lensnames[nUnites] << value
	}

	// t = total number of selected units, n = total number of schemas
	int t = 0
	int n = 0
	for (def f : lens.keySet()) {
		t += f * lens[f]
		n += lens[f]
	}

	if (n == 0) {
		println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
		return false
	}

	// average schema length; kept as a script binding variable
	coef = (t/n)

	// Order the index by decreasing frequency or by decreasing length;
	// the other column breaks ties.
	def slens = null
	if ("freq".equals(sort_column)) {
		slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
	} else {
		slens = lens.sort { a, b -> -a.key <=> -b.key ?: -a.value <=> -b.value }
	}

	// flens holds one entry per schema (its length); it feeds the histogram.
	def flens = []
	slens.each { key, value -> value.times { flens << key } }
	def nbins = flens.size()*2

	def cfreq = 0
	println "Schema lengths in '$corpus'. The average schema length is $t/$n = "+coef
	println "Index:\nlen\tfreq\tcfreq"
	int nShown = 0
	for (def entry : slens) {
		def names = lensnames[entry.key]
		names = names.sort()
		// columns are tab-separated, like the header printed above
		println entry.key+"\t"+entry.value+"\t"+(cfreq+=entry.value)+"\t\""+names.join("\", \"")+"\""
		nShown++
		if (max_lines > 0 && nShown >= max_lines) {
			// only announce hidden rows when some rows are really hidden
			int hidden = slens.size() - nShown
			if (hidden > 0) {
				println "... ($hidden)"
			}
			break
		}
	}

	if (output_graph) {
		// the histogram axis uses the lengths in decreasing order
		def slens2 = slens.sort { a, b -> -a.key <=> -b.key }
		def r = RWorkspace.getRWorkspaceInstance()

		r.addVectorToWorkspace("len", slens2.keySet() as int[])
		r.addVectorToWorkspace("freq", slens2.values() as int[])
		r.addVectorToWorkspace("flen", flens as int[])

		def corpusName = corpus.getName()
		println "corpusName=$corpusName"
		def PNGFile = File.createTempFile("txm_"+corpus.getID()+"_", ".png", new File(Toolbox.getTxmHomePath(), "results"))
		def PNGFilePath = PNGFile.getAbsolutePath()
		println "PNG file: "+PNGFilePath

		def SVGFile = File.createTempFile("txm_"+corpus.getID()+"_", ".svg", new File(Toolbox.getTxmHomePath(), "results"))
		def SVGFilePath = SVGFile.getAbsolutePath()
		println "SVG file: "+SVGFilePath

		/// BEGINNING OF R SCRIPT
		def script ="""
hist(flen, xaxt='n', col="gray", xlab="Length", breaks=$nbins, main="$corpusName Longueur des chaînes ($nbins bins)")
axis(side=1, at=len)
dev.off()
"""
		/// END OF R SCRIPT

		// execute the R script once per output format
		r.plot(PNGFile, "png(file = \"${PNGFilePath}\"); "+script)
		r.plot(SVGFile, "svg(file = \"${SVGFilePath}\"); "+script)

		// display the SVG result
		monitor.syncExec(new Runnable() {
			@Override
			public void run() { OpenBrowser.openfile(SVGFile.getAbsolutePath(), corpusName+" Longueur des chaînes") }
		})
	}
	//return ["result":coef, "data":lens]
}
188 | 671 | mdecorde |