39 |
39 |
int maximum_schema_size
|
40 |
40 |
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
|
41 |
41 |
String schema_property_display
|
42 |
|
|
43 |
42 |
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
|
44 |
43 |
String unit_ursql
|
|
44 |
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
|
|
45 |
boolean strict_inclusion
|
|
46 |
// NOTE(review): the usage and metaVar texts below are identical to those of the "sort_column" option — looks copy-pasted; confirm the text intended for this "Separator" widget.
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq len", required=true, def="")
|
|
47 |
def sep
|
|
48 |
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq len", required=true, def="freq")
|
|
49 |
String sort_column
|
|
50 |
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
|
|
51 |
int max_lines
|
45 |
52 |
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
|
46 |
53 |
output_graph = false
|
47 |
54 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
|
... | ... | |
74 |
81 |
for (def schema : schemas) {
|
75 |
82 |
|
76 |
83 |
def allUnites = schema.getUnitesSousjacentesNonTriees()
|
77 |
|
|
|
84 |
|
78 |
85 |
def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)
|
|
86 |
|
|
87 |
selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, 0)
|
|
88 |
|
|
89 |
int nUnites = selectedUnits.size();
|
79 |
90 |
|
80 |
|
int nUnites = units.size();
|
81 |
|
|
82 |
91 |
if (!lens.containsKey(nUnites)) {
|
83 |
92 |
lens[nUnites] = 0;
|
84 |
93 |
lensnames[nUnites] = [];
|
85 |
94 |
}
|
86 |
95 |
|
87 |
96 |
lens[nUnites] = lens[nUnites] + 1;
|
88 |
|
lensnames[nUnites] << schema.getProp(schema_property_display)
|
|
97 |
String value = schema.getProp(schema_property_display);
|
|
98 |
// Wrap display values that contain a comma in double quotes so they remain distinguishable once the names are joined with ", ".
// The null guard avoids a NullPointerException if getProp() yields no value for this schema (presumably when the property is absent — TODO confirm).
if (value != null && value.contains(",")) value = "\"$value\""
|
|
99 |
lensnames[nUnites] << value
|
89 |
100 |
nSchemas++;
|
90 |
101 |
}
|
91 |
102 |
|
... | ... | |
100 |
111 |
n += lens[f]
|
101 |
112 |
}
|
102 |
113 |
|
|
114 |
if (n == 0) {
|
|
115 |
println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
|
|
116 |
return false;
|
|
117 |
}
|
|
118 |
|
103 |
119 |
coef = (t/n)
|
104 |
|
def slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
|
|
120 |
|
|
121 |
def slens = null
|
|
122 |
if ("freq".equals(sort_column)) {
|
|
123 |
slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
|
|
124 |
} else {
|
|
125 |
slens = lens.sort { a, b -> -a.key <=> -b.key ?: -a.value <=> -b.value }
|
|
126 |
}
|
|
127 |
|
105 |
128 |
def flens = []
|
106 |
129 |
slens.each { key, value -> value.times { flens << key } }
|
107 |
130 |
def nbins = flens.size()*2
|
108 |
131 |
|
109 |
132 |
def cfreq = 0
|
110 |
|
println "Longueur moyenne des chaînes de référence : $t/$n = "+coef
|
111 |
|
println "Index hiérarchique des longueurs de chaînes :\nlen\tfreq\tcfreq"
|
112 |
|
slens.each { println it.key+" "+it.value+" "+(cfreq+=it.value)+" "+lensnames[it.key] }
|
|
133 |
println "Schema lengths in '$corpus'. The average schema length is $t/$n = "+coef
|
|
134 |
println "Index:\nlen\tfreq\tcfreq"
|
|
135 |
int nShown = 0;
|
|
136 |
for( def it : slens) {
|
|
137 |
println it.key+" "+it.value+" "+(cfreq+=it.value)+" "+lensnames[it.key].join(", ")
|
|
138 |
nShown++
|
|
139 |
if (max_lines > 0 && nShown >= max_lines) {
|
|
140 |
// Report how many index entries were cut off; print nothing when the limit falls exactly on the last entry (previously printed "... (0)").
if (slens.size() > nShown) println "... (${slens.size() - nShown})"
|
|
141 |
break;
|
|
142 |
}
|
|
143 |
}
|
113 |
144 |
|
114 |
145 |
def slens2 = slens.sort { a, b -> -a.key <=> -b.key }
|
115 |
146 |
if (output_graph) {
|