Revision 2143 tmp/org.txm.analec.rcp/src/org/txm/macro/urs/exploit/SchemaLengthsMacro.groovy

SchemaLengthsMacro.groovy (revision 2143)
39 39
		int maximum_schema_size
40 40
@Field @Option(name="schema_property_display", usage="schema property to show", widget="String", required=true, def="REF")
41 41
		String schema_property_display
42

  
43 42
@Field @Option(name="unit_ursql", usage="TYPE@PROP=REGEX", widget="String", required=false, def="MENTION")
44 43
		String unit_ursql
44
@Field @Option(name="strict_inclusion", usage="Units must be strictly included into corpus matches", widget="Boolean", required=true, def="true")
45
		boolean strict_inclusion
46
@Field @Option(name="sep", usage="choose between the len or freq columns", widget="Separator", metaVar="freq	len", required=true, def="")
47
		def sep
48
@Field @Option(name="sort_column", usage="choose between the len or freq columns", widget="StringArray", metaVar="freq	len", required=true, def="freq")
49
		String sort_column
50
@Field @Option(name="max_lines", usage="cut the number of lines shown. '0' means no cut", widget="Integer", required=false, def="0")
51
		int max_lines
45 52
//@Field @Option(name="output_graph", usage="Show chart", widget="Boolean", required=true, def="false")
46 53
		output_graph = false
47 54
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
......
74 81
	for (def schema : schemas) {
75 82

  
76 83
		def allUnites = schema.getUnitesSousjacentesNonTriees()
77

  
84
		
78 85
		def units = AnalecUtils.filterElements(debug, allUnites, unit_ursql)
86
		
87
		selectedUnits = AnalecUtils.filterUniteByInclusion(debug, units, corpus.getMatches(), strict_inclusion, 0)
88
		
89
		int nUnites = selectedUnits.size();
79 90

  
80
		int nUnites = units.size();
81

  
82 91
		if (!lens.containsKey(nUnites)) {
83 92
			lens[nUnites] = 0;
84 93
			lensnames[nUnites] = [];
85 94
		}
86 95

  
87 96
		lens[nUnites] = lens[nUnites] + 1;
88
		lensnames[nUnites] << schema.getProp(schema_property_display)
97
		String value = schema.getProp(schema_property_display);
98
		if (value.contains(",")) value = "\"$value\""
99
		lensnames[nUnites] << value
89 100
		nSchemas++;
90 101
	}
91 102

  
......
100 111
		n += lens[f]
101 112
	}
102 113

  
114
	if (n == 0) {
115
		println "No units selected for schemas=$schema_ursql and units=$unit_ursql"
116
		return false;
117
	}
118
	
103 119
	coef = (t/n)
104
	def slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
120
	
121
	def slens = null
122
	if ("freq".equals(sort_column)) {
123
		slens = lens.sort { a, b -> -a.value <=> -b.value ?: -a.key <=> -b.key }
124
	} else {
125
		slens = lens.sort { a, b -> -a.key <=> -b.key ?: -a.value <=> -b.value }
126
	}
127
	
105 128
	def flens = []
106 129
	slens.each { key, value -> value.times { flens << key } }
107 130
	def nbins = flens.size()*2
108 131

  
109 132
	def cfreq = 0
110
	println "Longueur moyenne des chaînes de référence : $t/$n = "+coef
111
	println "Index hiérarchique des longueurs de chaînes :\nlen\tfreq\tcfreq"
112
	slens.each { println it.key+"	"+it.value+"	"+(cfreq+=it.value)+"	"+lensnames[it.key] }
133
	println "Schema lengths in '$corpus'. The average schema length is $t/$n = "+coef
134
	println "Index:\nlen\tfreq\tcfreq"
135
	int nShown = 0;
136
	for( def it : slens) {
137
		println it.key+"	"+it.value+"	"+(cfreq+=it.value)+"	"+lensnames[it.key].join(", ")
138
		nShown++
139
		if (max_lines > 0 && nShown >= max_lines) {
140
			println "... (${slens.size() - max_lines})"
141
			break;
142
		}
143
	}
113 144

  
114 145
	def slens2 = slens.sort { a, b -> -a.key <=> -b.key }
115 146
	if (output_graph) {

Also available in: Unified diff