root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / macro / tiger / exploit / TIGERIndexMacro.groovy @ 2179
History | View | Annotate | Download (6.1 kB)
1 |
// Copyright © 2019 ENS de Lyon, CNRS, University of Franche-Comté
|
---|---|
2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
|
3 |
// @author mdecorde
|
4 |
// STANDARD DECLARATIONS
|
5 |
package org.txm.macro.tiger.exploit
|
6 |
|
7 |
import groovy.transform.Field |
8 |
|
9 |
import java.util.List |
10 |
|
11 |
import org.txm.searchengine.core.EmptySelection |
12 |
import org.txm.searchengine.core.SearchEnginesManager |
13 |
import org.txm.searchengine.cqp.AbstractCqiClient |
14 |
import org.txm.searchengine.cqp.CQPSearchEngine |
15 |
import org.txm.searchengine.cqp.corpus.* |
16 |
import org.txm.searchengine.ts.TIGERSearchEngine |
17 |
import org.txm.searchengine.ts.TSCorpus |
18 |
import org.txm.searchengine.ts.TSResult |
19 |
import org.txm.utils.ConsoleProgressBar |
20 |
import org.txm.utils.logger.Log |
21 |
|
22 |
import ims.tiger.corpus.Sentence |
23 |
import ims.tiger.query.api.*; |
24 |
|
25 |
// Simple class name of this script, used as a prefix in console messages.
def scriptName = this.class.getSimpleName()

// Flatten the Corpus view selection into a list of corpora:
// a corpus is kept as is, a partition is replaced by its parts,
// any other kind of selected object is ignored.
def selection = []
for (def selected : corpusViewSelections) {
    if (selected instanceof CQPCorpus) {
        selection.add(selected)
    } else if (selected instanceof Partition) {
        selection.addAll(selected.getParts())
    }
}

// Nothing usable selected: tell the user and stop the macro.
if (selection.isEmpty()) {
    println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
    return false
}

// Call compute(false) on every selected corpus before going further.
for (def toCompute : selection) {
    toCompute.compute(false)
}
39 |
|
40 |
// MACRO PARAMETERS
// Each @Field/@Option pair below declares one parameter of the macro; the values are
// requested from the user by ParametersDialog.open(this) further down.
// NOTE(review): the choices of the StringArray widgets are listed in metaVar; since
// "REALLY ALL" contains a space, confirm which separator the widget expects.

// the TIGERSearch query to run on every selected corpus
@Field @Option(name="tiger_query", usage="A Full TIGERSearch query", widget="Text", required=true, def="[]")
String tiger_query
// names of the query variables (written #label: in the query) to read, comma-separated
@Field @Option(name="labels", usage="List of TIGER labels separated with ','", widget="String", required=true, def="A,B,C")
def labels
// one node feature name per label, comma-separated
@Field @Option(name="properties", usage="List of properties separated with ','", widget="String", required=true, def="word, word, word")
def properties
// when false, only the first submatch of each matching sentence is counted
@Field @Option(name="count_subgraph", usage="Count every submatch of a sentence (otherwise only the first one)", widget="Boolean", required=true, def="true")
def count_subgraph
// "freq" sorts the lines by decreasing total frequency, any other value sorts them by label
@Field @Option(name="sort_column", usage="choose between the freq or labels columns", widget="StringArray", metaVar="freq labels", required=true, def="freq")
String sort_column
// printing stops after max_lines lines; a value of 0 or less prints every line
@Field @Option(name="max_lines", usage="Maximum number of lines to print (0 or less: print all lines)", widget="Integer", metaVar="freq labels", required=true, def="-1")
int max_lines
// verbosity level name, converted to a number right after the dialog
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
def debug

// Stop silently if the user cancels the parameters dialog.
if (!ParametersDialog.open(this)) return

// Translate the debug level name into a number. A value that is neither a known name
// nor already a number falls back to 0, so that the numeric comparisons done later
// (debug > 0) cannot fail on a String.
def debugLevels = ["OFF": 0, "ON": 1, "ALL": 2, "REALLY ALL": 3]
if (debugLevels.containsKey(debug)) {
    debug = debugLevels[debug]
} else if (!(debug instanceof Number)) {
    debug = 0
}
56 |
|
57 |
TIGERSearchEngine tse = SearchEnginesManager.getTIGERSearchEngine()

// PARAMETERS VALIDATION
// 'labels' and 'properties' arrive as comma-separated strings and are replaced here by
// lists of trimmed tokens. Trimming matters: the default value of 'properties' is
// "word, word, word", which would otherwise produce the feature names " word".
if (labels == null || labels.toString().trim().length() == 0) {
    println "Error: no label given. Aborting"
    return false;
}
labels = labels.toString().split(",").collect { it.trim() }

if (properties == null || properties.toString().trim().length() == 0) {
    println "Error: no property given. Aborting"
    return false;
}
properties = properties.toString().split(",").collect { it.trim() }

// Exactly one property is read per label.
if (properties.size() != labels.size()) {
    println "Error: the number of labels and properties does not match. labels=$labels properties=$properties (${labels.size()} != ${properties.size()})"
    return false;
}

// Every label must be declared in the query as "#label:".
def missing = labels.findAll { !tiger_query.contains("#${it}:") }
if (missing.size() > 0) {
    println "Error: $missing label(s) not found in query: $tiger_query"
    return false
}
87 |
|
88 |
// QUERY AND COUNT
// results: corpus -> (List<String> of feature values -> number of occurrences in that corpus)
def results = new LinkedHashMap()
// total_counts: List<String> of feature values -> number of occurrences over all corpora
def total_counts = [:] // List<String> -> Integer

ConsoleProgressBar cpb = new ConsoleProgressBar(selection.size())
for (def corpus : selection) {

    def counts = [:] // List<String> -> Integer
    // Register the counts right away: a corpus skipped below keeps an empty entry in
    // 'results' instead of no entry at all.
    results[corpus] = counts

    if (!tse.hasIndexes(corpus)) {
        println "Warning: skipping $corpus: no TIGERSearch indexes found."
        cpb.tick()
        continue;
    }

    def tcorpus = tse.getTSCorpus(corpus);
    def index = tcorpus.getIndex()
    def sentences_min_max = tse.getSentMinMax(corpus);
    // NOTE(review): 9999999 is presumably a "no practical limit" bound on the matches - confirm against processQuery
    MatchResult mresult = tcorpus.manager.processQuery(tiger_query, sentences_min_max[0], sentences_min_max[1], 9999999);

    // Position of each requested label among the variables of the result.
    List<String> variables = java.util.Arrays.asList(mresult.getVariableNames());
    def iVariables = labels.collect { variables.indexOf(it) }

    def matches = mresult.matches
    int size = matches.keySet().size();
    if (debug > 0) println "$size sentences matched."

    // A label missing from the result variables has index -1, and a negative index
    // would silently read the last position of a submatch: skip the corpus instead.
    if (size > 0 && iVariables.contains(-1)) {
        println "Warning: skipping $corpus: some of the labels $labels are not in the result variables $variables."
        cpb.tick()
        continue;
    }

    for (int sent : matches.keySet()) { // the matching sentences
        if (!mresult.isMatchingSentence(sent)) continue;

        int sent_submatch_size = mresult.getSentenceSubmatchSize(sent);
        def sentence = index.getSentence(sent)
        for (int j = 0 ; j < sent_submatch_size ; j++) {
            def positions = mresult.getSentenceSubmatchAt(sent, j)

            // Build the key of this submatch: one feature value per label, "NA" when the
            // node has no value for the requested feature.
            def strings = []
            for (int t = 0 ; t < iVariables.size() ; t++) {
                def node = sentence.getNode(positions[iVariables[t]])
                String value = node.getFeature(properties[t])
                strings << (value == null ? "NA" : value)
            }

            counts[strings] = (counts[strings] ?: 0) + 1
            total_counts[strings] = (total_counts[strings] ?: 0) + 1

            // Without count_subgraph only the first submatch of a sentence is counted.
            if (!count_subgraph) break;
        }
    }

    cpb.tick()
}
cpb.done()
172 |
|
173 |
// OUTPUT
// Order the lines: by decreasing total frequency when sort_column is "freq", otherwise
// by the label printed for the line. The keys are Lists, which are not Comparable, so
// they are sorted through their joined text.
def keys = []
keys.addAll(total_counts.keySet())
if ("freq".equals(sort_column)) {
    keys = keys.sort() {-total_counts[it]}
} else {
    keys = keys.sort() {it.join('_')}
}

// Header line, tab-separated: properties, total frequency, then the selected corpora.
println properties.join(", ")+"\t"+"F\t"+selection.join("\t")

int nline = 0;
for (def k : keys) {
    // The label and the total frequency are separated with a tab, like the other columns.
    print "${k.join('_')}\t${total_counts[k]}"
    if (selection.size() > 1) {
        // One column per corpus; a corpus without counts for this key shows 0.
        for (def corpus : selection) {
            def corpusCounts = results[corpus]
            def v = (corpusCounts == null) ? null : corpusCounts[k];
            if (v == null) v = 0;
            print "\t"+v
        }
    }
    println ""
    nline++
    if (max_lines > 0 && nline >= max_lines) {
        // Report how many lines were cut, only if some were.
        int remaining = keys.size() - nline
        if (remaining > 0) println "... (${remaining})"
        break;
    }
}