Statistics
| Revision:

root / tmp / org.txm.tigersearch.rcp / groovy / org / txm / macro / tiger / exploit / TIGERIndexMacro.groovy @ 2179

History | View | Annotate | Download (6.1 kB)

1
// Copyright © 2019 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author mdecorde
4
// STANDARD DECLARATIONS
5
package org.txm.macro.tiger.exploit
6

    
7
import groovy.transform.Field
8

    
9
import java.util.List
10

    
11
import org.txm.searchengine.core.EmptySelection
12
import org.txm.searchengine.core.SearchEnginesManager
13
import org.txm.searchengine.cqp.AbstractCqiClient
14
import org.txm.searchengine.cqp.CQPSearchEngine
15
import org.txm.searchengine.cqp.corpus.*
16
import org.txm.searchengine.ts.TIGERSearchEngine
17
import org.txm.searchengine.ts.TSCorpus
18
import org.txm.searchengine.ts.TSResult
19
import org.txm.utils.ConsoleProgressBar
20
import org.txm.utils.logger.Log
21

    
22
import ims.tiger.corpus.Sentence
23
import ims.tiger.query.api.*;
24

    
25
// Simple class name of this script, used as a prefix in the user-facing message below.
def scriptName = this.class.getSimpleName()

// Build the list of corpora to query from the Corpus view selection:
// a corpus is taken as is, a partition contributes each of its parts,
// and any other kind of selected object is ignored.
def selection = []
for (def selected : corpusViewSelections) {
	if (selected instanceof CQPCorpus) {
		selection.add(selected)
		continue
	}
	if (selected instanceof Partition) {
		selection.addAll(selected.getParts())
	}
}

// Nothing usable selected: tell the user and stop the macro.
if (selection.isEmpty()) {
	println "** $scriptName: please select a Corpus or a Partition in the Corpus view: "+corpusViewSelections
	return false
}

// compute(false) is called on every retained corpus before it is queried.
selection.each { it.compute(false) }
39

    
40
// MACRO PARAMETERS
// Each @Field/@Option pair below declares one parameter; the values are requested
// from the user through ParametersDialog.open(this) at the end of this section.

// The TIGERSearch query evaluated on each selected corpus.
@Field @Option(name="tiger_query", usage="A Full TIGERSearch query", widget="Text", required=true, def="[]")
		String tiger_query
// Comma-separated query variables to report, written without the leading '#'
// (each one must be declared as '#label:' in the query).
@Field @Option(name="labels", usage="List of TIGER labels separated with ','", widget="String", required=true, def="A,B,C")
		def labels
// Comma-separated node features, one per label and in the same order.
// The default has no space after the commas: the list is split on ',' only,
// so " word" would otherwise be looked up as a feature name.
@Field @Option(name="properties", usage="List of properties separated with ','", widget="String", required=true, def="word,word,word")
		def properties
// When false, only the first submatch of each matching sentence is counted.
@Field @Option(name="count_subgraph", usage="Count every subgraph match of a sentence (otherwise only the first one)", widget="Boolean", required=true, def="true")
		def count_subgraph
// Column used to order the result lines.
// NOTE(review): the values listed in metaVar are presumably tab-separated in the
// repository file — confirm the separators were not converted to spaces.
@Field @Option(name="sort_column", usage="choose between the freq or labels columns", widget="StringArray", metaVar="freq        labels", required=true, def="freq")
		String sort_column
// Maximum number of result lines printed; zero or a negative value prints every line.
// NOTE(review): metaVar is kept unchanged; it looks copied from sort_column and is
// presumably ignored by the Integer widget — confirm before removing it.
@Field @Option(name="max_lines", usage="Maximum number of lines to display (no limit if not positive)", widget="Integer", metaVar="freq        labels", required=true, def="-1")
		int max_lines
// Verbosity of the diagnostic output; converted to a numeric level below.
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF        ON        ALL        REALLY ALL", required=true, def="OFF")
		def debug

// Stop here when the parameters dialog does not succeed (open returns false).
if (!ParametersDialog.open(this)) return

// Convert the debug choice to a numeric level. Any unrecognized value falls back to 0 (OFF)
// so that later numeric comparisons on debug never receive a String.
def debugLevels = ["OFF": 0, "ON": 1, "ALL": 2, "REALLY ALL": 3]
debug = debugLevels[debug] ?: 0
56

    
57
// Engine used to access the TIGERSearch indexes of the selected corpora.
TIGERSearchEngine tse = SearchEnginesManager.getTIGERSearchEngine()

// PARAMETER VALIDATION
// Both lists are split on ',' and every entry is trimmed, so "A, B" and "A,B" are
// equivalent; blank entries are dropped. The macro stops (returns false) on the
// first inconsistency found.

labels = (labels ?: "").split(",").collect { it.trim() }.findAll { it.length() > 0 }
if (labels.isEmpty()) {
	println "Error: no label given. Aborting"
	return false
}

properties = (properties ?: "").split(",").collect { it.trim() }.findAll { it.length() > 0 }
if (properties.isEmpty()) {
	println "Error: no property given. Aborting"
	return false
}

// Labels and properties are paired by position, so the two lists must have the same length.
if (properties.size() != labels.size()) {
	println "Error: the number of labels and properties does not match. labels=$labels properties=$properties (${labels.size()} != ${properties.size()})"
	return false
}

// Every label must be declared in the query as '#label:'.
def missing = labels.findAll { !tiger_query.contains("#" + it + ":") }
if (!missing.isEmpty()) {
	println "Error: $missing label(s) not found in query: $tiger_query"
	return false
}
87

    
88
// QUERY EACH CORPUS AND COUNT THE VALUE COMBINATIONS
// results:      corpus -> (list of values -> frequency), in selection order
// total_counts: list of values -> frequency summed over all the queried corpora
def results = new LinkedHashMap()
def total_counts = [:] // List<String> -> Integer

ConsoleProgressBar cpb = new ConsoleProgressBar(selection.size())
for (def corpus : selection) {

	def counts = [:] // List<String> -> Integer
	// Registered up front so that a skipped corpus still has an (empty) entry in results.
	results[corpus] = counts

	if (!tse.hasIndexes(corpus)) {
		println "Warning: skipping $corpus: no TIGERSearch indexes found."
		cpb.tick() // a skipped corpus still counts as one step of the progress bar
		continue
	}

	def tcorpus = tse.getTSCorpus(corpus)
	def index = tcorpus.getIndex()
	def sentences_min_max = tse.getSentMinMax(corpus)
	// NOTE(review): the last argument is presumably the maximum number of matches — confirm against the TIGERSearch API
	MatchResult mresult = tcorpus.manager.processQuery(tiger_query, sentences_min_max[0], sentences_min_max[1], 9999999)

	// Index of each requested label among the variable names of the result (-1 when absent).
	List<String> variables = java.util.Arrays.asList(mresult.getVariableNames())
	def iVariables = []
	def unbound = []
	for (def l : labels) {
		int idx = variables.indexOf(l)
		iVariables << idx
		if (idx < 0) unbound << l
	}

	def matches = mresult.matches
	int size = matches.keySet().size()
	if (debug > 0) println "$size sentences matched."
	if (size > 0 && !unbound.isEmpty()) {
		println "Warning: $unbound label(s) not found in the result variables $variables of $corpus: 'NA' is reported for them."
	}

	for (int sent : matches.keySet()) { // the matching sentences
		if (!mresult.isMatchingSentence(sent)) continue

		int sent_submatch_size = mresult.getSentenceSubmatchSize(sent)
		def sentence = index.getSentence(sent)
		for (int j = 0 ; j < sent_submatch_size ; j++) {
			def positions = mresult.getSentenceSubmatchAt(sent, j)

			// One value per label: the feature properties[t] of the node bound to the t-th label.
			def strings = []
			for (int t = 0 ; t < iVariables.size() ; t++) {
				int iV = iVariables[t]
				// A negative index must not reach positions[...]: Groovy would silently
				// read from the end of the array instead of failing.
				def node = (iV >= 0) ? sentence.getNode(positions[iV]) : null
				String value = node?.getFeature(properties[t])
				strings << (value != null ? value : "NA")
			}

			counts[strings] = (counts[strings] ?: 0) + 1
			total_counts[strings] = (total_counts[strings] ?: 0) + 1

			// Without subgraph counting, only the first submatch of the sentence is counted.
			if (!count_subgraph) break
		}
	}

	cpb.tick()
}
cpb.done()
172

    
173
// PRINT THE INDEX
// One tab-separated line per combination of values: the values joined with '_',
// the total frequency and, when several corpora were queried, one frequency per corpus.

// Order the lines by decreasing total frequency, or by their displayed label otherwise.
// The label sort uses the joined string because lists have no natural ordering.
def keys = []
keys.addAll(total_counts.keySet())
if ("freq".equals(sort_column)) {
	keys = keys.sort() {-total_counts[it]}
} else {
	keys = keys.sort { it.join('_') }
}

// Per-corpus columns are only shown when there is more than one corpus;
// the header and the lines use the same condition so the columns stay aligned.
boolean showParts = selection.size() > 1

def header = properties.join(", ")+"\t"+"F"
if (showParts) header += "\t"+selection.join("\t")
println header

int nline = 0
for (def k : keys) {
	// Stop once max_lines lines are printed and report how many lines are left out.
	if (max_lines > 0 && nline >= max_lines) {
		println "... (${keys.size() - nline})"
		break
	}

	def cells = [k.join('_'), total_counts[k]]
	if (showParts) {
		for (def corpus : selection) {
			// A corpus may have no entry in results (e.g. when it was skipped): count it as 0.
			def v = results[corpus]?.get(k)
			cells << (v == null ? 0 : v)
		}
	}
	println cells.join("\t")
	nline++
}