Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / commands / NIndexMacro.groovy @ 2051

History | View | Annotate | Download (8.6 kB)

1
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author sheiden
4

    
5
package org.txm.macro.commands
6

    
7
import org.kohsuke.args4j.*
8
import groovy.transform.Field
9
import org.txm.rcpapplication.swt.widget.parameters.*
10
import org.txm.searchengine.cqp.CQPSearchEngine
11
import org.txm.Toolbox
12
import org.txm.macro.cqp.CQPUtils
13
import org.txm.searchengine.cqp.ICqiClient
14
import org.odftoolkit.simple.SpreadsheetDocument
15
import org.odftoolkit.odfdom.doc.table.OdfTable
16
import org.odftoolkit.odfdom.doc.table.OdfTableRow
17
import org.odftoolkit.odfdom.doc.table.OdfTableCell
18
import org.odftoolkit.simple.table.Table
19
import org.odftoolkit.simple.Document
20
import org.odftoolkit.simple.style.Border
21
import org.odftoolkit.simple.style.Font
22
import org.odftoolkit.simple.style.StyleTypeDefinitions
23
import org.odftoolkit.simple.style.StyleTypeDefinitions.FontStyle
24
import org.odftoolkit.simple.table.Row
25
import org.odftoolkit.simple.table.Cell
26
import org.odftoolkit.simple.style.StyleTypeDefinitions.CellBordersType
27
import org.odftoolkit.simple.style.StyleTypeDefinitions.HorizontalAlignmentType
28

    
29
// BEGINNING OF PARAMETERS

// File listing the sequences to look for, one sequence per line.
@Field
@Option(name="inputFile", usage="input sequence list file (one sequence per line)", widget="File", required=true, def="")
def inputFile

// When true, each input line is used as a CQL query as is; otherwise each
// blank-separated word of the line is turned into a CQL token test.
@Field
@Option(name="inputIsCQL", usage="input sequences are CQL queries", widget="Boolean", required=true, def="false")
def inputIsCQL

// Word property tested by the generated CQL tokens when the input is not CQL.
@Field
@Option(name="inputWordProperty", usage="implicit word property to use", widget="String", required=true, def="word")
String inputWordProperty

// Requested result file; the macro replaces its extension by ".tsv" or ".ods".
@Field
@Option(name="outputFile", usage="results output File (TSV format)", widget="File", required=true, def="")
def outputFile

// When true, results are written to an ODS spreadsheet instead of a TSV file.
@Field
@Option(name="odsOutputFormat", usage="use ODS as output format", widget="Boolean", required=true, def="false")
def odsOutputFormat

// Word property used to display the matched sequences.
@Field
@Option(name="outputWordProperty", usage="output word property to use", widget="String", required=true, def="word")
String outputWordProperty

// When true, one line per query with its total frequency is produced; when
// false, one line per distinct matched value with its frequency is produced.
@Field
@Option(name="groupByQuery", usage="provide frequency of each CQL match values or just the total frequency of each CQL match", widget="Boolean", required=true, def="false")
def groupByQuery

// Show the parameters input dialog box; stop the macro when it reports failure
def parametersAccepted = ParametersDialog.open(this)
if (!parametersAccepted) {
	return
}

// END OF PARAMETERS
56

    
57
// Name of this macro, used as a prefix in every message printed below.
def scriptName = this.class.getSimpleName()

// The sequence list must be an existing, readable regular file.
if (!inputFile || !inputFile.isFile() || !inputFile.canRead()) {
	println "** $scriptName: impossible to read input file. Aborting."
	return false
}

// CQP client used to run the queries, and the corpora to run them on.
corpusEngine = CQPSearchEngine.getCqiClient()
utils = new CQPUtils()
corpora = utils.getCorpora(this)

// Nothing to do without at least one corpus.
if (corpora == null || corpora.size() == 0) {
	println "** $scriptName: please select a corpus in the Corpus view or provide a corpus name. Aborting."
	return false
}
75

    
76
/**
 * Returns a name identifying the machine running the macro.
 *
 * The COMPUTERNAME environment variable is tried first, then HOSTNAME, and
 * finally the host name reported by InetAddress.getLocalHost().
 * Empty environment values are skipped.
 *
 * @return the first non-empty name found, or "unknown" when the local host
 *         name cannot be resolved
 */
def getComputerName()
{
	// 'def' keeps the map local instead of creating an 'env' variable in the script binding
	def env = System.getenv()
	def name = env.get("COMPUTERNAME") ?: env.get("HOSTNAME")
	if (name) {
		return name
	}
	try {
		return InetAddress.getLocalHost().getHostName()
	} catch (UnknownHostException e) {
		// the machine name is only informative: do not abort the macro for it
		return "unknown"
	}
}
86

    
87
// Base name of the requested output file, without its last extension.
def fullName = outputFile.getName()
def i = fullName.lastIndexOf(".")
def name = (i == -1) ? fullName : fullName.substring(0, i)

// Resolve the directory from the absolute path: getParentFile() is null for a
// bare relative file name, and a null first argument makes the File
// constructor call ambiguous in Groovy.
def outputDirectory = outputFile.getAbsoluteFile().getParentFile()

if (odsOutputFormat) {

	// ODS output needs the Simple ODF library at run time
	try {
		Class.forName("org.odftoolkit.simple.SpreadsheetDocument")
	} catch (e) {
		println "** $scriptName: the simple-odf-0.8.1-incubating.jar file is needed in the \$HOME/TXM/scripts/lib directory to write in ODS format. Please uncheck the odsOutputFormat option. Aborting."
		return false
	}

	// ods, table and font stay script-binding variables: they are read by the
	// helper methods and by the rest of the script
	outputFile = new File(outputDirectory, name+".ods")
	ods = SpreadsheetDocument.newSpreadsheetDocument()
	table = ods.getTableByName("Sheet1")
	font = new Font("Arial", FontStyle.BOLD, 11)

	// Second sheet recording the context and the parameters of this run
	def propertiesSheet = Table.newTable(ods)
	propertiesSheet.setTableName("Properties")
	setStringCellBold(propertiesSheet, 0, 0, "Name")
	setStringCellBold(propertiesSheet, 0, 1, "Value")

	// a single timestamp, so that the date and time rows are consistent
	def now = new Date()
	def propertyRow = 1
	[
		["date", now.format("dd/MM/yyyy")],
		["time", now.format("HH'h'mm")],
		["machine", getComputerName()],
		["user", System.getProperty("user.name")],
		["macro", scriptName],
		["corpora", corpora.collect { it.getName() }.join(",")],
		["inputFile", inputFile.getAbsolutePath()],
		["inputIsCQL", inputIsCQL.toString()],
		["inputWordProperty", inputWordProperty],
		["outputFile", outputFile.getAbsolutePath()],
		["odsOutputFormat", odsOutputFormat.toString()],
		["outputWordProperty", outputWordProperty],
		["groupByQuery", groupByQuery.toString()]
	].each { key, value ->
		setStringCell2(propertiesSheet, propertyRow++, 0, key, value)
	}

} else {
	outputFile = new File(outputDirectory, name+".tsv")
	// explicit UTF-8 instead of the platform default charset used by FileWriter,
	// so that the result file does not depend on the machine running the macro
	output = outputFile.newWriter("UTF-8")
}
133

    
134
/**
 * Fills a three-column result row: corpus name, query, frequency.
 *
 * Cells are addressed directly, without the intermediate 'cell' variable
 * that used to be created in the script binding.
 *
 * @param table     sheet to write to (must provide getCellByPosition(col, row))
 * @param row       index of the row to fill
 * @param corpus    text written in column 0
 * @param string    text written in column 1
 * @param frequency number written in column 2
 */
def setRow(table, row, corpus, string, frequency) {
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string)
	table.getCellByPosition(2, row).setDoubleValue(frequency)
}
142

    
143
/**
 * Fills a four-column result row: corpus name, query, matched value, frequency.
 *
 * The matched value cell is left empty when the frequency is 0.
 * Cells are addressed directly, without the intermediate 'cell' variable
 * that used to be created in the script binding.
 *
 * @param table     sheet to write to (must provide getCellByPosition(col, row))
 * @param row       index of the row to fill
 * @param corpus    text written in column 0
 * @param string1   text written in column 1
 * @param string2   text written in column 2 unless frequency is 0
 * @param frequency number written in column 3
 */
def setRow2(table, row, corpus, string1, string2, frequency) {
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string1)
	table.getCellByPosition(2, row).setStringValue((frequency == 0) ? '' : string2)
	table.getCellByPosition(3, row).setDoubleValue(frequency)
}
153

    
154
/**
 * Writes a text in one cell of a sheet.
 *
 * @param table  sheet to write to (must provide getCellByPosition(col, row))
 * @param row    row index of the cell
 * @param col    column index of the cell
 * @param string text to write
 * @return the cell that was written
 */
def setStringCell(table, row, col, string) {
	// local variable: nothing is left behind in the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	return target
}
159

    
160
/**
 * Writes a text in one cell of a sheet and applies the script-level 'font' to it.
 *
 * 'font' is read from the script binding; it is set by the ODS initialisation
 * code before this method is called.
 *
 * @param table  sheet to write to (must provide getCellByPosition(col, row))
 * @param row    row index of the cell
 * @param col    column index of the cell
 * @param string text to write
 * @return the cell that was written
 */
def setStringCellBold(table, row, col, string) {
	// local variable: nothing is left behind in the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	target.setFont(font)
	return target
}
166

    
167
/**
 * Writes two texts in two adjacent cells of the same row.
 *
 * @param table   sheet to write to (must provide getCellByPosition(col, row))
 * @param row     row index of both cells
 * @param col     column index of the first cell; the second one is at col+1
 * @param string1 text written in the first cell
 * @param string2 text written in the second cell
 * @return the second cell (the one at col+1)
 */
def setStringCell2(table, row, col, string1, string2) {
	table.getCellByPosition(col, row).setStringValue(string1)
	// local variable: nothing is left behind in the script binding
	def second = table.getCellByPosition(col+1, row)
	second.setStringValue(string2)
	return second
}
174

    
175
// Column headers of the result table.
// The second column holds the input sequences: these are matched against
// inputWordProperty (not outputWordProperty), so that is the property named
// in its header when the input is not CQL.
def queryHeader = inputIsCQL ? "CQL" : inputWordProperty+" query"
// Column of matched values, only present when results are not grouped by query.
def valuesHeader = outputWordProperty+" values"

if (odsOutputFormat) {
	setStringCellBold(table, 0, 0, "corpus")
	setStringCellBold(table, 0, 1, queryHeader)
	// the frequency column comes last: third when grouping by query, fourth otherwise
	def frequencyColumn = 2
	if (!groupByQuery) {
		setStringCellBold(table, 0, 2, valuesHeader)
		frequencyColumn = 3
	}
	def frequencyHeaderCell = setStringCellBold(table, 0, frequencyColumn, "f")
	frequencyHeaderCell.setHorizontalAlignment(HorizontalAlignmentType.RIGHT)
} else {
	if (groupByQuery) {
		output.println "corpus\t"+queryHeader+"\tf"
	} else {
		output.println "corpus\t"+queryHeader+"\t"+valuesHeader+"\tf"
	}
}
195

    
196
// Next free row of the ODS result sheet (row 0 holds the column headers).
def row = 1

try {
	corpora.each { corpus ->

		def corpusName = corpus.getName()
		// position of the current line in the input file, used to name its CQP subcorpus
		def seqN = 0

		inputFile.eachLine { rawLine ->

			String subCorpus = "NIndex"+seqN++

			// Surrounding blanks would otherwise produce empty [prop=""] tokens,
			// and blank-only lines are skipped like empty ones.
			def line = rawLine.trim()

			if (line.length() > 0) {

				// Either the line is already a CQL query, or each blank-separated
				// word becomes one token test on inputWordProperty.
				String cql
				if (inputIsCQL) {
					cql = line
				} else {
					cql = line.split("\\p{Blank}+").collect { "[$inputWordProperty=\"$it\"]" }.join(" ")
				}

				String subCorpusId = "$corpusName:$subCorpus"
				corpusEngine.cqpQuery(corpusName, subCorpus, cql)
				def nmatches = corpusEngine.subCorpusSize(subCorpusId)

				if (groupByQuery) {
					// one line per query with its total number of matches (possibly 0)
					if (odsOutputFormat) {
						setRow(table, row++, corpusName, line, nmatches)
					} else {
						output.println corpusName+"\t"+line+"\t"+nmatches
					}
				} else if (nmatches == 0) {
					// no match: one line with an empty value and a frequency of 0
					if (odsOutputFormat) {
						setRow2(table, row++, corpusName, line, line, 0)
					} else {
						output.println corpusName+"\t"+line+"\t\t0"
					}
				} else {
					// one line per distinct matched value, by decreasing frequency
					def starts = corpusEngine.dumpSubCorpus(subCorpusId, ICqiClient.CQI_CONST_FIELD_MATCH, 0, nmatches-1)
					def ends = corpusEngine.dumpSubCorpus(subCorpusId, ICqiClient.CQI_CONST_FIELD_MATCHEND, 0, nmatches-1)
					[starts, ends].transpose().collect {
						// text of the match: outputWordProperty values of its positions, joined by spaces
						corpusEngine.cpos2Str("$corpusName.$outputWordProperty", (it[0]..it[1]) as int[]).join(' ')
					}.countBy { it }.sort { -it.value }.each {
						if (odsOutputFormat) {
							setRow2(table, row++, corpusName, line, it.key, it.value)
						} else {
							output.println corpusName+"\t"+line+"\t"+it.key+"\t"+it.value
						}
					}
				}
			}
		}
	}

	if (odsOutputFormat) {
		ods.save(outputFile)
	}
} finally {
	// always release the TSV writer, even when a query fails
	if (!odsOutputFormat) {
		output.close()
	}
}

return true