Révision 2051
tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/CQPSearchEngine.java (revision 2051) | ||
---|---|---|
10 | 10 |
import org.txm.core.engines.EngineType; |
11 | 11 |
import org.txm.core.messages.TXMCoreMessages; |
12 | 12 |
import org.txm.core.results.TXMResult; |
13 |
import org.txm.importer.cwb.PatchCwbRegistry; |
|
14 | 13 |
import org.txm.libs.cqp.CQPLibPreferences; |
15 | 14 |
import org.txm.objects.CorpusBuild; |
16 | 15 |
import org.txm.objects.Match; |
17 |
import org.txm.objects.Project; |
|
18 | 16 |
import org.txm.searchengine.core.EmptySelection; |
19 | 17 |
import org.txm.searchengine.core.Query; |
20 | 18 |
import org.txm.searchengine.core.SearchEngine; |
... | ... | |
29 | 27 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery; |
30 | 28 |
import org.txm.utils.OSDetector; |
31 | 29 |
import org.txm.utils.Sh; |
32 |
import org.txm.utils.io.IOUtils; |
|
33 | 30 |
import org.txm.utils.logger.Log; |
34 | 31 |
|
35 | 32 |
public class CQPSearchEngine extends SearchEngine { |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/commands/NIndexMacro.groovy (revision 2051) | ||
---|---|---|
1 |
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté |
|
2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses) |
|
3 |
// @author sheiden |
|
4 |
|
|
5 |
package org.txm.macro.commands |
|
6 |
|
|
7 |
import org.kohsuke.args4j.* |
|
8 |
import groovy.transform.Field |
|
9 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
10 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
11 |
import org.txm.Toolbox |
|
12 |
import org.txm.macro.cqp.CQPUtils |
|
13 |
import org.txm.searchengine.cqp.ICqiClient |
|
14 |
import org.odftoolkit.simple.SpreadsheetDocument |
|
15 |
import org.odftoolkit.odfdom.doc.table.OdfTable |
|
16 |
import org.odftoolkit.odfdom.doc.table.OdfTableRow |
|
17 |
import org.odftoolkit.odfdom.doc.table.OdfTableCell |
|
18 |
import org.odftoolkit.simple.table.Table |
|
19 |
import org.odftoolkit.simple.Document |
|
20 |
import org.odftoolkit.simple.style.Border |
|
21 |
import org.odftoolkit.simple.style.Font |
|
22 |
import org.odftoolkit.simple.style.StyleTypeDefinitions |
|
23 |
import org.odftoolkit.simple.style.StyleTypeDefinitions.FontStyle |
|
24 |
import org.odftoolkit.simple.table.Row |
|
25 |
import org.odftoolkit.simple.table.Cell |
|
26 |
import org.odftoolkit.simple.style.StyleTypeDefinitions.CellBordersType |
|
27 |
import org.odftoolkit.simple.style.StyleTypeDefinitions.HorizontalAlignmentType |
|
28 |
|
|
29 |
// BEGINNING OF PARAMETERS |
|
30 |
|
|
31 |
@Field @Option(name="inputFile", usage="input sequence list file (one sequence per line)", widget="File", required=true, def="") |
|
32 |
def inputFile |
|
33 |
|
|
34 |
@Field @Option(name="inputIsCQL", usage="input sequences are CQL queries", widget="Boolean", required=true, def="false") |
|
35 |
def inputIsCQL |
|
36 |
|
|
37 |
@Field @Option(name="inputWordProperty", usage="implicit word property to use", widget="String", required=true, def="word") |
|
38 |
String inputWordProperty |
|
39 |
|
|
40 |
@Field @Option(name="outputFile", usage="results output File (TSV format)", widget="File", required=true, def="") |
|
41 |
def outputFile |
|
42 |
|
|
43 |
@Field @Option(name="odsOutputFormat", usage="use ODS as output format", widget="Boolean", required=true, def="false") |
|
44 |
def odsOutputFormat |
|
45 |
|
|
46 |
@Field @Option(name="outputWordProperty", usage="output word property to use", widget="String", required=true, def="word") |
|
47 |
String outputWordProperty |
|
48 |
|
|
49 |
@Field @Option(name="groupByQuery", usage="provide frequency of each CQL match values or just the total frequency of each CQL match", widget="Boolean", required=true, def="false") |
|
50 |
def groupByQuery |
|
51 |
|
|
52 |
// Open the parameters input dialog box |
|
53 |
if (!ParametersDialog.open(this)) return |
|
54 |
|
|
55 |
// END OF PARAMETERS |
|
56 |
|
|
57 |
// Name of this macro, used as a prefix in every message
def scriptName = this.class.getSimpleName()

// The sequence list must be an existing, readable regular file
def inputReadable = inputFile && inputFile.isFile() && inputFile.canRead()
if (!inputReadable) {
	println "** $scriptName: impossible to read input file. Aborting."
	return false
}

// CQP client used to run the queries, and helper used to resolve the target corpora
corpusEngine = CQPSearchEngine.getCqiClient()
utils = new CQPUtils()
corpora = utils.getCorpora(this)

// Nothing to do without at least one corpus
def noCorpus = (corpora == null) || corpora.size() == 0
if (noCorpus) {
	println "** $scriptName: please select a corpus in the Corpus view or provide a corpus name. Aborting."
	return false
}
|
75 |
|
|
76 |
/**
 * Returns a name for the machine the macro is running on.
 *
 * The COMPUTERNAME environment variable is used when it is set, then HOSTNAME,
 * and finally the local host name reported by java.net.InetAddress.
 *
 * @return the machine name, or "unknown" when neither variable is set and the
 *         local host name cannot be resolved
 */
def getComputerName() {
	// declared locally so that it is not stored in the script binding
	def env = System.getenv()

	if (env.containsKey("COMPUTERNAME")) {
		return env.get("COMPUTERNAME")
	}
	if (env.containsKey("HOSTNAME")) {
		return env.get("HOSTNAME")
	}

	try {
		return InetAddress.getLocalHost().getHostName()
	} catch (UnknownHostException e) {
		// the name is informational only (it is written to the ODS "Properties" sheet):
		// a failed lookup must not abort the whole export
		return "unknown"
	}
}
|
86 |
|
|
87 |
// Base name of the requested output file (everything before the last '.'),
// the extension is replaced according to the chosen output format
def requestedName = outputFile.getName()
def dot = requestedName.lastIndexOf(".")
def name = (dot == -1) ? requestedName : requestedName.substring(0, dot)

if (!odsOutputFormat) {

	// TSV output: plain text writer
	outputFile = new File(outputFile.getParentFile(), name+".tsv")
	output = new FileWriter(outputFile)

} else {

	// ODS output needs the Simple ODF library on the classpath
	try {
		Class.forName( "org.odftoolkit.simple.SpreadsheetDocument")
	} catch(e) {
		println "** $scriptName: the simple-odf-0.8.1-incubating.jar file is needed in the \$HOME/TXM/scripts/lib directory to write in ODS format. Please uncheck the odsOutputFormat option. Aborting."
		return false
	}

	outputFile = new File(outputFile.getParentFile(), name+".ods")
	ods = SpreadsheetDocument.newSpreadsheetDocument()
	table = ods.getTableByName("Sheet1")
	font = new Font("Arial", FontStyle.BOLD, 11)

	// Second sheet: records the execution context and the parameter values
	properties = Table.newTable(ods);
	properties.setTableName("Properties")
	setStringCellBold(properties, 0, 0, "Name")
	setStringCellBold(properties, 0, 1, "Value")

	def entries = [
		["date", new Date().format("dd/MM/yyyy")],
		["time", new Date().format("HH'h'mm")],
		["machine", getComputerName()],
		["user", System.getProperty("user.name")],
		["macro", scriptName],
		["corpora", corpora.collect { it.getName() }.join(",")],
		["inputFile", inputFile.getAbsolutePath()],
		["inputIsCQL", inputIsCQL.toString()],
		["inputWordProperty", inputWordProperty],
		["outputFile", outputFile.getAbsolutePath()],
		["odsOutputFormat", odsOutputFormat.toString()],
		["outputWordProperty", outputWordProperty],
		["groupByQuery", groupByQuery.toString()]
	]

	// one "name / value" row per entry, starting below the title row
	line = 1
	entries.each { entry ->
		setStringCell2(properties, line++, 0, entry[0], entry[1])
	}
}
|
133 |
|
|
134 |
/**
 * Writes a three-column result row: corpus name, query string and frequency.
 *
 * @param table     sheet to write to (accessed through getCellByPosition(column, row))
 * @param row       index of the row to fill
 * @param corpus    value of column 0 (string cell)
 * @param string    value of column 1 (string cell)
 * @param frequency value of column 2 (numeric cell)
 */
def setRow(table, row, corpus, string, frequency) {
	// cells are addressed directly instead of going through an undeclared 'cell'
	// variable, which Groovy would store in the script binding
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string)
	table.getCellByPosition(2, row).setDoubleValue(frequency)
}
|
142 |
|
|
143 |
/**
 * Writes a four-column result row: corpus name, query string, matched value
 * and frequency.
 *
 * @param table     sheet to write to (accessed through getCellByPosition(column, row))
 * @param row       index of the row to fill
 * @param corpus    value of column 0 (string cell)
 * @param string1   value of column 1 (string cell)
 * @param string2   value of column 2 (string cell); replaced by an empty string
 *                  when frequency is 0
 * @param frequency value of column 3 (numeric cell)
 */
def setRow2(table, row, corpus, string1, string2, frequency) {
	// no undeclared 'cell' variable: it would end up in the script binding
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string1)
	// a query without any match has no matched value to show
	table.getCellByPosition(2, row).setStringValue((frequency == 0) ? '' : string2)
	table.getCellByPosition(3, row).setDoubleValue(frequency)
}
|
153 |
|
|
154 |
/**
 * Sets the string value of one cell.
 *
 * @param table  sheet to write to (accessed through getCellByPosition(column, row))
 * @param row    row index of the cell
 * @param col    column index of the cell
 * @param string value to store
 * @return the cell that was written
 */
def setStringCell(table, row, col, string) {
	// local variable: an undeclared 'cell' would be stored in the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	return target
}
|
159 |
|
|
160 |
/**
 * Sets the string value of one cell and applies the script-level 'font'
 * (created by the script before the ODS sheets are filled) to it.
 *
 * @param table  sheet to write to (accessed through getCellByPosition(column, row))
 * @param row    row index of the cell
 * @param col    column index of the cell
 * @param string value to store
 * @return the cell that was written
 */
def setStringCellBold(table, row, col, string) {
	// local variable: an undeclared 'cell' would be stored in the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	target.setFont(font)
	return target
}
|
166 |
|
|
167 |
/**
 * Writes two string values in two adjacent cells of the same row.
 *
 * @param table   sheet to write to (accessed through getCellByPosition(column, row))
 * @param row     row index of both cells
 * @param col     column index of the first cell; the second one is at col+1
 * @param string1 value of the first cell
 * @param string2 value of the second cell
 * @return the second cell (the one at col+1)
 */
def setStringCell2(table, row, col, string1, string2) {
	table.getCellByPosition(col, row).setStringValue(string1)

	// local variable: an undeclared 'cell' would be stored in the script binding
	def second = table.getCellByPosition(col+1, row)
	second.setStringValue(string2)
	return second
}
|
174 |
|
|
175 |
// Title of the query column: raw CQL, or the word property the sequences are
// matched on. The non-CQL queries are built on inputWordProperty, so that is
// the property named here (outputWordProperty only applies to the values column).
def queryHeader = inputIsCQL ? "CQL" : inputWordProperty+" query"

// Next row to write in the ODS sheet (row 0 holds the column titles)
row = 1

try {

	// ---- column titles
	if (odsOutputFormat) {
		setStringCellBold(table, 0, 0, "corpus")
		setStringCellBold(table, 0, 1, queryHeader)
		def frequencyColumn = 2
		if (!groupByQuery) {
			// one more column for the matched values
			setStringCellBold(table, 0, 2, outputWordProperty+" values")
			frequencyColumn = 3
		}
		def frequencyTitle = setStringCellBold(table, 0, frequencyColumn, "f")
		frequencyTitle.setHorizontalAlignment(HorizontalAlignmentType.RIGHT)
	} else if (groupByQuery) {
		output.println "corpus\t"+queryHeader+"\tf"
	} else {
		output.println "corpus\t"+queryHeader+"\t"+(outputWordProperty+" values")+"\tf"
	}

	// ---- one query per non-empty input line, for each corpus
	corpora.each { corpus ->

		def corpusName = corpus.getName()
		def seqN = 0

		inputFile.eachLine() { line ->

			// every input line consumes a sub-corpus number, even when it is skipped
			def subCorpus = "NIndex"+seqN++

			if (line.length() == 0) return

			def cql
			if (inputIsCQL) {
				cql = line
			} else {
				// one token constraint per blank-separated word; empty pieces produced by
				// leading or trailing blanks are dropped instead of becoming [prop=""]
				def words = line.split("\\p{Blank}+").findAll { it.length() > 0 }
				if (words.isEmpty()) return // blank-only line: handled like an empty line
				cql = words.collect { "[" + inputWordProperty + "=\"" + it + "\"]" }.join(" ")
			}

			corpusEngine.cqpQuery(corpusName, "$subCorpus", cql)
			def nmatches = corpusEngine.subCorpusSize("$corpusName:$subCorpus")

			if (groupByQuery) {
				// a single line per query: total number of matches (0 included)
				if (odsOutputFormat) {
					setRow(table, row++, corpusName, line, nmatches)
				} else {
					output.println corpusName+"\t"+line+"\t"+nmatches
				}
			} else if (nmatches == 0) {
				// no match: a single line with an empty value and a frequency of 0
				if (odsOutputFormat) {
					setRow2(table, row++, corpusName, line, line, 0)
				} else {
					output.println corpusName+"\t"+line+"\t\t0"
				}
			} else {
				// one line per distinct matched value, most frequent first
				def starts = corpusEngine.dumpSubCorpus("$corpusName:$subCorpus", ICqiClient.CQI_CONST_FIELD_MATCH, 0, nmatches-1)
				def ends = corpusEngine.dumpSubCorpus("$corpusName:$subCorpus", ICqiClient.CQI_CONST_FIELD_MATCHEND, 0, nmatches-1)
				[starts, ends].transpose().collect {
					// text of the match: values of outputWordProperty from start to end position
					corpusEngine.cpos2Str("$corpusName.$outputWordProperty", (it[0]..it[1]) as int[]).join(' ')
				}.countBy { it }.sort { -it.value }.each {
					if (odsOutputFormat) {
						setRow2(table, row++, corpusName, line, it.key, it.value)
					} else {
						output.println corpusName+"\t"+line+"\t"+it.key+"\t"+it.value
					}
				}
			}
		}
	}

	if (odsOutputFormat) {
		ods.save(outputFile)
	}

} finally {
	// release the TSV writer even when a query fails
	if (!odsOutputFormat) {
		output.close()
	}
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/cqp/CQPUtils.groovy (revision 2051) | ||
---|---|---|
1 |
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté |
|
2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses) |
|
3 |
// @author sheiden |
|
4 |
|
|
5 |
package org.txm.macro.cqp |
|
6 |
|
|
7 |
import org.txm.searchengine.cqp.corpus.* |
|
8 |
|
|
9 |
/**
 * Finds a corpus by name.
 *
 * The corpora returned by CorpusManager.getCorpusManager().getCorpora() are
 * searched first; when none of them has the given name, the sub-corpora
 * returned by getSubcorpora() of each of them are searched in turn.
 *
 * @param name name to look for (compared with getName())
 * @return the first corpus or sub-corpus found with that name, null otherwise
 */
def getCorpusByName(name) {
	def corpora = CorpusManager.getCorpusManager().getCorpora()

	def mainCorpus = corpora.find { it.getName() == name }
	if (mainCorpus) return mainCorpus

	// not a corpus name: stop at the first corpus owning a sub-corpus of that name
	def subCorpus = null
	for (candidate in corpora) {
		subCorpus = candidate.getSubcorpora().find { it.getName() == name }
		if (subCorpus) break
	}

	return subCorpus
}
|
28 |
|
|
29 |
/**
 * Determines the corpora a macro must work on.
 *
 * When the script binding holds an "args" map, its "corpus" entry (a single
 * name) or else its "corpora" entry (comma-separated names) is resolved with
 * getCorpusByName(). Names that match no corpus are reported and left out, so
 * the returned list never contains null. Without such an entry, the corpora
 * selected in the calling script (corpusViewSelections) are used.
 *
 * @param script the calling script; its binding and its corpusViewSelection /
 *               corpusViewSelections properties are read
 * @return the list of corpora (possibly empty when the given names are unknown),
 *         or null when no name is given and corpusViewSelection is not a
 *         Subcorpus or MainCorpus
 */
def getCorpora(def script) {

	// resolves one name, reporting the names that match nothing
	def resolve = { corpusName ->
		def found = getCorpusByName(corpusName)
		if (found == null) {
			println "** corpus '"+corpusName+"' not found."
		}
		return found
	}

	def args = script.binding.variables["args"]
	if (args) {
		if (args["corpus"]) {
			return [resolve(args["corpus"])].findAll { it != null }
		}
		if (args["corpora"]) {
			// blanks around the names are ignored ("A, B" is read like "A,B")
			return args["corpora"].tokenize(',').collect { resolve(it.trim()) }.findAll { it != null }
		}
	}

	// no corpus name given: fall back to the selection of the calling script
	def selection = script.corpusViewSelection
	if (selection == null || !(selection instanceof Subcorpus || selection instanceof MainCorpus)) {
		return null
	}
	return script.corpusViewSelections.findAll { it instanceof Subcorpus || it instanceof MainCorpus }
}
|
55 |
|
|
56 |
/**
 * Builds a sub-corpus name of the form "CORPUS" + n that is not used yet by
 * the sub-corpora of the given corpus.
 *
 * @param name name of a corpus known to CorpusManager.getCorpusManager()
 * @return "CORPUS" followed by the smallest n >= 1 for which no sub-corpus
 *         of that name exists, or null (after printing a message) when the
 *         corpus is not found
 */
def newTestSubCorpusName(name) {
	def parent = CorpusManager.getCorpusManager().getCorpora().find { it.getName() == name }

	if (!parent) {
		println "** corpus '"+name+"' not found."
		return null
	}

	def existing = parent.getSubcorpora()

	// increase n as long as a sub-corpus already carries the candidate name
	def n = 1
	while (existing.find { it.getName() == "CORPUS"+n }) {
		n++
	}

	return "CORPUS"+n
}
|
79 |
|
|
80 |
/**
 * Converts a list of corpus positions into a CQL query selecting them.
 *
 * The input is a comma-separated list whose items are either a single
 * position ("6") or a "start-end" interval ("2-3"); blanks around the items
 * and the bounds are ignored, so the output of corpus2positions(), pretty or
 * not, can be read back. The result has the form
 *
 *   <patterns>::<constraints>
 *
 * where the patterns are "n1:[]" for single positions and
 * "n<k+1>:[][]{k}" for intervals with end-start == k, and the constraints
 * are "n1=<position>" / "n<k+1>=<start>" terms; both parts are joined
 * with "|". Example: "0,1,2-3,4-5,6" gives
 * "n1:[]|n2:[][]{1}::n1=0|n1=1|n1=6|n2=2|n2=4".
 *
 * @param matches comma-separated positions and intervals
 * @return the CQL query; "::" when matches contains no item
 */
def positions2cql(matches) {

	def singletons = []
	def intervals = [:] // end-start -> start positions, in order of first appearance

	matches.tokenize(',').each { item ->
		def interval = item.trim()
		if (interval.length() == 0) return // nothing between two commas

		if (interval.indexOf('-') != -1) { // start-end
			// bounds are parsed explicitly, like the single positions below, and kept
			// in local variables instead of undeclared (binding) ones
			def (start, end) = interval.tokenize('-').collect { it.trim().toInteger() }
			def length = end - start
			if (intervals[length]) {
				intervals[length] << start
			} else {
				intervals[length] = [start]
			}
		} else { // singleton
			singletons << interval.toInteger()
		}
	}

	def patterns = []
	def constraints = []

	if (singletons.size() > 0) {
		patterns << "n1:[]"
		constraints << singletons.collect { "n1=" + it }.join("|")
	}

	intervals.each { length, positions ->
		def label = "n" + (length + 1)
		patterns << label + ":[][]{" + length + "}"
		constraints << positions.collect { label + "=" + it }.join("|")
	}

	return patterns.join("|") + "::" + constraints.join("|")
}
|
152 |
|
|
153 |
/**
 * Lists the matches of a corpus as text.
 *
 * Each match is written "start-end" when its start is lower than its end and
 * "start" otherwise; "..." is appended when the corpus has more matches than
 * maxMatches.
 *
 * @param corpus     object whose getMatches() gives the matches to list
 * @param maxMatches maximum number of matches to write
 * @param pretty     when true the items are separated by ", " instead of ","
 * @return the list of positions, or "<empty>" when the corpus has no match
 */
def corpus2positions(def corpus, def maxMatches, def pretty = false) {

	def matches = corpus.getMatches()

	if (matches.size() == 0) {
		return "<empty>"
	}

	def separator = pretty ? ", " : ","

	def items = matches.take(maxMatches).collect { match ->
		(match.getStart() < match.getEnd()) ? match.getStart()+"-"+match.getEnd() : ""+match.getStart()
	}

	def res = items.join(separator)

	// tell the reader that the list was cut
	if (matches.size() > maxMatches) {
		res += "..."
	}

	return res
}
|
186 |
|
|
187 |
// Script body: only executed when this file is run as a script, not when its
// methods are called on an instance. It lists the utility methods declared
// above and tells how to make them available.
methods = this.getClass().declaredMethods.findAll { !it.synthetic }.name.sort().unique()
methods.removeAll(['main', 'run'])
// the class to import is org.txm.macro.cqp.CQPUtils (package and file name of this script)
println "** CQPUtils is not meant to be called directly.\nPlease import its methods ${methods}\nwith the following declaration:\nimport org.txm.macro.cqp.CQPUtils"
|
190 |
|
|
191 |
/* |
|
192 |
___ ___ ___ ___ ___ ___ |
|
193 |
/\ \ /\__\ /\__\ /\ \ |\__\ /\ \ |
|
194 |
/::\ \ /::| | /::| | /::\ \ |:| | /::\ \ |
|
195 |
/:/\:\ \ /:|:| | /:|:| | /:/\:\ \ |:| | /:/\:\ \ |
|
196 |
/::\~\:\ \ /:/|:| |__ /:/|:| |__ /::\~\:\ \ |:|__|__ /::\~\:\ \ |
|
197 |
/:/\:\ \:\__\ /:/ |:| /\__\ /:/ |:| /\__\ /:/\:\ \:\__\ ____/::::\__\ /:/\:\ \:\__\ |
|
198 |
\/__\:\/:/ / \/__|:|/:/ / \/__|:|/:/ / \:\~\:\ \/__/ \::::/~~/~ \:\~\:\ \/__/ |
|
199 |
\::/ / |:/:/ / |:/:/ / \:\ \:\__\ ~~|:|~~| \:\ \:\__\ |
|
200 |
/:/ / |::/ / |::/ / \:\ \/__/ |:| | \:\ \/__/ |
|
201 |
/:/ / /:/ / /:/ / \:\__\ |:| | \:\__\ |
|
202 |
\/__/ \/__/ \/__/ \/__/ \|__| \/__/ |
|
203 |
|
|
204 |
|
|
205 |
|
|
206 |
0 1 2 3 4 5 6 7 8 9 |
|
207 |
|
|
208 |
+ + + + |
|
209 |
0 1 2 3 4 5 6 7 8 9 |
|
210 |
+0,+1,+2,+3 |
|
211 |
n:[] :: n = 0|n = 1|n = 2|n = 3 |
|
212 |
|
|
213 |
[ ] [ ] |
|
214 |
0 1 2 3 4 5 6 7 8 9 |
|
215 |
0-1,2-3 |
|
216 |
n:[] []{1} :: n = 0 |
|
217 |
n:[] []{1} :: n = 2 |
|
218 |
|
|
219 |
[ ] + |
|
220 |
0 1 2 3 4 5 6 7 8 9 |
|
221 |
0-2,+3 |
|
222 |
n:[] []{2} :: n = 0 |
|
223 |
n:[] :: n = 3 |
|
224 |
|
|
225 |
|
|
226 |
*/ |
|
227 |
|
|
228 |
/* |
|
229 |
|
|
230 |
Touching intervals mergers |
|
231 |
========================== |
|
232 |
|
|
233 |
A) from http://stackoverflow.com/questions/31670849/merge-overlapping-intervals |
|
234 |
|
|
235 |
import java.util.*; |
|
236 |
|
|
237 |
public class Ideone |
|
238 |
{ |
|
239 |
public static void main (String[] args) throws java.lang.Exception |
|
240 |
{ |
|
241 |
ArrayList<Interval> x = new ArrayList<>(); |
|
242 |
|
|
243 |
x.add(new Interval(1, 3)); |
|
244 |
x.add(new Interval(2, 6)); |
|
245 |
x.add(new Interval(8, 10)); |
|
246 |
x.add(new Interval(15, 18)); |
|
247 |
x.add(new Interval(17, 20)); |
|
248 |
|
|
249 |
x = merge(x); |
|
250 |
|
|
251 |
for(Interval i : x) |
|
252 |
{ |
|
253 |
System.out.println(i.getStart() + " " + i.getEnd()); |
|
254 |
} |
|
255 |
} |
|
256 |
|
|
257 |
public static ArrayList<Interval> merge(ArrayList<Interval> intervals) { |
|
258 |
|
|
259 |
if(intervals.size() == 0 || intervals.size() == 1) |
|
260 |
return intervals; |
|
261 |
|
|
262 |
Collections.sort(intervals, new IntervalComparator()); |
|
263 |
|
|
264 |
Interval first = intervals.get(0); |
|
265 |
int start = first.getStart(); |
|
266 |
int end = first.getEnd(); |
|
267 |
|
|
268 |
ArrayList<Interval> result = new ArrayList<Interval>(); |
|
269 |
|
|
270 |
for (int i = 1; i < intervals.size(); i++) { |
|
271 |
Interval current = intervals.get(i); |
|
272 |
if (current.getStart() <= end) { |
|
273 |
end = Math.max(current.getEnd(), end); |
|
274 |
} else { |
|
275 |
result.add(new Interval(start, end)); |
|
276 |
start = current.getStart(); |
|
277 |
end = current.getEnd(); |
|
278 |
} |
|
279 |
} |
|
280 |
|
|
281 |
result.add(new Interval(start, end)); |
|
282 |
return result; |
|
283 |
} |
|
284 |
} |
|
285 |
|
|
286 |
class Interval |
|
287 |
{ |
|
288 |
private int start; |
|
289 |
private int end; |
|
290 |
|
|
291 |
Interval() { |
|
292 |
start = 0; |
|
293 |
end = 0; |
|
294 |
} |
|
295 |
|
|
296 |
Interval(int s, int e) |
|
297 |
{ |
|
298 |
start = s; |
|
299 |
end = e; |
|
300 |
} |
|
301 |
|
|
302 |
public int getStart() { |
|
303 |
return start; |
|
304 |
} |
|
305 |
|
|
306 |
public int getEnd() { |
|
307 |
return end; |
|
308 |
} |
|
309 |
} |
|
310 |
|
|
311 |
class IntervalComparator implements Comparator<Interval> |
|
312 |
{ |
|
313 |
public int compare(Interval i1, Interval i2) |
|
314 |
{ |
|
315 |
return i1.getStart() - i2.getStart(); |
|
316 |
} |
|
317 |
} |
|
318 |
|
|
319 |
B) from http://www.programcreek.com/2012/12/leetcode-merge-intervals/ |
|
320 |
|
|
321 |
public List<Interval> merge(List<Interval> intervals) { |
|
322 |
List<Interval> result = new ArrayList<Interval>(); |
|
323 |
|
|
324 |
if(intervals==null||intervals.size()==0) |
|
325 |
return result; |
|
326 |
|
|
327 |
Collections.sort(intervals, new Comparator<Interval>(){ |
|
328 |
public int compare(Interval i1, Interval i2){ |
|
329 |
if(i1.start!=i2.start) |
|
330 |
return i1.start-i2.start; |
|
331 |
else |
|
332 |
return i1.end-i2.end; |
|
333 |
} |
|
334 |
}); |
|
335 |
|
|
336 |
Interval pre = intervals.get(0); |
|
337 |
for(int i=0; i<intervals.size(); i++){ |
|
338 |
Interval curr = intervals.get(i); |
|
339 |
if(curr.start>pre.end){ |
|
340 |
result.add(pre); |
|
341 |
pre = curr; |
|
342 |
}else{ |
|
343 |
Interval merged = new Interval(pre.start, Math.max(pre.end, curr.end)); |
|
344 |
pre = merged; |
|
345 |
} |
|
346 |
} |
|
347 |
result.add(pre); |
|
348 |
|
|
349 |
return result; |
|
350 |
} |
|
351 |
*/ |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/ExcelDir2XMLMacro.groovy (revision 2051) | ||
---|---|---|
1 |
package org.txm.macro.text |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
|
|
11 |
import org.apache.poi.ss.usermodel.* |
|
12 |
import org.apache.poi.hssf.usermodel.* |
|
13 |
import org.apache.poi.xssf.usermodel.* |
|
14 |
import org.apache.poi.ss.util.* |
|
15 |
|
|
16 |
@Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="") |
|
17 |
def inputDirectory |
|
18 |
|
|
19 |
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="") |
|
20 |
def sheetName |
|
21 |
|
|
22 |
@Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root") |
|
23 |
def rootTag |
|
24 |
|
|
25 |
@Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit") |
|
26 |
def textTag |
|
27 |
|
|
28 |
@Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2") |
|
29 |
def metadataColumnList |
|
30 |
|
|
31 |
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
32 |
def dateColumnList |
|
33 |
|
|
34 |
@Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2") |
|
35 |
def textColumnList |
|
36 |
|
|
37 |
if (!ParametersDialog.open(this)) return |
|
38 |
|
|
39 |
// The input directory must exist and be readable
if (!inputDirectory.exists()) {
	println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting."
	return false
}

if (!inputDirectory.canRead()) {
	println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting."
	return false
}

// Entries of the directory whose name ends with ".xlsx" (the dot is required,
// as announced by the message below)
def excelFiles = []
inputDirectory.eachFileMatch(~/.*\.xlsx/) { excelFiles << it }

if (excelFiles.size() == 0) {
	println "** ExcelDir2XML: no .xlsx file found. Aborting."
	return false
}

try {

	// Excel2XMLMacro is run once per file, in file name order, with the parameters
	// of this macro and the current selection/monitor objects
	excelFiles.sort { it.name }.each { inputFile ->

		res = gse.run(Excel2XMLMacro, ["args":[

			"inputFile":inputFile,
			"sheetName":sheetName,
			"metadataColumnList":metadataColumnList,
			"dateColumnList":dateColumnList,
			"textColumnList":textColumnList,
			"rootTag":rootTag,
			"textTag":textTag,

			"selection":selection,
			"selections":selections,
			"corpusViewSelection":corpusViewSelection,
			"corpusViewSelections":corpusViewSelections,
			"monitor":monitor]])
		if (!res) println "** problem calling Excel2XMLMacro."
	}

} catch (Exception e) {
	println "** ExcelDir2XML: unable to read input files. Aborting."
	println e.getLocalizedMessage()
	// printStackTrace() writes the trace itself and returns nothing:
	// wrapping it in println only added a "null" line
	e.printStackTrace()
	return false
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/Excel2XMLMacro.groovy (revision 2051) | ||
---|---|---|
1 |
package org.txm.macro.text |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
import java.text.SimpleDateFormat |
|
11 |
|
|
12 |
import org.apache.poi.ss.usermodel.* |
|
13 |
import org.apache.poi.hssf.usermodel.* |
|
14 |
import org.apache.poi.xssf.usermodel.* |
|
15 |
import org.apache.poi.ss.util.* |
|
16 |
|
|
17 |
// Normalizes a string: AsciiUtils.convertNonAscii(), then AsciiUtils.buildAttributeId(),
// then conversion to lower case
def stringToIndent = { str ->
	def ascii = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def identifier = org.txm.utils.AsciiUtils.buildAttributeId(ascii)
	return identifier.toLowerCase()
}
|
18 |
|
|
19 |
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi |
|
20 |
/**
 * Returns the content of a spreadsheet cell as a string.
 *
 *  - string cells: the result of cell.toString()
 *  - date-formatted numeric cells: the date written as dd/MM/yyyy
 *  - other numeric cells: whole numbers are written without decimal part
 *    ("2017"), other numbers keep their decimals ("3.75") instead of being
 *    truncated
 *  - boolean cells: "true" or "false"
 *  - blank cells, cells of any other type and null: the empty string
 *
 * The working values are local variables: undeclared ones would be stored in
 * the script binding.
 */
def getCellValueAsString = { cell ->

	if (cell == null) return ""

	def text = null

	switch (cell.getCellType()) {
		case Cell.CELL_TYPE_STRING:
			text = cell.toString()
			break
		case Cell.CELL_TYPE_NUMERIC:
			if (DateUtil.isCellDateFormatted(cell)) {
				text = new SimpleDateFormat("dd/MM/yyyy").format(cell.getDateCellValue())
			} else {
				Double value = cell.getNumericCellValue()
				if (value.isNaN() || value.isInfinite() || value == Math.rint(value)) {
					// whole (or non-finite) value: integer notation, no ".0" suffix
					text = String.valueOf(value.longValue())
				} else {
					// fractional value: plain decimal notation, never with an exponent
					text = BigDecimal.valueOf(value).toPlainString()
				}
			}
			break
		case Cell.CELL_TYPE_BOOLEAN:
			text = String.valueOf(cell.getBooleanCellValue())
			break
		case Cell.CELL_TYPE_BLANK:
			text = ""
			break
	}

	return (text == null) ? "" : text
}
|
48 |
|
|
49 |
// ---------------------------------------------------------------------------
// Macro parameters. Each field is filled in by the parameters dialog before
// the conversion starts. The list parameters are received as comma-separated
// strings.
// ---------------------------------------------------------------------------

// Excel file to convert (mandatory).
@Field
@Option(name="inputFile", widget="File", required=true, def="",
	usage="fichier Excel à traiter")
File inputFile

// Name of the sheet to read; when left empty the first sheet is used.
@Field
@Option(name="sheetName", widget="String", required=false, def="",
	usage="sheet name (if no name is given the first sheet will be used)")
def sheetName

// Name of the XML root element of the result file.
@Field
@Option(name="rootTag", widget="String", required=false, def="root",
	usage="root tag name")
def rootTag

// Name of the XML element written for each input row.
@Field
@Option(name="textTag", widget="String", required=false, def="unit",
	usage="line unit tag name")
def textTag

// Header names of the columns exported as attributes of the row element.
@Field
@Option(name="metadataColumnList", widget="String", required=false, def="meta1,meta2",
	usage="metadata columns list separated by comma")
def metadataColumnList

// Header names of the metadata columns that hold dd/MM/yyyy dates.
@Field
@Option(name="dateColumnList", widget="String", required=false, def="meta1,meta2",
	usage="date columns list separated by comma")
def dateColumnList

// Header names of the columns exported as text content of the row element.
@Field
@Option(name="textColumnList", widget="String", required=false, def="textColumnList1,textColumnList2",
	usage="text columns list separated by comma")
def textColumnList
|
69 |
|
|
70 |
// Show the parameters dialog; leave without a result when it is not validated.
def dialogValidated = ParametersDialog.open(this)
if (!dialogValidated) {
	return
}

// The input file must exist...
if (inputFile.exists() == false) {
	println "** Excel2XML: no '" + inputFile.getName() + "' file found. Aborting."
	return false
}

// ...and must be readable.
if (inputFile.canRead() == false) {
	println "** Excel2XML: '" + inputFile.getName() + "' file not readable. Aborting."
	return false
}
|
81 |
|
|
82 |
// Convert the selected sheet to XML: one <textTag> element per data row, with
// metadata columns as attributes and text columns as child elements. The
// result file is written next to the input file with the ".xml" extension.
// Returns true on success, false when the conversion is aborted.
//
// Variables assigned without 'def' are kept in the script binding, as before.
// 'output' and 'writer' are initialised here so the 'finally' clause can test
// them even when the failure happens before they are opened.
output = null
writer = null

try {

	// The list parameters arrive as comma-separated strings. Blank entries are
	// dropped so that an optional list can be left empty.
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textColumnList = textColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	// NOTE(review): the workbook is not closed here, as in the original code;
	// confirm the close() semantics of the POI version in use before adding it.
	wb = WorkbookFactory.create(inputFile)

	if (sheetName.length() == 0) {
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	nRows = ws.getPhysicalNumberOfRows()
	println nRows+" rows."

	// The first row holds the column headers; without it nothing can be mapped.
	firstRow = ws.getRow(0)
	if (firstRow == null) {
		println "** Excel2XML: no header row found. Aborting."
		return false
	}
	colMax = firstRow.getLastCellNum()

	// Half-open range guarded by colMax: an inclusive (0..colMax-1) range would
	// run backwards (0, -1) when the header row has no cell.
	headers = colMax > 0 ? (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) } : []

	println "Headers: $headers"

	normalizedHeaders = headers.collect { stringToIndent(it) }

	// Every requested metadata and text column must exist in the header row.
	ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	// Date columns are only processed while writing metadata attributes, so a
	// date column that is not also a metadata column has no effect.
	dateColumnList.each { d ->
		if (!metadataColumnList.contains(d)) {
			println "** Excel2XML: warning: date column '$d' is not a metadata column, it will be ignored"
		}
	}

	metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// Result file: same directory and base name as the input, ".xml" extension.
	name = inputFile.getName()
	idx = name.lastIndexOf(".")

	if (idx > 0) name = name.substring(0, idx)
	outputFile = new File(inputFile.getParentFile(), name+".xml")

	factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formatting
	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")

	// Formats used to derive the day-of-week attribute of date columns.
	def dateParser = new java.text.SimpleDateFormat("dd/MM/yyyy")
	def dayNameFormat = new java.text.SimpleDateFormat('EEEE')

	pb_n = 1

	// Rows are addressed by index, so the loop is bounded by the last row index
	// rather than by the count of defined rows, and undefined rows are skipped.
	// A plain 'for' is used because (1..nRows-1) runs backwards (1, 0) when the
	// sheet only contains the header row.
	def lastRowIndex = ws.getLastRowNum()
	for (int rowIndex = 1; rowIndex <= lastRowIndex; rowIndex++) {

		def row = ws.getRow(rowIndex)
		if (row == null) continue // row never defined in the sheet

		writer.writeCharacters(" ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formatting

		writer.writeCharacters(" ")
		writer.writeStartElement(textTag)
		metadataColumnIndex.each { colIndex -> // build an attribute for each metadata
			String s = getCellValueAsString(row.getCell(colIndex))
			if (s == null) s = ""
			def attrValue = s.replaceAll("\n", ";").trim()
			writer.writeAttribute(normalizedHeaders[colIndex], attrValue)
			if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
				def dateMatcher = (attrValue =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
				if (dateMatcher.find()) {
					def attrPrefix = normalizedHeaders[colIndex]
					writer.writeAttribute(attrPrefix+"jour", dateMatcher.group(1))
					writer.writeAttribute(attrPrefix+"joursemaine", dayNameFormat.format(dateParser.parse(dateMatcher.group(0))))
					writer.writeAttribute(attrPrefix+"mois", dateMatcher.group(2))
					writer.writeAttribute(attrPrefix+"annee", dateMatcher.group(3))
				} else if (attrValue.length() > 0) {
					// An empty or malformed date no longer aborts the whole export.
					println "** Excel2XML: warning: row "+(rowIndex+1)+", column '"+headers[colIndex]+"': '"+attrValue+"' is not a dd/MM/yyyy date"
				}
			}
		}
		writer.writeCharacters("\n")

		// Human readable copy of the metadata, as an unordered list.
		writer.writeCharacters(" ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		metadataColumnIndex.each { colIndex ->
			writer.writeStartElement("item")
			writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters(" ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		textColumnIndex.each { colIndex ->

			writer.writeCharacters(" ")
			writer.writeStartElement(normalizedHeaders[colIndex])
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(headers[colIndex]+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head
			def cellText = getCellValueAsString(row.getCell(colIndex))

			// Cells starting with a "XXX: value ;" entry are split into one
			// <descripteur> element per entry; other cells are written as is.
			if (cellText ==~ /(?s)^[A-Z]{3}: [^;\n]+? +[;\n].*/) {
				cellText.findAll( /(?s)[A-Z]{3}: ([^;\n]+?) +[;\n]/ ).each { desc ->
					writer.writeStartElement("descripteur")
					def descMatcher = (desc =~ /(?s)([A-Z]{3}): ([^;\n]+?) +[;\n]/)
					writer.writeAttribute("type", descMatcher[0][1])
					writer.writeCharacters(descMatcher[0][2])
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(cellText)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters(" ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}

	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")

	// Close explicitly on the success path so that a failing flush is reported.
	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // returns nothing: wrapping it in println printed a spurious "null"
	return false
} finally {
	// Release whatever is still open after a failure or an early return.
	// XMLStreamWriter.close() does not close the underlying stream, hence both.
	[writer, output].each { resource ->
		try {
			resource?.close()
		} catch (Exception ignored) {
			// best-effort cleanup: the primary outcome has already been reported
		}
	}
}

return true
Formats disponibles : Unified diff