Révision 2051
| tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/CQPSearchEngine.java (revision 2051) | ||
|---|---|---|
| 10 | 10 |
import org.txm.core.engines.EngineType; |
| 11 | 11 |
import org.txm.core.messages.TXMCoreMessages; |
| 12 | 12 |
import org.txm.core.results.TXMResult; |
| 13 |
import org.txm.importer.cwb.PatchCwbRegistry; |
|
| 14 | 13 |
import org.txm.libs.cqp.CQPLibPreferences; |
| 15 | 14 |
import org.txm.objects.CorpusBuild; |
| 16 | 15 |
import org.txm.objects.Match; |
| 17 |
import org.txm.objects.Project; |
|
| 18 | 16 |
import org.txm.searchengine.core.EmptySelection; |
| 19 | 17 |
import org.txm.searchengine.core.Query; |
| 20 | 18 |
import org.txm.searchengine.core.SearchEngine; |
| ... | ... | |
| 29 | 27 |
import org.txm.searchengine.cqp.corpus.query.CQLQuery; |
| 30 | 28 |
import org.txm.utils.OSDetector; |
| 31 | 29 |
import org.txm.utils.Sh; |
| 32 |
import org.txm.utils.io.IOUtils; |
|
| 33 | 30 |
import org.txm.utils.logger.Log; |
| 34 | 31 |
|
| 35 | 32 |
public class CQPSearchEngine extends SearchEngine {
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/macro/commands/NIndexMacro.groovy (revision 2051) | ||
|---|---|---|
| 1 |
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté |
|
| 2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses) |
|
| 3 |
// @author sheiden |
|
| 4 |
|
|
| 5 |
package org.txm.macro.commands |
|
| 6 |
|
|
| 7 |
import org.kohsuke.args4j.* |
|
| 8 |
import groovy.transform.Field |
|
| 9 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 10 |
import org.txm.searchengine.cqp.CQPSearchEngine |
|
| 11 |
import org.txm.Toolbox |
|
| 12 |
import org.txm.macro.cqp.CQPUtils |
|
| 13 |
import org.txm.searchengine.cqp.ICqiClient |
|
| 14 |
import org.odftoolkit.simple.SpreadsheetDocument |
|
| 15 |
import org.odftoolkit.odfdom.doc.table.OdfTable |
|
| 16 |
import org.odftoolkit.odfdom.doc.table.OdfTableRow |
|
| 17 |
import org.odftoolkit.odfdom.doc.table.OdfTableCell |
|
| 18 |
import org.odftoolkit.simple.table.Table |
|
| 19 |
import org.odftoolkit.simple.Document |
|
| 20 |
import org.odftoolkit.simple.style.Border |
|
| 21 |
import org.odftoolkit.simple.style.Font |
|
| 22 |
import org.odftoolkit.simple.style.StyleTypeDefinitions |
|
| 23 |
import org.odftoolkit.simple.style.StyleTypeDefinitions.FontStyle |
|
| 24 |
import org.odftoolkit.simple.table.Row |
|
| 25 |
import org.odftoolkit.simple.table.Cell |
|
| 26 |
import org.odftoolkit.simple.style.StyleTypeDefinitions.CellBordersType |
|
| 27 |
import org.odftoolkit.simple.style.StyleTypeDefinitions.HorizontalAlignmentType |
|
| 28 |
|
|
| 29 |
// BEGINNING OF PARAMETERS |
|
| 30 |
|
|
| 31 |
@Field @Option(name="inputFile", usage="input sequence list file (one sequence per line)", widget="File", required=true, def="") |
|
| 32 |
def inputFile |
|
| 33 |
|
|
| 34 |
@Field @Option(name="inputIsCQL", usage="input sequences are CQL queries", widget="Boolean", required=true, def="false") |
|
| 35 |
def inputIsCQL |
|
| 36 |
|
|
| 37 |
@Field @Option(name="inputWordProperty", usage="implicit word property to use", widget="String", required=true, def="word") |
|
| 38 |
String inputWordProperty |
|
| 39 |
|
|
| 40 |
@Field @Option(name="outputFile", usage="results output File (TSV format)", widget="File", required=true, def="") |
|
| 41 |
def outputFile |
|
| 42 |
|
|
| 43 |
@Field @Option(name="odsOutputFormat", usage="use ODS as output format", widget="Boolean", required=true, def="false") |
|
| 44 |
def odsOutputFormat |
|
| 45 |
|
|
| 46 |
@Field @Option(name="outputWordProperty", usage="output word property to use", widget="String", required=true, def="word") |
|
| 47 |
String outputWordProperty |
|
| 48 |
|
|
| 49 |
@Field @Option(name="groupByQuery", usage="provide frequency of each CQL match values or just the total frequency of each CQL match", widget="Boolean", required=true, def="false") |
|
| 50 |
def groupByQuery |
|
| 51 |
|
|
| 52 |
// Open the parameters input dialog box |
|
| 53 |
if (!ParametersDialog.open(this)) return |
|
| 54 |
|
|
| 55 |
// END OF PARAMETERS |
|
| 56 |
|
|
| 57 |
def scriptName = this.class.getSimpleName() |
|
| 58 |
|
|
| 59 |
if (!(inputFile && inputFile.isFile() && inputFile.canRead() )) {
|
|
| 60 |
|
|
| 61 |
println "** $scriptName: impossible to read input file. Aborting." |
|
| 62 |
return false |
|
| 63 |
} |
|
| 64 |
|
|
| 65 |
corpusEngine = CQPSearchEngine.getCqiClient() |
|
| 66 |
|
|
| 67 |
utils = new CQPUtils() |
|
| 68 |
|
|
| 69 |
corpora = utils.getCorpora(this) |
|
| 70 |
|
|
| 71 |
if ((corpora == null) || corpora.size() == 0) {
|
|
| 72 |
println "** $scriptName: please select a corpus in the Corpus view or provide a corpus name. Aborting." |
|
| 73 |
return false |
|
| 74 |
} |
|
| 75 |
|
|
| 76 |
/**
 * Returns a name identifying the machine the macro runs on.
 *
 * The COMPUTERNAME environment variable is used when defined, then HOSTNAME;
 * otherwise the name reported by InetAddress.getLocalHost() is returned.
 *
 * @return the machine name
 */
def getComputerName() {
	// declared locally: an undeclared assignment here would create a script-binding variable
	def env = System.getenv()
	if (env.containsKey("COMPUTERNAME")) {
		return env.get("COMPUTERNAME")
	}
	if (env.containsKey("HOSTNAME")) {
		return env.get("HOSTNAME")
	}
	return InetAddress.getLocalHost().getHostName()
}
|
| 86 |
|
|
| 87 |
def fullName = outputFile.getName() |
|
| 88 |
def i = fullName.lastIndexOf(".")
|
|
| 89 |
def name |
|
| 90 |
if (i == -1) {
|
|
| 91 |
name = fullName |
|
| 92 |
} else {
|
|
| 93 |
name = fullName.substring(0, i) |
|
| 94 |
} |
|
| 95 |
|
|
| 96 |
if (odsOutputFormat) {
|
|
| 97 |
|
|
| 98 |
try {
|
|
| 99 |
Class.forName( "org.odftoolkit.simple.SpreadsheetDocument") |
|
| 100 |
} catch(e) {
|
|
| 101 |
println "** $scriptName: the simple-odf-0.8.1-incubating.jar file is needed in the \$HOME/TXM/scripts/lib directory to write in ODS format. Please uncheck the odsOutputFormat option. Aborting." |
|
| 102 |
return false |
|
| 103 |
} |
|
| 104 |
|
|
| 105 |
outputFile = new File(outputFile.getParentFile(), name+".ods") |
|
| 106 |
ods = SpreadsheetDocument.newSpreadsheetDocument() |
|
| 107 |
table = ods.getTableByName("Sheet1")
|
|
| 108 |
font = new Font("Arial", FontStyle.BOLD, 11)
|
|
| 109 |
|
|
| 110 |
properties = Table.newTable(ods); |
|
| 111 |
properties.setTableName("Properties")
|
|
| 112 |
setStringCellBold(properties, 0, 0, "Name") |
|
| 113 |
setStringCellBold(properties, 0, 1, "Value") |
|
| 114 |
line = 1 |
|
| 115 |
setStringCell2(properties, line++, 0, "date", new Date().format("dd/MM/yyyy"))
|
|
| 116 |
setStringCell2(properties, line++, 0, "time", new Date().format("HH'h'mm"))
|
|
| 117 |
setStringCell2(properties, line++, 0, "machine", getComputerName()) |
|
| 118 |
setStringCell2(properties, line++, 0, "user", System.getProperty("user.name"))
|
|
| 119 |
setStringCell2(properties, line++, 0, "macro", scriptName) |
|
| 120 |
setStringCell2(properties, line++, 0, "corpora", corpora.collect { it.getName() }.join(","))
|
|
| 121 |
setStringCell2(properties, line++, 0, "inputFile", inputFile.getAbsolutePath()) |
|
| 122 |
setStringCell2(properties, line++, 0, "inputIsCQL", inputIsCQL.toString()) |
|
| 123 |
setStringCell2(properties, line++, 0, "inputWordProperty", inputWordProperty) |
|
| 124 |
setStringCell2(properties, line++, 0, "outputFile", outputFile.getAbsolutePath()) |
|
| 125 |
setStringCell2(properties, line++, 0, "odsOutputFormat", odsOutputFormat.toString()) |
|
| 126 |
setStringCell2(properties, line++, 0, "outputWordProperty", outputWordProperty) |
|
| 127 |
setStringCell2(properties, line++, 0, "groupByQuery", groupByQuery.toString()) |
|
| 128 |
|
|
| 129 |
} else {
|
|
| 130 |
outputFile = new File(outputFile.getParentFile(), name+".tsv") |
|
| 131 |
output = new FileWriter(outputFile) |
|
| 132 |
} |
|
| 133 |
|
|
| 134 |
/**
 * Fills one three-column result row of a sheet.
 *
 * @param table     sheet offering getCellByPosition(column, row)
 * @param row       row index to write
 * @param corpus    text written in column 0
 * @param string    text written in column 1
 * @param frequency number written in column 2
 */
def setRow(table, row, corpus, string, frequency) {
	// no shared 'cell' variable: each cell is addressed directly so nothing leaks into the script binding
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string)
	table.getCellByPosition(2, row).setDoubleValue(frequency)
}
|
| 142 |
|
|
| 143 |
/**
 * Fills one four-column result row of a sheet.
 *
 * @param table     sheet offering getCellByPosition(column, row)
 * @param row       row index to write
 * @param corpus    text written in column 0
 * @param string1   text written in column 1
 * @param string2   text written in column 2; replaced by an empty string when frequency is 0
 * @param frequency number written in column 3
 */
def setRow2(table, row, corpus, string1, string2, frequency) {
	// cells are addressed directly so that no 'cell' variable leaks into the script binding
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string1)
	table.getCellByPosition(2, row).setStringValue((frequency == 0) ? '' : string2)
	table.getCellByPosition(3, row).setDoubleValue(frequency)
}
|
| 153 |
|
|
| 154 |
/**
 * Writes a text value in the cell at (col, row).
 *
 * @param table  sheet offering getCellByPosition(column, row)
 * @param row    row index
 * @param col    column index
 * @param string text to store
 * @return the cell that was written
 */
def setStringCell(table, row, col, string) {
	// local variable: keeps the helper from overwriting a 'cell' variable of the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	return target
}
|
| 159 |
|
|
| 160 |
/**
 * Writes a text value in the cell at (col, row) and applies the script-level 'font' to it.
 *
 * @param table  sheet offering getCellByPosition(column, row)
 * @param row    row index
 * @param col    column index
 * @param string text to store
 * @return the cell that was written
 */
def setStringCellBold(table, row, col, string) {
	// local variable: keeps the helper from overwriting a 'cell' variable of the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	// 'font' is the script variable initialised when ODS output is prepared
	target.setFont(font)
	return target
}
|
| 166 |
|
|
| 167 |
/**
 * Writes two text values side by side: string1 at (col, row) and string2 at (col+1, row).
 *
 * @param table   sheet offering getCellByPosition(column, row)
 * @param row     row index
 * @param col     column index of the first value
 * @param string1 text for the first cell
 * @param string2 text for the cell immediately to the right
 * @return the second (right-hand) cell
 */
def setStringCell2(table, row, col, string1, string2) {
	table.getCellByPosition(col, row).setStringValue(string1)
	// local variable: nothing is written to the script binding
	def right = table.getCellByPosition(col + 1, row)
	right.setStringValue(string2)
	return right
}
|
| 174 |
|
|
| 175 |
if (odsOutputFormat) {
|
|
| 176 |
if (groupByQuery) {
|
|
| 177 |
setStringCellBold(table, 0, 0, "corpus") |
|
| 178 |
setStringCellBold(table, 0, 1, inputIsCQL?"CQL":outputWordProperty+" query") |
|
| 179 |
cell = setStringCellBold(table, 0, 2, "f") |
|
| 180 |
cell.setHorizontalAlignment(HorizontalAlignmentType.RIGHT) |
|
| 181 |
} else {
|
|
| 182 |
setStringCellBold(table, 0, 0, "corpus") |
|
| 183 |
setStringCellBold(table, 0, 1, inputIsCQL?"CQL":outputWordProperty+" query") |
|
| 184 |
setStringCellBold(table, 0, 2, outputWordProperty+" values") |
|
| 185 |
cell = setStringCellBold(table, 0, 3, "f") |
|
| 186 |
cell.setHorizontalAlignment(HorizontalAlignmentType.RIGHT) |
|
| 187 |
} |
|
| 188 |
} else {
|
|
| 189 |
if (groupByQuery) {
|
|
| 190 |
output.println "corpus\t"+(inputIsCQL?"CQL":outputWordProperty+" query")+"\tf" |
|
| 191 |
} else {
|
|
| 192 |
output.println "corpus\t"+(inputIsCQL?"CQL":outputWordProperty+" query")+"\t"+(outputWordProperty+" values")+"\tf" |
|
| 193 |
} |
|
| 194 |
} |
|
| 195 |
|
|
| 196 |
// Next sheet row to fill (ODS output); row 0 already holds the column headers.
row = 1

try {
	corpora.each { corpus ->

		def corpusName = corpus.getName()
		def seqN = 0

		inputFile.eachLine() { line ->

			// a subcorpus name is consumed for every input line, empty ones included
			def subCorpus = "NIndex" + seqN++

			if (line.length() > 0) {

				// either the line is already a CQL query, or each blank-separated
				// token becomes one [property="token"] position of a sequence query
				def cql
				if (inputIsCQL) {
					cql = line
				} else {
					cql = "[$inputWordProperty=\""+line.replaceAll("\\p{Blank}+", "\"] [$inputWordProperty=\"")+"\"]"
				}

				def qualifiedName = "$corpusName:$subCorpus"
				corpusEngine.cqpQuery(corpusName, "$subCorpus", cql)
				def nmatches = corpusEngine.subCorpusSize(qualifiedName)

				if (groupByQuery) {
					// one row per query holding the total frequency (0 when nothing matched)
					if (odsOutputFormat) {
						setRow(table, row++, corpusName, line, nmatches)
					} else {
						output.println corpusName+"\t"+line+"\t$nmatches"
					}
				} else if (nmatches == 0) {
					// no match: a single row with an empty value column and frequency 0
					if (odsOutputFormat) {
						setRow2(table, row++, corpusName, line, line, 0)
					} else {
						output.println corpusName+"\t"+line+"\t\t0"
					}
				} else {
					// one row per distinct matched value, most frequent first
					def starts = corpusEngine.dumpSubCorpus(qualifiedName, ICqiClient.CQI_CONST_FIELD_MATCH, 0, nmatches-1)
					def ends = corpusEngine.dumpSubCorpus(qualifiedName, ICqiClient.CQI_CONST_FIELD_MATCHEND, 0, nmatches-1)
					[starts, ends].transpose().collect {
						corpusEngine.cpos2Str("$corpusName.$outputWordProperty", (it[0]..it[1]) as int[]).join(' ')
					}.countBy { it }.sort { -it.value }.each {
						if (odsOutputFormat) {
							setRow2(table, row++, corpusName, line, it.key, it.value)
						} else {
							output.println corpusName+"\t"+line+"\t"+it.key+"\t"+it.value
						}
					}
				}
			}
		}
	}
} finally {
	// the TSV writer is released even when a query fails midway
	if (!odsOutputFormat) {
		output.close()
	}
}

// the spreadsheet is only written once every query succeeded
if (odsOutputFormat) {
	ods.save(outputFile)
}

return true
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/macro/cqp/CQPUtils.groovy (revision 2051) | ||
|---|---|---|
| 1 |
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté |
|
| 2 |
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses) |
|
| 3 |
// @author sheiden |
|
| 4 |
|
|
| 5 |
package org.txm.macro.cqp |
|
| 6 |
|
|
| 7 |
import org.txm.searchengine.cqp.corpus.* |
|
| 8 |
|
|
| 9 |
/**
 * Looks a corpus up by name.
 *
 * The corpora returned by the corpus manager are searched first; when none
 * carries the name, the direct subcorpora of each of them are searched in turn.
 *
 * @param name the corpus or subcorpus name
 * @return the first match, or null when nothing has that name
 */
def getCorpusByName(name) {
	def candidates = CorpusManager.getCorpusManager().getCorpora()

	def direct = candidates.find { it.getName() == name }
	if (direct) return direct

	// first subcorpus with that name, scanning the corpora in order
	return candidates.findResult { parent ->
		parent.getSubcorpora().find { child -> child.getName() == name }
	}
}
|
| 28 |
|
|
| 29 |
/**
 * Resolves the corpora a macro should work on.
 *
 * When the calling script was given an "args" binding variable, its "corpus"
 * entry (one name) or else its "corpora" entry (comma-separated names) is
 * resolved with getCorpusByName(); names that match no corpus are dropped so
 * that callers never receive null entries. Otherwise the Corpus view selection
 * of the script is used.
 *
 * @param script the calling script (its binding and corpusViewSelection(s) are read)
 * @return the list of corpora (possibly empty), or null when the current
 *         selection is neither a Subcorpus nor a MainCorpus
 */
def getCorpora(def script) {

	def args = script.binding.variables["args"]
	if (args) {
		if (args["corpus"]) {
			return [getCorpusByName(args["corpus"])].findAll { it != null }
		}
		if (args["corpora"]) {
			// names are trimmed so that "A, B" is accepted as well as "A,B"
			return args["corpora"].tokenize(',').collect { getCorpusByName(it.trim()) }.findAll { it != null }
		}
	}

	// no corpus named explicitly: fall back to what is selected in the Corpus view
	def selected = script.corpusViewSelection
	if (selected == null || !(selected instanceof Subcorpus || selected instanceof MainCorpus)) {
		return null
	}
	return script.corpusViewSelections.findAll { it instanceof Subcorpus || it instanceof MainCorpus }
}
|
| 55 |
|
|
| 56 |
/**
 * Proposes a subcorpus name of the form "CORPUS<n>" that is not yet used
 * by any subcorpus of the named corpus, with n the smallest integer >= 1.
 *
 * @param name name of the parent corpus, looked up among the corpus manager's corpora
 * @return the free name, or null (after printing a message) when the corpus is unknown
 */
def newTestSubCorpusName(name) {
	def parent = CorpusManager.getCorpusManager().getCorpora().find { it.getName() == name }

	if (!parent) {
		println "** corpus '"+name+"' not found."
		return null
	}

	def existing = parent.getSubcorpora()
	def n = 1
	// advance until no subcorpus is called CORPUS<n>
	while (existing.any { it.getName() == "CORPUS"+n }) {
		n++
	}

	return "CORPUS"+n
}
|
| 79 |
|
|
| 80 |
/**
 * Builds a CQL query selecting the given corpus positions.
 *
 * The input is a comma-separated list of items, each either a single
 * position ("6") or a "start-end" interval ("2-3"). Singletons are targeted
 * through the label n1; intervals spanning L extra tokens are grouped under
 * the label n(L+1). Example:
 *   "0,1,2-3,4-5,6"  ->  "n1:[]|n2:[][]{1}::n1=0|n1=1|n1=6|n2=2|n2=4"
 *
 * Blanks around items and bounds are ignored. An empty input yields "::".
 *
 * @param matches comma-separated positions and intervals
 * @return the CQL query string
 */
def positions2cql(matches) {

	def singletons = []
	def intervals = [:] // (end - start) -> list of start positions, in order of first appearance

	matches.tokenize(',').each { token ->
		def item = token.trim()
		if (item.indexOf('-') != -1) { // start-end
			// explicit integer parsing: casting a list of strings to int[] does not parse numbers
			def bounds = item.tokenize('-').collect { it.trim().toInteger() }
			def start = bounds[0]
			def end = bounds[1]
			def span = end - start
			if (intervals[span]) {
				intervals[span] << start
			} else {
				intervals.put(span, [start])
			}
		} else { // singleton
			singletons << item.toInteger()
		}
	}

	// left-hand side: one labelled pattern per kind of match
	def patterns = []
	// right-hand side: one "label=position" constraint per match
	def constraints = []

	if (singletons.size() > 0) {
		patterns << "n1:[]"
		singletons.each { constraints << "n1="+it }
	}

	intervals.each { length, positions ->
		def label = "n"+(length+1)
		patterns << label+":[][]{"+length+"}"
		positions.each { constraints << label+"="+it }
	}

	return patterns.join("|") + "::" + constraints.join("|")
}
|
| 152 |
|
|
| 153 |
/**
 * Renders the matches of a corpus as a list of positions.
 *
 * Each match is written "start-end" when its start is lower than its end,
 * and as its start position alone otherwise. At most maxMatches matches are
 * listed; "..." is appended when there are more.
 *
 * @param corpus     object whose getMatches() returns the matches to render
 * @param maxMatches maximum number of matches to list
 * @param pretty     when true items are separated by ", " instead of ","
 * @return the rendered list, or "<empty>" when there is no match
 */
def corpus2positions(def corpus, def maxMatches, def pretty = false) {

	def all = corpus.getMatches()
	if (all.size() == 0) {
		return "<empty>"
	}

	def separator = pretty ? ", " : ","
	def rendered = all.take(maxMatches).collect { m ->
		(m.getStart() < m.getEnd()) ? m.getStart()+"-"+m.getEnd() : ""+m.getStart()
	}.join(separator)

	// signal that the list was cut short
	return (all.size() > maxMatches) ? rendered + "..." : rendered
}
|
| 186 |
|
|
| 187 |
// Executed only when this file is run as a script: lists the helper methods
// it declares (synthetic ones and the script entry points excluded) and tells
// the user how to import them. The import line names this class, CQPUtils.
methods = this.getClass().declaredMethods.findAll { !it.synthetic }.name.sort().unique()
methods.removeAll(['main', 'run'])
println "** CQPUtils is not meant to be called directly.\nPlease import its methods ${methods}\nwith the following declaration:\nimport org.txm.macro.cqp.CQPUtils"
|
|
| 190 |
|
|
| 191 |
/* |
|
| 192 |
___ ___ ___ ___ ___ ___ |
|
| 193 |
/\ \ /\__\ /\__\ /\ \ |\__\ /\ \ |
|
| 194 |
/::\ \ /::| | /::| | /::\ \ |:| | /::\ \ |
|
| 195 |
/:/\:\ \ /:|:| | /:|:| | /:/\:\ \ |:| | /:/\:\ \ |
|
| 196 |
/::\~\:\ \ /:/|:| |__ /:/|:| |__ /::\~\:\ \ |:|__|__ /::\~\:\ \ |
|
| 197 |
/:/\:\ \:\__\ /:/ |:| /\__\ /:/ |:| /\__\ /:/\:\ \:\__\ ____/::::\__\ /:/\:\ \:\__\ |
|
| 198 |
\/__\:\/:/ / \/__|:|/:/ / \/__|:|/:/ / \:\~\:\ \/__/ \::::/~~/~ \:\~\:\ \/__/ |
|
| 199 |
\::/ / |:/:/ / |:/:/ / \:\ \:\__\ ~~|:|~~| \:\ \:\__\ |
|
| 200 |
/:/ / |::/ / |::/ / \:\ \/__/ |:| | \:\ \/__/ |
|
| 201 |
/:/ / /:/ / /:/ / \:\__\ |:| | \:\__\ |
|
| 202 |
\/__/ \/__/ \/__/ \/__/ \|__| \/__/ |
|
| 203 |
|
|
| 204 |
|
|
| 205 |
|
|
| 206 |
0 1 2 3 4 5 6 7 8 9 |
|
| 207 |
|
|
| 208 |
+ + + + |
|
| 209 |
0 1 2 3 4 5 6 7 8 9 |
|
| 210 |
+0,+1,+2,+3 |
|
| 211 |
n:[] :: n = 0|n = 1|n = 2|n = 3 |
|
| 212 |
|
|
| 213 |
[ ] [ ] |
|
| 214 |
0 1 2 3 4 5 6 7 8 9 |
|
| 215 |
0-1,2-3 |
|
| 216 |
n:[] []{1} :: n = 0
|
|
| 217 |
n:[] []{1} :: n = 2
|
|
| 218 |
|
|
| 219 |
[ ] + |
|
| 220 |
0 1 2 3 4 5 6 7 8 9 |
|
| 221 |
0-2,+3 |
|
| 222 |
n:[] []{2} :: n = 0
|
|
| 223 |
n:[] :: n = 3 |
|
| 224 |
|
|
| 225 |
|
|
| 226 |
*/ |
|
| 227 |
|
|
| 228 |
/* |
|
| 229 |
|
|
| 230 |
Touching intervals mergers |
|
| 231 |
========================== |
|
| 232 |
|
|
| 233 |
A) from http://stackoverflow.com/questions/31670849/merge-overlapping-intervals |
|
| 234 |
|
|
| 235 |
import java.util.*; |
|
| 236 |
|
|
| 237 |
public class Ideone |
|
| 238 |
{
|
|
| 239 |
public static void main (String[] args) throws java.lang.Exception |
|
| 240 |
{
|
|
| 241 |
ArrayList<Interval> x = new ArrayList<>(); |
|
| 242 |
|
|
| 243 |
x.add(new Interval(1, 3)); |
|
| 244 |
x.add(new Interval(2, 6)); |
|
| 245 |
x.add(new Interval(8, 10)); |
|
| 246 |
x.add(new Interval(15, 18)); |
|
| 247 |
x.add(new Interval(17, 20)); |
|
| 248 |
|
|
| 249 |
x = merge(x); |
|
| 250 |
|
|
| 251 |
for(Interval i : x) |
|
| 252 |
{
|
|
| 253 |
System.out.println(i.getStart() + " " + i.getEnd()); |
|
| 254 |
} |
|
| 255 |
} |
|
| 256 |
|
|
| 257 |
public static ArrayList<Interval> merge(ArrayList<Interval> intervals) {
|
|
| 258 |
|
|
| 259 |
if(intervals.size() == 0 || intervals.size() == 1) |
|
| 260 |
return intervals; |
|
| 261 |
|
|
| 262 |
Collections.sort(intervals, new IntervalComparator()); |
|
| 263 |
|
|
| 264 |
Interval first = intervals.get(0); |
|
| 265 |
int start = first.getStart(); |
|
| 266 |
int end = first.getEnd(); |
|
| 267 |
|
|
| 268 |
ArrayList<Interval> result = new ArrayList<Interval>(); |
|
| 269 |
|
|
| 270 |
for (int i = 1; i < intervals.size(); i++) {
|
|
| 271 |
Interval current = intervals.get(i); |
|
| 272 |
if (current.getStart() <= end) {
|
|
| 273 |
end = Math.max(current.getEnd(), end); |
|
| 274 |
} else {
|
|
| 275 |
result.add(new Interval(start, end)); |
|
| 276 |
start = current.getStart(); |
|
| 277 |
end = current.getEnd(); |
|
| 278 |
} |
|
| 279 |
} |
|
| 280 |
|
|
| 281 |
result.add(new Interval(start, end)); |
|
| 282 |
return result; |
|
| 283 |
} |
|
| 284 |
} |
|
| 285 |
|
|
| 286 |
class Interval |
|
| 287 |
{
|
|
| 288 |
private int start; |
|
| 289 |
private int end; |
|
| 290 |
|
|
| 291 |
Interval() {
|
|
| 292 |
start = 0; |
|
| 293 |
end = 0; |
|
| 294 |
} |
|
| 295 |
|
|
| 296 |
Interval(int s, int e) |
|
| 297 |
{
|
|
| 298 |
start = s; |
|
| 299 |
end = e; |
|
| 300 |
} |
|
| 301 |
|
|
| 302 |
public int getStart() {
|
|
| 303 |
return start; |
|
| 304 |
} |
|
| 305 |
|
|
| 306 |
public int getEnd() {
|
|
| 307 |
return end; |
|
| 308 |
} |
|
| 309 |
} |
|
| 310 |
|
|
| 311 |
class IntervalComparator implements Comparator<Interval> |
|
| 312 |
{
|
|
| 313 |
public int compare(Interval i1, Interval i2) |
|
| 314 |
{
|
|
| 315 |
return i1.getStart() - i2.getStart(); |
|
| 316 |
} |
|
| 317 |
} |
|
| 318 |
|
|
| 319 |
B) from http://www.programcreek.com/2012/12/leetcode-merge-intervals/ |
|
| 320 |
|
|
| 321 |
public List<Interval> merge(List<Interval> intervals) {
|
|
| 322 |
List<Interval> result = new ArrayList<Interval>(); |
|
| 323 |
|
|
| 324 |
if(intervals==null||intervals.size()==0) |
|
| 325 |
return result; |
|
| 326 |
|
|
| 327 |
Collections.sort(intervals, new Comparator<Interval>(){
|
|
| 328 |
public int compare(Interval i1, Interval i2){
|
|
| 329 |
if(i1.start!=i2.start) |
|
| 330 |
return i1.start-i2.start; |
|
| 331 |
else |
|
| 332 |
return i1.end-i2.end; |
|
| 333 |
} |
|
| 334 |
}); |
|
| 335 |
|
|
| 336 |
Interval pre = intervals.get(0); |
|
| 337 |
for(int i=0; i<intervals.size(); i++){
|
|
| 338 |
Interval curr = intervals.get(i); |
|
| 339 |
if(curr.start>pre.end){
|
|
| 340 |
result.add(pre); |
|
| 341 |
pre = curr; |
|
| 342 |
}else{
|
|
| 343 |
Interval merged = new Interval(pre.start, Math.max(pre.end, curr.end)); |
|
| 344 |
pre = merged; |
|
| 345 |
} |
|
| 346 |
} |
|
| 347 |
result.add(pre); |
|
| 348 |
|
|
| 349 |
return result; |
|
| 350 |
} |
|
| 351 |
*/ |
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/ExcelDir2XMLMacro.groovy (revision 2051) | ||
|---|---|---|
| 1 |
package org.txm.macro.text |
|
| 2 |
|
|
| 3 |
import org.kohsuke.args4j.* |
|
| 4 |
import groovy.transform.Field |
|
| 5 |
import java.nio.charset.Charset |
|
| 6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 7 |
import org.txm.utils.* |
|
| 8 |
import javax.xml.stream.* |
|
| 9 |
import java.net.URL |
|
| 10 |
|
|
| 11 |
import org.apache.poi.ss.usermodel.* |
|
| 12 |
import org.apache.poi.hssf.usermodel.* |
|
| 13 |
import org.apache.poi.xssf.usermodel.* |
|
| 14 |
import org.apache.poi.ss.util.* |
|
| 15 |
|
|
| 16 |
@Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="") |
|
| 17 |
def inputDirectory |
|
| 18 |
|
|
| 19 |
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="") |
|
| 20 |
def sheetName |
|
| 21 |
|
|
| 22 |
@Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root") |
|
| 23 |
def rootTag |
|
| 24 |
|
|
| 25 |
@Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit") |
|
| 26 |
def textTag |
|
| 27 |
|
|
| 28 |
@Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2") |
|
| 29 |
def metadataColumnList |
|
| 30 |
|
|
| 31 |
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2") |
|
| 32 |
def dateColumnList |
|
| 33 |
|
|
| 34 |
@Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2") |
|
| 35 |
def textColumnList |
|
| 36 |
|
|
| 37 |
if (!ParametersDialog.open(this)) return |
|
| 38 |
|
|
| 39 |
if (!inputDirectory.exists()) {
|
|
| 40 |
println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting." |
|
| 41 |
return false |
|
| 42 |
} |
|
| 43 |
|
|
| 44 |
if (!inputDirectory.canRead()) {
|
|
| 45 |
println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting." |
|
| 46 |
return false |
|
| 47 |
} |
|
| 48 |
|
|
| 49 |
def f = [] |
|
| 50 |
inputDirectory.eachFileMatch(~/.*xlsx/) { f << it }
|
|
| 51 |
|
|
| 52 |
if (f.size() == 0) {
|
|
| 53 |
println "** ExcelDir2XML: no .xlsx file found. Aborting." |
|
| 54 |
return false |
|
| 55 |
} |
|
| 56 |
|
|
| 57 |
// Run Excel2XMLMacro on every workbook found, in file-name order, forwarding
// this macro's parameters together with the current selections and monitor.
try {

	f.sort { it.name }.each { inputFile ->

		def res = gse.run(Excel2XMLMacro, ["args":[

			"inputFile":inputFile,
			"sheetName":sheetName,
			"metadataColumnList":metadataColumnList,
			"dateColumnList":dateColumnList,
			"textColumnList":textColumnList,
			"rootTag":rootTag,
			"textTag":textTag,

			"selection":selection,
			"selections":selections,
			"corpusViewSelection":corpusViewSelection,
			"corpusViewSelections":corpusViewSelections,
			"monitor":monitor]])
		if (!res) println "** problem calling Excel2XMLMacro."
	}

} catch (Exception e) {
	println "** ExcelDir2XML: unable to read input files. Aborting."
	println e.getLocalizedMessage()
	// printStackTrace() returns nothing: wrapping it in println only printed a stray "null"
	e.printStackTrace()
	return false
}

return true
|
| tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/Excel2XMLMacro.groovy (revision 2051) | ||
|---|---|---|
| 1 |
package org.txm.macro.text |
|
| 2 |
|
|
| 3 |
import org.kohsuke.args4j.* |
|
| 4 |
import groovy.transform.Field |
|
| 5 |
import java.nio.charset.Charset |
|
| 6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 7 |
import org.txm.utils.* |
|
| 8 |
import javax.xml.stream.* |
|
| 9 |
import java.net.URL |
|
| 10 |
import java.text.SimpleDateFormat |
|
| 11 |
|
|
| 12 |
import org.apache.poi.ss.usermodel.* |
|
| 13 |
import org.apache.poi.hssf.usermodel.* |
|
| 14 |
import org.apache.poi.xssf.usermodel.* |
|
| 15 |
import org.apache.poi.ss.util.* |
|
| 16 |
|
|
| 17 |
// Normalises a label: AsciiUtils.convertNonAscii, then AsciiUtils.buildAttributeId, then lower case.
def stringToIndent = { str ->
	def asciiOnly = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def identifier = org.txm.utils.AsciiUtils.buildAttributeId(asciiOnly)
	return identifier.toLowerCase()
}
|
|
| 18 |
|
|
| 19 |
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi |
|
| 20 |
// Returns the content of a spreadsheet cell as text, never null:
//  - string cells: the cell's toString() value
//  - date-formatted numeric cells: the date formatted as dd/MM/yyyy
//  - other numeric cells: the value converted to a long (any fractional part is dropped)
//  - boolean cells: "true" or "false"
//  - blank cells, other cell types and null cells: ""
def getCellValueAsString = { cell ->
	if (cell == null) {
		return ""
	}

	// every value is local: nothing is written to the script binding
	switch (cell.getCellType()) {
		case Cell.CELL_TYPE_STRING:
			def text = cell.toString()
			return (text == null) ? "" : text
		case Cell.CELL_TYPE_NUMERIC:
			if (DateUtil.isCellDateFormatted(cell)) {
				return new SimpleDateFormat("dd/MM/yyyy").format(cell.getDateCellValue())
			}
			// NOTE(review): non-integer numbers lose their decimals here — confirm this is intended
			return String.valueOf(cell.getNumericCellValue().longValue())
		case Cell.CELL_TYPE_BOOLEAN:
			return String.valueOf(cell.getBooleanCellValue())
		default:
			return ""
	}
}
|
| 48 |
|
|
| 49 |
// ---------------------------------------------------------------------------
// Macro parameters
// ---------------------------------------------------------------------------

// Excel workbook to convert (mandatory).
@Field
@Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
File inputFile

// Name of the sheet to read; when left empty the first sheet of the workbook is used.
@Field
@Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

// Name of the XML element wrapping the whole result document.
@Field
@Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
def rootTag

// Name of the XML element created for each data row of the sheet.
@Field
@Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
def textTag

// Comma separated header names of the columns written as attributes of each row element.
@Field
@Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

// Comma separated header names of the metadata columns holding dd/MM/yyyy dates;
// these are additionally split into day, weekday, month and year attributes.
@Field
@Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

// Comma separated header names of the columns written as text content elements.
@Field
@Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList
|
| 69 |
|
|
| 70 |
// ---------------------------------------------------------------------------
// Excel2XML main script body.
//
// Reads one sheet of the Excel file 'inputFile' and writes '<input name>.xml'
// next to it:
//   - the first row of the sheet provides the column headers;
//   - each following row becomes one <textTag> element preceded by a <pb/>;
//   - metadata columns become attributes of <textTag> plus a <metadata><list>;
//   - date columns (dd/MM/yyyy) get extra jour/joursemaine/mois/annee attributes;
//   - text columns become child elements holding a <p> with the cell content.
//
// Returns true on success, false on any failure, and nothing (null) when the
// parameters dialog is cancelled.
// ---------------------------------------------------------------------------

if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// Declared outside of the try block so that the finally clause can release them
// when the conversion fails half-way.
def output = null
def writer = null

try {

	// The column lists arrive as comma separated strings. Blank entries (empty
	// parameter, trailing comma...) are dropped instead of being looked up as headers.
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textColumnList = textColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	def wb = WorkbookFactory.create(inputFile)

	def ws = null
	if (sheetName.length() == 0) {
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' sheet found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	def nRows = ws.getPhysicalNumberOfRows()
	println nRows+" rows."

	// The header row is mandatory. Guarding here also avoids building a Groovy
	// reverse range (0..-1) when the first row is missing or empty.
	def firstRow = ws.getRow(0)
	int colMax = (firstRow == null) ? 0 : firstRow.getLastCellNum()
	if (colMax < 1) {
		println "** Excel2XML: no header found in the first row of the sheet. Aborting."
		return false
	}

	def headers = (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) }

	println "Headers: $headers"

	// Header names turned into valid XML names (see stringToIndent)
	def normalizedHeaders = headers.collect { stringToIndent(it) }

	// Report every missing column before giving up
	def missingMetadataColumns = metadataColumnList.findAll { !headers.contains(it) }
	for (m in missingMetadataColumns) {
		println "** Excel2XML: missing metadataColumnList column: $m"
	}
	def missingTextColumns = textColumnList.findAll { !headers.contains(it) }
	for (t in missingTextColumns) {
		println "** Excel2XML: missing textColumnList column: $t"
	}
	if (missingMetadataColumns.size() > 0 || missingTextColumns.size() > 0) { return false }

	// Date columns are only processed while writing the metadata attributes:
	// warn (without aborting) about the ones that will never be reached.
	for (d in dateColumnList) {
		if (!metadataColumnList.contains(d)) {
			println "** Excel2XML: warning: dateColumnList column '$d' is not a metadataColumnList column, it will be ignored."
		}
	}

	def metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	def dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	def textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// Result file: same directory and base name as the input file, ".xml" extension
	def name = inputFile.getName()
	def idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	output = new FileOutputStream(outputFile)
	writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formatting
	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")

	def datePattern = ~/([0-9]{2})\/([0-9]{2})\/([0-9]{4})/
	def descriptorPattern = ~/(?s)([A-Z]{3}): ([^;\n]+?) +[;\n]/

	def pb_n = 1

	// Iterate up to the last defined row index rather than over the number of
	// physical rows: sparse sheets have undefined rows, for which getRow() is null.
	// A plain for loop is used because the Groovy range 1..0 would iterate backwards.
	int lastRowIndex = ws.getLastRowNum()
	for (int rowIndex = 1; rowIndex <= lastRowIndex; rowIndex++) {

		def row = ws.getRow(rowIndex)
		if (row == null) continue // undefined row: nothing to export

		writer.writeCharacters(" ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formatting

		writer.writeCharacters(" ")
		writer.writeStartElement(textTag)
		for (colIndex in metadataColumnIndex) { // build an attribute for each metadata
			String s = getCellValueAsString(row.getCell(colIndex))
			if (s == null) s = ""
			def attributeValue = s.replaceAll("\n", ";").trim()
			def attributeName = normalizedHeaders[colIndex]
			writer.writeAttribute(attributeName, attributeValue)

			if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
				def dateMatcher = datePattern.matcher(attributeValue)
				// cells without a dd/MM/yyyy date (empty cell...) get no split attributes
				if (dateMatcher.find()) {
					def day = Date.parse("dd/MM/yyyy", dateMatcher.group(0))
					writer.writeAttribute(attributeName+"jour", dateMatcher.group(1))
					writer.writeAttribute(attributeName+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(day))
					writer.writeAttribute(attributeName+"mois", dateMatcher.group(2))
					writer.writeAttribute(attributeName+"annee", dateMatcher.group(3))
				}
			}
		}
		writer.writeCharacters("\n")

		// human readable copy of the metadata, as an unordered list
		writer.writeCharacters(" ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		for (colIndex in metadataColumnIndex) {
			writer.writeStartElement("item")
			writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters(" ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		for (colIndex in textColumnIndex) {

			writer.writeCharacters(" ")
			writer.writeStartElement(normalizedHeaders[colIndex])
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(headers[colIndex]+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head
			def text = getCellValueAsString(row.getCell(colIndex))

			// A cell starting with "XXX: some text ;" is a list of typed descriptors:
			// each one is written as <descripteur type="XXX">some text</descripteur>
			if (text ==~ /(?s)^[A-Z]{3}: [^;\n]+? +[;\n].*/) {
				def descriptorMatcher = descriptorPattern.matcher(text)
				while (descriptorMatcher.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", descriptorMatcher.group(1))
					writer.writeCharacters(descriptorMatcher.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(text)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters(" ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}

	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")

	// Regular close: a failure here must be reported, so it stays inside the try block
	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace()
	return false
} finally {
	// Only reached with non-null values when the conversion failed: release the
	// file handle. XMLStreamWriter.close() does not close the underlying stream.
	if (writer != null) {
		try { writer.close() } catch (Exception ignored) { /* already reporting the first failure */ }
	}
	if (output != null) {
		try { output.close() } catch (Exception ignored) { /* already reporting the first failure */ }
	}
}

return true
|
Formats disponibles : Unified diff