Revision 2051

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/commands/NIndexMacro.groovy (revision 2051)
1
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author sheiden
4

  
5
package org.txm.macro.commands
6

  
7
import org.kohsuke.args4j.*
8
import groovy.transform.Field
9
import org.txm.rcpapplication.swt.widget.parameters.*
10
import org.txm.searchengine.cqp.CQPSearchEngine
11
import org.txm.Toolbox
12
import org.txm.macro.cqp.CQPUtils
13
import org.txm.searchengine.cqp.ICqiClient
14
import org.odftoolkit.simple.SpreadsheetDocument
15
import org.odftoolkit.odfdom.doc.table.OdfTable
16
import org.odftoolkit.odfdom.doc.table.OdfTableRow
17
import org.odftoolkit.odfdom.doc.table.OdfTableCell
18
import org.odftoolkit.simple.table.Table
19
import org.odftoolkit.simple.Document
20
import org.odftoolkit.simple.style.Border
21
import org.odftoolkit.simple.style.Font
22
import org.odftoolkit.simple.style.StyleTypeDefinitions
23
import org.odftoolkit.simple.style.StyleTypeDefinitions.FontStyle
24
import org.odftoolkit.simple.table.Row
25
import org.odftoolkit.simple.table.Cell
26
import org.odftoolkit.simple.style.StyleTypeDefinitions.CellBordersType
27
import org.odftoolkit.simple.style.StyleTypeDefinitions.HorizontalAlignmentType
28

  
29
// BEGINNING OF PARAMETERS
30

  
31
// Macro parameters: each @Field/@Option pair is one entry of the parameters dialog.

@Field @Option(name="inputFile", usage="input sequence list file (one sequence per line)", widget="File", required=true, def="")
def inputFile

@Field @Option(name="inputIsCQL", usage="input sequences are CQL queries", widget="Boolean", required=true, def="false")
def inputIsCQL

@Field @Option(name="inputWordProperty", usage="implicit word property to use", widget="String", required=true, def="word")
String inputWordProperty

@Field @Option(name="outputFile", usage="results output File (TSV format)", widget="File", required=true, def="")
def outputFile

@Field @Option(name="odsOutputFormat", usage="use ODS as output format", widget="Boolean", required=true, def="false")
def odsOutputFormat

@Field @Option(name="outputWordProperty", usage="output word property to use", widget="String", required=true, def="word")
String outputWordProperty

@Field @Option(name="groupByQuery", usage="provide frequency of each CQL match values or just the total frequency of each CQL match", widget="Boolean", required=true, def="false")
def groupByQuery

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return

// END OF PARAMETERS

def scriptName = this.class.getSimpleName()

if (!(inputFile && inputFile.isFile() && inputFile.canRead())) {
	println "** $scriptName: impossible to read input file. Aborting."
	return false
}

// The output file name is dereferenced right after this section: fail with a message instead of an NPE
if (outputFile == null) {
	println "** $scriptName: no output file given. Aborting."
	return false
}

// corpusEngine, utils and corpora are intentionally left undeclared: they live in the
// script binding so that the rest of the script can reach them
corpusEngine = CQPSearchEngine.getCqiClient()
utils = new CQPUtils()
corpora = utils.getCorpora(this)

if ((corpora == null) || corpora.size() == 0) {
	println "** $scriptName: please select a corpus in the Corpus view or provide a corpus name. Aborting."
	return false
}

// getCorpora() may hand back null entries for corpus names it could not resolve;
// stop here rather than failing later on corpus.getName()
if (corpora.any { it == null }) {
	println "** $scriptName: at least one of the requested corpora was not found. Aborting."
	return false
}
75

  
76
/**
 * Returns a name identifying the machine running the macro.
 *
 * Looks at the COMPUTERNAME then HOSTNAME environment variables (an empty value is
 * treated as unset), then falls back on the local host name. If the host name
 * cannot be resolved, "unknown" is returned instead of aborting the macro.
 *
 * @return the machine name, never null
 */
def getComputerName() {
	// local variable: the original wrote 'env' into the shared script binding
	def env = System.getenv()
	def fromEnvironment = env.get("COMPUTERNAME") ?: env.get("HOSTNAME")
	if (fromEnvironment) {
		return fromEnvironment
	}
	try {
		return InetAddress.getLocalHost().getHostName()
	} catch (UnknownHostException e) {
		// the name is only recorded as metadata: not worth failing for
		return "unknown"
	}
}
86

  
87
// Base name of the requested output file: everything before the last dot
def fullName = outputFile.getName()
def i = fullName.lastIndexOf(".")
def name
if (i == -1) {
	name = fullName
} else {
	name = fullName.substring(0, i)
}

// ods, table, font, properties and output are intentionally undeclared: they are stored
// in the script binding, which is how the helper methods and the rest of the script see them
if (odsOutputFormat) {

	try {
		Class.forName("org.odftoolkit.simple.SpreadsheetDocument")
	} catch(e) {
		println "** $scriptName: the simple-odf-0.8.1-incubating.jar file is needed in the \$HOME/TXM/scripts/lib directory to write in ODS format. Please uncheck the odsOutputFormat option. Aborting."
		return false
	}

	// the extension is forced to match the actual output format
	outputFile = new File(outputFile.getParentFile(), name+".ods")
	ods = SpreadsheetDocument.newSpreadsheetDocument()
	table = ods.getTableByName("Sheet1")
	font = new Font("Arial", FontStyle.BOLD, 11) // read by setStringCellBold

	// second sheet recording the context of the run (date, machine, parameters)
	properties = Table.newTable(ods)
	properties.setTableName("Properties")
	setStringCellBold(properties, 0, 0, "Name")
	setStringCellBold(properties, 0, 1, "Value")

	def now = new Date()
	// not named 'line': a closure parameter of that name is used further down the script
	def propertyRow = 1
	setStringCell2(properties, propertyRow++, 0, "date", now.format("dd/MM/yyyy"))
	setStringCell2(properties, propertyRow++, 0, "time", now.format("HH'h'mm"))
	setStringCell2(properties, propertyRow++, 0, "machine", getComputerName())
	setStringCell2(properties, propertyRow++, 0, "user", System.getProperty("user.name"))
	setStringCell2(properties, propertyRow++, 0, "macro", scriptName)
	setStringCell2(properties, propertyRow++, 0, "corpora", corpora.collect { it.getName() }.join(","))
	setStringCell2(properties, propertyRow++, 0, "inputFile", inputFile.getAbsolutePath())
	setStringCell2(properties, propertyRow++, 0, "inputIsCQL", inputIsCQL.toString())
	setStringCell2(properties, propertyRow++, 0, "inputWordProperty", inputWordProperty)
	setStringCell2(properties, propertyRow++, 0, "outputFile", outputFile.getAbsolutePath())
	setStringCell2(properties, propertyRow++, 0, "odsOutputFormat", odsOutputFormat.toString())
	setStringCell2(properties, propertyRow++, 0, "outputWordProperty", outputWordProperty)
	setStringCell2(properties, propertyRow++, 0, "groupByQuery", groupByQuery.toString())

} else {
	outputFile = new File(outputFile.getParentFile(), name+".tsv")
	// explicit UTF-8: FileWriter used the platform default charset, so the same run
	// produced differently encoded files depending on the machine
	output = outputFile.newWriter("UTF-8")
}
133

  
134
/**
 * Writes one "total frequency" result row: corpus name, query and frequency
 * in columns 0, 1 and 2 of the given row.
 *
 * @param table spreadsheet table to write into
 * @param row index of the row to fill
 * @param corpus text for column 0
 * @param string text for column 1
 * @param frequency numeric value for column 2
 */
def setRow(table, row, corpus, string, frequency) {
	// cells are used directly: the original stored each one in an undeclared 'cell'
	// variable, i.e. in the script binding shared by the whole macro
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string)
	table.getCellByPosition(2, row).setDoubleValue(frequency)
}
142

  
143
/**
 * Writes one "frequency per matched value" result row: corpus name, query,
 * matched value and frequency in columns 0 to 3 of the given row.
 * When the frequency is 0 the matched value column is left empty.
 *
 * @param table spreadsheet table to write into
 * @param row index of the row to fill
 * @param corpus text for column 0
 * @param string1 text for column 1
 * @param string2 text for column 2, ignored when frequency is 0
 * @param frequency numeric value for column 3
 */
def setRow2(table, row, corpus, string1, string2, frequency) {
	// no intermediate 'cell' variable: undeclared, it ended up in the shared script binding
	table.getCellByPosition(0, row).setStringValue(corpus)
	table.getCellByPosition(1, row).setStringValue(string1)
	table.getCellByPosition(2, row).setStringValue((frequency == 0) ? '' : string2)
	table.getCellByPosition(3, row).setDoubleValue(frequency)
}
153

  
154
/**
 * Sets the text of one cell.
 *
 * @param table spreadsheet table to write into
 * @param row row index of the cell
 * @param col column index of the cell
 * @param string text to store
 * @return the cell that was written
 */
def setStringCell(table, row, col, string) {
	// declared locally: the original assigned an undeclared 'cell', stored in the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	return target
}
159

  
160
/**
 * Sets the text of one cell and applies the header font to it.
 * The font is read from the 'font' variable of the script binding, which the
 * script assigns before writing any header.
 *
 * @param table spreadsheet table to write into
 * @param row row index of the cell
 * @param col column index of the cell
 * @param string text to store
 * @return the cell that was written
 */
def setStringCellBold(table, row, col, string) {
	// declared locally: the original assigned an undeclared 'cell', stored in the script binding
	def target = table.getCellByPosition(col, row)
	target.setStringValue(string)
	target.setFont(font)
	return target
}
166

  
167
/**
 * Writes two texts side by side: string1 in column col and string2 in column col+1
 * of the given row.
 *
 * @param table spreadsheet table to write into
 * @param row row index of both cells
 * @param col column index of the first cell
 * @param string1 text for the first cell
 * @param string2 text for the second cell
 * @return the second cell (the one holding string2)
 */
def setStringCell2(table, row, col, string1, string2) {
	// declared locally: the original assigned an undeclared 'cell', stored in the script binding
	table.getCellByPosition(col, row).setStringValue(string1)
	def second = table.getCellByPosition(col+1, row)
	second.setStringValue(string2)
	return second
}
174

  
175
// ---- header line ----
// Layout: corpus | query | [matched values] | f
// The "matched values" column only exists when groupByQuery is off.
def queryHeader = inputIsCQL ? "CQL" : outputWordProperty+" query"

if (odsOutputFormat) {
	setStringCellBold(table, 0, 0, "corpus")
	setStringCellBold(table, 0, 1, queryHeader)
	if (!groupByQuery) {
		setStringCellBold(table, 0, 2, outputWordProperty+" values")
	}
	def frequencyHeader = setStringCellBold(table, 0, groupByQuery ? 2 : 3, "f")
	frequencyHeader.setHorizontalAlignment(HorizontalAlignmentType.RIGHT)
} else {
	if (groupByQuery) {
		output.println "corpus\t"+queryHeader+"\tf"
	} else {
		output.println "corpus\t"+queryHeader+"\t"+(outputWordProperty+" values")+"\tf"
	}
}

// ---- one query per input line, for each corpus ----
def row = 1 // next spreadsheet row to fill (ODS output only)

try {
	corpora.each { corpus ->

		def corpusName = corpus.getName()
		def seqN = 0

		inputFile.eachLine { line ->

			// every input line consumes a sub-corpus number, blank ones included
			def subCorpus = "NIndex"+seqN++

			// Ignore surrounding blanks and skip blank lines: a trailing blank used to
			// add an empty [property=""] token to the query
			def sequence = line.trim()
			if (sequence.length() == 0) return

			def cql
			if (inputIsCQL) {
				cql = line
			} else {
				// one CQL token per blank-separated word
				cql = sequence.split("\\p{Blank}+").collect { "["+inputWordProperty+"=\""+it+"\"]" }.join(" ")
			}

			corpusEngine.cqpQuery(corpusName, subCorpus, cql)
			def nmatches = corpusEngine.subCorpusSize("$corpusName:$subCorpus")

			if (groupByQuery) {
				// total frequency of the query only
				if (odsOutputFormat) {
					setRow(table, row++, corpusName, line, nmatches)
				} else {
					output.println corpusName+"\t"+line+"\t"+nmatches
				}
			} else if (nmatches == 0) {
				// no match: a single row with an empty value column
				if (odsOutputFormat) {
					setRow2(table, row++, corpusName, line, line, 0)
				} else {
					output.println corpusName+"\t"+line+"\t\t0"
				}
			} else {
				// frequency of each distinct matched value, most frequent first
				def starts = corpusEngine.dumpSubCorpus("$corpusName:$subCorpus", ICqiClient.CQI_CONST_FIELD_MATCH, 0, nmatches-1)
				def ends = corpusEngine.dumpSubCorpus("$corpusName:$subCorpus", ICqiClient.CQI_CONST_FIELD_MATCHEND, 0, nmatches-1)
				[starts, ends].transpose().collect {
					corpusEngine.cpos2Str("$corpusName.$outputWordProperty", (it[0]..it[1]) as int[]).join(' ')
				}.countBy { it }.sort { -it.value }.each {
					if (odsOutputFormat) {
						setRow2(table, row++, corpusName, line, it.key, it.value)
					} else {
						output.println corpusName+"\t"+line+"\t"+it.key+"\t"+it.value
					}
				}
			}
		}
	}
} finally {
	// release the TSV file even when a query fails
	if (!odsOutputFormat) {
		output.close()
	}
}

if (odsOutputFormat) {
	ods.save(outputFile)
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/cqp/CQPUtils.groovy (revision 2051)
1
// Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author sheiden
4

  
5
package org.txm.macro.cqp
6

  
7
import org.txm.searchengine.cqp.corpus.*
8

  
9
/**
 * Looks a corpus up by name: first among the corpora returned by the corpus
 * manager, then among the sub-corpora of each of them.
 *
 * @param name the name to look for
 * @return the first matching corpus or sub-corpus, null if there is none
 */
def getCorpusByName(name) {
	def allCorpora = CorpusManager.getCorpusManager().getCorpora()

	// 1) corpora known to the manager
	def direct = allCorpora.find { candidate -> candidate.getName() == name }
	if (direct) return direct

	// 2) their sub-corpora, in the same order
	return allCorpora.findResult { parent ->
		parent.getSubcorpora().find { child -> child.getName() == name }
	}
}
28

  
29
/**
 * Determines the corpora a macro has to work on.
 *
 * In order of priority:
 * - the corpus named by the "corpus" entry of the script's "args" binding variable;
 * - the corpora named by its "corpora" entry (comma separated names);
 * - the corpora and sub-corpora selected in the Corpus view.
 *
 * Names that do not match any corpus are reported and left out of the result,
 * so the returned list never contains null.
 *
 * @param script the calling script; its binding provides args, corpusViewSelection and corpusViewSelections
 * @return the list of corpora (possibly empty), or null when the Corpus view
 *         selection is not a corpus or sub-corpus
 */
def getCorpora(def script) {

	// resolve one name, reporting it when unknown
	def resolve = { corpusName ->
		def found = getCorpusByName(corpusName)
		if (found == null) println "** corpus '"+corpusName+"' not found."
		return found
	}

	def args = script.binding.variables["args"]
	if (args) {
		if (args["corpus"]) {
			return [resolve(args["corpus"])].findAll { it != null }
		}
		if (args["corpora"]) {
			// names are trimmed so that "A, B" works like "A,B"
			return args["corpora"].tokenize(',').collect { resolve(it.trim()) }.findAll { it != null }
		}
	}

	// no corpus named explicitly: fall back on the Corpus view selection
	def current = script.corpusViewSelection
	if (!(current instanceof Subcorpus || current instanceof MainCorpus)) {
		return null
	}
	return script.corpusViewSelections.findAll { it instanceof Subcorpus || it instanceof MainCorpus }
}
55

  
56
/**
 * Proposes a sub-corpus name of the form "CORPUS" + n that is not used yet
 * by any sub-corpus of the given corpus, n being the smallest free number
 * starting from 1.
 *
 * @param name name of the corpus, looked up among the corpora of the corpus manager
 * @return the free name, or null (after printing a message) if the corpus is not found
 */
def newTestSubCorpusName(name) {
	def parent = CorpusManager.getCorpusManager().getCorpora().find { candidate ->
		candidate.getName() == name
	}

	if (!parent) {
		println "** corpus '"+name+"' not found."
		return null
	}

	def existing = parent.getSubcorpora()
	def n = 1
	// move to the next number as long as the current one is taken
	while (existing.any { sub -> sub.getName() == "CORPUS"+n }) {
		n++
	}

	return "CORPUS"+n
}
79

  
80
/**
 * Builds a CQL query selecting a list of corpus positions.
 *
 * Input is a comma separated list of positions ("6") and intervals ("2-3"),
 * e.g. "0,1,2-3,4-5,6". Blanks around the items are ignored.
 *
 * The result has the form patterns::constraints, both parts being "|" separated:
 * - single positions share the pattern n1:[] and add one n1=position constraint each;
 * - intervals are grouped by length L = end-start; each group gets the pattern
 *   n(L+1):[][]{L} and one n(L+1)=start constraint per interval.
 * Example: "0,1,2-3,4-5,6" gives "n1:[]|n2:[][]{1}::n1=0|n1=1|n1=6|n2=2|n2=4".
 * An interval whose start equals its end is handled as a single position, so the
 * n1 label is never attached to two different patterns.
 *
 * @param matches the comma separated positions
 * @return the CQL query; "::" when matches holds no position
 */
def positions2cql(matches) {

	def singletons = []
	def intervals = [:] // length (end-start) -> start positions, in order of first appearance

	matches.tokenize(',').each { token ->
		def item = token.trim()
		if (item.length() == 0) return // nothing between two commas

		if (item.indexOf('-') != -1) { // start-end
			// explicit parsing into local variables: the original assigned undeclared
			// start/end/l (script binding) and relied on a List<String> to int[] coercion
			def (start, end) = item.tokenize('-').collect { it.trim().toInteger() }
			def length = end - start
			if (length == 0) {
				singletons << start
			} else {
				if (intervals[length] == null) intervals[length] = []
				intervals[length] << start
			}
		} else { // singleton
			singletons << item.toInteger()
		}
	}

	def patterns = []
	def constraints = []

	if (singletons.size() > 0) {
		patterns << "n1:[]"
		singletons.each { constraints << "n1="+it }
	}

	intervals.each { length, positions ->
		def label = "n"+(length+1)
		patterns << label+":[][]{"+length+"}"
		positions.each { constraints << label+"="+it }
	}

	return patterns.join("|")+"::"+constraints.join("|")
}
152

  
153
/**
 * Renders the matches of a corpus as a list of positions.
 *
 * Each match is written "start-end" when its start is lower than its end, and
 * "start" otherwise. At most maxMatches matches are written; "..." is appended
 * when the corpus has more.
 *
 * @param corpus object whose getMatches() gives the matches to render
 * @param maxMatches maximum number of matches written
 * @param pretty when true items are separated by ", " instead of ","
 * @return the rendered list, or "<empty>" when there is no match
 */
def corpus2positions(def corpus, def maxMatches, def pretty = false) {

	def matches = corpus.getMatches()
	if (matches.size() == 0) {
		return "<empty>"
	}

	def separator = pretty ? ", " : ","
	def shown = matches.take(maxMatches).collect { m ->
		(m.getStart() < m.getEnd()) ? "${m.getStart()}-${m.getEnd()}" : "${m.getStart()}"
	}

	def res = shown.join(separator)
	if (matches.size() > maxMatches) {
		res += "..."
	}

	return res
}
186

  
187
// Only reached when this file is run as a macro: it is a library of methods,
// so just list them and tell how to import them.
def methods = this.getClass().declaredMethods.findAll { !it.synthetic }.name.sort().unique()
methods.removeAll(['main', 'run']) // script plumbing, not part of the library
// the import shown must name this class (CQPUtils), which is the one the macros import
println "** CQPUtils is not meant to be called directly.\nPlease import its methods ${methods}\nwith the following declaration:\nimport org.txm.macro.cqp.CQPUtils"
190

  
191
/*
192
	  ___           ___           ___           ___           ___           ___
193
	 /\  \         /\__\         /\__\         /\  \         |\__\         /\  \
194
	/::\  \       /::|  |       /::|  |       /::\  \        |:|  |       /::\  \
195
   /:/\:\  \     /:|:|  |      /:|:|  |      /:/\:\  \       |:|  |      /:/\:\  \
196
  /::\~\:\  \   /:/|:|  |__   /:/|:|  |__   /::\~\:\  \      |:|__|__   /::\~\:\  \
197
 /:/\:\ \:\__\ /:/ |:| /\__\ /:/ |:| /\__\ /:/\:\ \:\__\ ____/::::\__\ /:/\:\ \:\__\
198
 \/__\:\/:/  / \/__|:|/:/  / \/__|:|/:/  / \:\~\:\ \/__/ \::::/~~/~    \:\~\:\ \/__/
199
	  \::/  /      |:/:/  /      |:/:/  /   \:\ \:\__\    ~~|:|~~|      \:\ \:\__\
200
	  /:/  /       |::/  /       |::/  /     \:\ \/__/      |:|  |       \:\ \/__/
201
	 /:/  /        /:/  /        /:/  /       \:\__\        |:|  |        \:\__\
202
	 \/__/         \/__/         \/__/         \/__/         \|__|         \/__/
203

  
204

  
205

  
206
0 1 2 3 4 5 6 7 8 9
207

  
208
+ + + +
209
0 1 2 3 4 5 6 7 8 9
210
+0,+1,+2,+3
211
n:[] :: n = 0|n = 1|n = 2|n = 3
212

  
213
[ ] [ ]
214
0 1 2 3 4 5 6 7 8 9
215
0-1,2-3
216
n:[] []{1} :: n = 0
217
n:[] []{1} :: n = 2
218

  
219
[   ] +
220
0 1 2 3 4 5 6 7 8 9
221
0-2,+3
222
n:[] []{2} :: n = 0
223
n:[] :: n = 3
224

  
225

  
226
 */
227

  
228
/*
229

  
230
Touching intervals mergers
231
==========================
232

  
233
A) from http://stackoverflow.com/questions/31670849/merge-overlapping-intervals
234

  
235
import java.util.*;
236

  
237
public class Ideone
238
{
239
	public static void main (String[] args) throws java.lang.Exception
240
	{
241
		ArrayList<Interval> x = new ArrayList<>();
242

  
243
		x.add(new Interval(1, 3));
244
		x.add(new Interval(2, 6));
245
		x.add(new Interval(8, 10));
246
		x.add(new Interval(15, 18));
247
		x.add(new Interval(17, 20));
248

  
249
		x = merge(x);
250

  
251
		for(Interval i : x)
252
		{
253
			System.out.println(i.getStart() + " " + i.getEnd());
254
		}
255
	}
256

  
257
	public static ArrayList<Interval> merge(ArrayList<Interval> intervals) {
258

  
259
		if(intervals.size() == 0 || intervals.size() == 1)
260
			return intervals;
261

  
262
		Collections.sort(intervals, new IntervalComparator());
263

  
264
		Interval first = intervals.get(0);
265
		int start = first.getStart();
266
		int end = first.getEnd();
267

  
268
		ArrayList<Interval> result = new ArrayList<Interval>();
269

  
270
		for (int i = 1; i < intervals.size(); i++) {
271
			Interval current = intervals.get(i);
272
			if (current.getStart() <= end) {
273
				end = Math.max(current.getEnd(), end);
274
			} else {
275
				result.add(new Interval(start, end));
276
				start = current.getStart();
277
				end = current.getEnd();
278
			}
279
		}
280

  
281
		result.add(new Interval(start, end));
282
		return result;
283
	}
284
}
285

  
286
class Interval
287
{
288
	private int start;
289
	private int end;
290

  
291
	Interval() {
292
		start = 0;
293
		end = 0;
294
	}
295

  
296
	Interval(int s, int e)
297
	{
298
		start = s;
299
		end = e;
300
	}
301

  
302
	public int getStart() {
303
		return start;
304
	}
305

  
306
	public int getEnd() {
307
		return end;
308
	}
309
}
310

  
311
class IntervalComparator implements Comparator<Interval>
312
{
313
	public int compare(Interval i1, Interval i2)
314
	{
315
		return i1.getStart() - i2.getStart();
316
	}
317
}
318

  
319
B) from http://www.programcreek.com/2012/12/leetcode-merge-intervals/
320

  
321
public List<Interval> merge(List<Interval> intervals) {
322
	List<Interval> result = new ArrayList<Interval>();
323
 
324
	if(intervals==null||intervals.size()==0)
325
		return result;
326
 
327
	Collections.sort(intervals, new Comparator<Interval>(){
328
		public int compare(Interval i1, Interval i2){
329
			if(i1.start!=i2.start)
330
				return i1.start-i2.start;
331
			else
332
				return i1.end-i2.end;
333
		}
334
	});
335
 
336
	Interval pre = intervals.get(0);
337
	for(int i=0; i<intervals.size(); i++){
338
		Interval curr = intervals.get(i);
339
		if(curr.start>pre.end){
340
			result.add(pre);
341
			pre = curr;
342
		}else{
343
			Interval merged = new Interval(pre.start, Math.max(pre.end, curr.end));
344
			pre = merged;
345
		}
346
	}
347
	result.add(pre);
348
 
349
	return result;
350
}
351
 */
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/ExcelDir2XMLMacro.groovy (revision 2051)
1
package org.txm.macro.text
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
import org.apache.poi.ss.usermodel.*
12
import org.apache.poi.hssf.usermodel.*
13
import org.apache.poi.xssf.usermodel.*
14
import org.apache.poi.ss.util.*
15

  
16
// Macro parameters: apart from inputDirectory they are passed unchanged to Excel2XMLMacro.

@Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="")
def inputDirectory

@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

@Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root")
def rootTag

@Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit")
def textTag

@Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

@Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList

if (!ParametersDialog.open(this)) return

if (!inputDirectory.exists()) {
	println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting."
	return false
}

// a plain file passes exists() but cannot be listed
if (!inputDirectory.isDirectory()) {
	println "** ExcelDir2XML: '"+inputDirectory.name+"' is not a directory. Aborting."
	return false
}

if (!inputDirectory.canRead()) {
	println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting."
	return false
}

// Collect the workbooks: the name must really end with ".xlsx" (the previous pattern
// accepted any name merely ending in "xlsx"); "~$..." files are the lock files Excel
// creates next to an open workbook and are not workbooks
def f = []
inputDirectory.eachFileMatch(~/(?i).*\.xlsx/) {
	if (it.isFile() && !it.name.startsWith('~$')) f << it
}

if (f.size() == 0) {
	println "** ExcelDir2XML: no .xlsx file found. Aborting."
	return false
}

try {

	// convert each workbook, in file name order, by delegating to Excel2XMLMacro
	f.sort { it.name }.each { inputFile ->

		def res = gse.run(Excel2XMLMacro, ["args":[

				"inputFile":inputFile,
				"sheetName":sheetName,
				"metadataColumnList":metadataColumnList,
				"dateColumnList":dateColumnList,
				"textColumnList":textColumnList,
				"rootTag":rootTag,
				"textTag":textTag,

				"selection":selection,
				"selections":selections,
				"corpusViewSelection":corpusViewSelection,
				"corpusViewSelections":corpusViewSelections,
				"monitor":monitor]])
		if (!res) println "** problem calling Excel2XMLMacro."
	}

} catch (Exception e) {
	println "** ExcelDir2XML: unable to read input files. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // returns nothing: wrapping it in println only added a "null" line
	return false
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/text/Excel2XMLMacro.groovy (revision 2051)
1
package org.txm.macro.text
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import java.text.SimpleDateFormat
11

  
12
import org.apache.poi.ss.usermodel.*
13
import org.apache.poi.hssf.usermodel.*
14
import org.apache.poi.xssf.usermodel.*
15
import org.apache.poi.ss.util.*
16

  
17
// Turns a column header into a lowercase identifier, by chaining the convertNonAscii
// and buildAttributeId helpers of org.txm.utils.AsciiUtils
def stringToIndent = { str ->
	def asciiOnly = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def identifier = org.txm.utils.AsciiUtils.buildAttributeId(asciiOnly)
	return identifier.toLowerCase()
}
18

  
19
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi
20
/*
 * Returns the content of a spreadsheet cell as text, never null:
 * - string cells: their text;
 * - date formatted numeric cells: the date as dd/MM/yyyy;
 * - other numeric cells: the number, without decimal part when it is a whole number
 *   (2017.0 gives "2017"); other values keep their decimals instead of being truncated;
 * - boolean cells: "true" or "false";
 * - anything else (blank cell, other cell types, no cell at all): "".
 *
 * Only local variables are used: the original assigned undeclared strCellValue,
 * value and longValue, i.e. variables of the script binding, 'value' being also
 * used by the rest of the script.
 */
def getCellValueAsString = { cell ->
	if (cell == null) return ""

	switch (cell.getCellType()) {
		case Cell.CELL_TYPE_STRING:
			return cell.toString() ?: ""
		case Cell.CELL_TYPE_NUMERIC:
			if (DateUtil.isCellDateFormatted(cell)) {
				return new SimpleDateFormat("dd/MM/yyyy").format(cell.getDateCellValue())
			}
			double number = cell.getNumericCellValue()
			// whole numbers within the range a double represents exactly: print as an integer
			if (number == Math.rint(number) && Math.abs(number) < 1e15d) {
				return String.valueOf((long) number)
			}
			return BigDecimal.valueOf(number).toPlainString()
		case Cell.CELL_TYPE_BOOLEAN:
			return String.valueOf(cell.getBooleanCellValue())
		default:
			return ""
	}
}
48

  
49
// Macro parameters

@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
File inputFile

@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
def rootTag

@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
def textTag

@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList

if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// Output resources, declared here so that the finally block can release them
// whatever happens in between
def output = null
def writer = null

try {

	// "a, b,,c" -> ["a", "b", "c"]; an empty parameter gives an empty list
	def toColumnList = { list -> (list ?: "").split(",").collect { it.trim() }.findAll { it.length() > 0 } }

	metadataColumnList = toColumnList(metadataColumnList)
	dateColumnList = toColumnList(dateColumnList)
	textColumnList = toColumnList(textColumnList)
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	def wb = WorkbookFactory.create(inputFile)

	def ws
	if (sheetName == null || sheetName.length() == 0) {
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' sheet found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	println ws.getPhysicalNumberOfRows()+" rows."

	// the first row holds the column names
	def firstRow = ws.getRow(0)
	if (firstRow == null || firstRow.getLastCellNum() <= 0) {
		println "** Excel2XML: no header row found. Aborting."
		return false
	}
	def colMax = firstRow.getLastCellNum()

	def headers = (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) }

	println "Headers: $headers"

	def normalizedHeaders = headers.collect { stringToIndent(it) }

	def ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	def metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	def dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	def textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// the result is written next to the input file, with the .xml extension
	def name = inputFile.getName()
	def idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	output = new FileOutputStream(outputFile)
	writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formating
	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")

	def pb_n = 1

	// Data rows follow the header row. An index loop up to the last row index replaces
	// (1..nRows-1), which runs backwards (rows 1 then 0) on a sheet holding only the
	// header, and stops too early when the sheet has missing rows.
	def lastRowIndex = ws.getLastRowNum()
	for (int rowIndex = 1; rowIndex <= lastRowIndex; rowIndex++) {

		def row = ws.getRow(rowIndex)
		if (row == null) continue // row missing from the sheet

		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formating

		writer.writeCharacters("  ")
		writer.writeStartElement(textTag)
		metadataColumnIndex.each { colIndex -> // build an attribute for each metadata
			def value = getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";").trim()
			writer.writeAttribute(normalizedHeaders[colIndex], value)
			if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
				def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
				if (dateMatcher.find()) {
					writer.writeAttribute(normalizedHeaders[colIndex]+"jour", dateMatcher.group(1))
					// the weekday is computed from the matched date only, not from the whole cell text
					writer.writeAttribute(normalizedHeaders[colIndex]+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(Date.parse("dd/MM/yyyy", dateMatcher.group(0))))
					writer.writeAttribute(normalizedHeaders[colIndex]+"mois", dateMatcher.group(2))
					writer.writeAttribute(normalizedHeaders[colIndex]+"annee", dateMatcher.group(3))
				} else {
					// empty or malformed date: keep the raw attribute and go on with the other rows
					println "** Excel2XML: row "+(rowIndex+1)+", column '"+headers[colIndex]+"': '"+value+"' is not a dd/MM/yyyy date."
				}
			}
		}
		writer.writeCharacters("\n")

		// readable copy of the metadata, one list item per column
		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		metadataColumnIndex.each { colIndex ->
			writer.writeStartElement("item")
			writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		// one element per text column, named after the normalized column header
		textColumnIndex.each { colIndex ->

			writer.writeCharacters("    ")
			writer.writeStartElement(normalizedHeaders[colIndex])
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(headers[colIndex]+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head
			def text = getCellValueAsString(row.getCell(colIndex))

			// cells made of "XXX:  text ;" items are split into one <descripteur type="XXX"> per item
			if (text ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
				text.findAll( /(?s)[A-Z]{3}:  ([^;\n]+?) +[;\n]/ ).each { desc ->
					writer.writeStartElement("descripteur")
					def parts = (desc =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
					writer.writeAttribute("type", parts[0][1])
					writer.writeCharacters(parts[0][2])
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(text)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("  ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}

	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")
	writer.flush()
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // returns nothing: wrapping it in println only added a "null" line
	return false
} finally {
	// closing the XML writer does not close the underlying stream: both are needed
	try {
		if (writer != null) writer.close()
	} finally {
		if (output != null) output.close()
	}
}

return true
tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/CQPSearchEngine.java (revision 2051)
10 10
import org.txm.core.engines.EngineType;
11 11
import org.txm.core.messages.TXMCoreMessages;
12 12
import org.txm.core.results.TXMResult;
13
import org.txm.importer.cwb.PatchCwbRegistry;
14 13
import org.txm.libs.cqp.CQPLibPreferences;
15 14
import org.txm.objects.CorpusBuild;
16 15
import org.txm.objects.Match;
17
import org.txm.objects.Project;
18 16
import org.txm.searchengine.core.EmptySelection;
19 17
import org.txm.searchengine.core.Query;
20 18
import org.txm.searchengine.core.SearchEngine;
......
29 27
import org.txm.searchengine.cqp.corpus.query.CQLQuery;
30 28
import org.txm.utils.OSDetector;
31 29
import org.txm.utils.Sh;
32
import org.txm.utils.io.IOUtils;
33 30
import org.txm.utils.logger.Log;
34 31

  
35 32
public class CQPSearchEngine extends SearchEngine {

Also available in: Unified diff