Révision 3210

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLWithoutAPIMacro2.groovy (revision 3210)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import java.text.SimpleDateFormat
11

  
12
import org.apache.poi.ss.usermodel.*
13
import org.apache.poi.hssf.usermodel.*
14
import org.apache.poi.xssf.usermodel.*
15
import org.apache.poi.ss.usermodel.WorkbookFactory
16
import org.apache.poi.ss.util.*
17

  
18
// Builds a lower-case identifier from a column header: the header first goes through
// AsciiUtils.convertNonAscii, then AsciiUtils.buildAttributeId, and the result is lower-cased.
def stringToIndent = { str ->
	def asciiOnly = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def attributeId = org.txm.utils.AsciiUtils.buildAttributeId(asciiOnly)
	return attributeId.toLowerCase()
}
19

  
20
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi
21
// Returns the content of a spreadsheet cell as a String (never null).
//   - missing, blank, error or unsupported cells: ""
//   - string cells: the text of the cell
//   - date formatted numeric cells: the date formatted as dd/MM/yyyy
//   - other numeric cells: the integer form when the number is integral ("12" rather than "12.0"),
//     the full decimal form otherwise, so that fractional values are no longer truncated
//   - boolean cells: "true" or "false"
//   - formula cells: the cached result of the formula, rendered with the rules above
def getCellValueAsString = { cell ->
	if (cell == null) return ""

	// All working variables are local: nothing leaks into the script binding.
	def cellType = cell.getCellType()
	boolean isFormula = (cellType == CellType.FORMULA)
	if (isFormula) {
		// Render the value computed by the formula, not the formula itself.
		cellType = cell.getCachedFormulaResultType()
	}

	String result = ""
	switch (cellType) {
		case CellType.STRING:
			result = isFormula ? cell.getStringCellValue() : cell.toString()
			break
		case CellType.NUMERIC:
			if (DateUtil.isCellDateFormatted(cell)) {
				result = new SimpleDateFormat("dd/MM/yyyy").format(cell.getDateCellValue())
			} else {
				double number = cell.getNumericCellValue()
				long whole = (long) number
				// Keep the historical integer rendering for integral values only.
				result = (number == whole) ? Long.toString(whole) : Double.toString(number)
			}
			break
		case CellType.BOOLEAN:
			result = String.valueOf(cell.getBooleanCellValue())
			break
		default:
			// BLANK, ERROR and any other type are rendered as an empty string.
			result = ""
	}
	return (result == null) ? "" : result
}
49

  
50
// Macro parameters. The fields below are handed to ParametersDialog.open(this) by the script;
// the "def" value of each option is its default.

// Excel file to convert.
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
		File inputFile

// Name of the sheet to read; when empty the first sheet is used.
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
		def sheetName

// Name of the element wrapping all the converted rows.
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
		def rootTag

// Name of the element created for each row.
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
		def textTag

// Comma separated headers of the columns written as metadata.
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
		def metadataColumnList

// Comma separated headers of the metadata columns holding dates.
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
		def dateColumnList

// Comma separated headers of the columns written as textual content.
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
		def textColumnList

// When true the result is wrapped in TEI/text elements.
// The usage text previously repeated the one of textColumnList.
@Field @Option(name="EmbedInTEI", usage="embed xml content in TEI text element", widget="Boolean", required=false, def="false")
		def EmbedInTEI
73

  
74
// Main script: converts the selected sheet of inputFile into an XML file written next to it
// (same base name, ".xml" extension). Returns true on success and false on error; returns
// nothing when the parameters dialog is cancelled.
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// Declared before the try block so that the finally clause can release them on every exit path.
def output = null
def writer = null

try {
	// Comma separated parameters become lists of trimmed column headers.
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }
	textColumnList = textColumnList.split(",").collect { it.trim() }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	def wb = WorkbookFactory.create(inputFile)

	def ws = null
	if (sheetName.length() == 0) {
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' sheet found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	def nRows = ws.getPhysicalNumberOfRows()
	println nRows+" rows."

	def firstRow = ws.getRow(0)
	if (firstRow == null) {
		println "** Excel2XML: no header row found. Aborting."
		return false
	}
	int colMax = firstRow.getLastCellNum()

	// A counting loop is used instead of (0..colMax-1): a Groovy range counts backwards
	// when its upper bound is lower than its lower bound.
	def headers = []
	for (int col = 0; col < colMax; col++) {
		headers << getCellValueAsString(firstRow.getCell(col))
	}

	println "Headers: $headers"

	def normalizedHeaders = headers.collect { stringToIndent(it) }

	// Every metadata and text column must exist in the header row.
	def ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	def metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	def dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	def textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// Output file: input file name with its extension replaced by ".xml".
	def name = inputFile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	def factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formatting

	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}

	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")

	// Formats shared by all rows: input date format and day-of-week rendering.
	def dateParser = new SimpleDateFormat("dd/MM/yyyy")
	def dayOfWeekFormat = new SimpleDateFormat('EEEE')

	int pb_n = 1

	// Row 0 holds the headers. The counting loop does nothing when the sheet only contains
	// the header row, where (1..nRows-1) would have iterated over rows 1 and 0.
	for (int rowIndex = 1; rowIndex < nRows; rowIndex++) {

		def row = ws.getRow(rowIndex)

		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formatting

		writer.writeCharacters("  ")
		writer.writeStartElement(textTag)
		if (row != null) {
			for (def colIndex : metadataColumnIndex) { // build an attribute for each metadata
				String s = getCellValueAsString(row.getCell(colIndex))
				if (s == null) s = ""
				def value = s.replaceAll("\n", ";").trim()
				writer.writeAttribute(normalizedHeaders[colIndex], value)

				if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
					def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
					// An empty or malformed date only loses its derived attributes
					// instead of aborting the whole conversion.
					if (dateMatcher.find()) {
						def parsedDate = dateParser.parse(dateMatcher.group(0))
						writer.writeAttribute(normalizedHeaders[colIndex]+"jour", dateMatcher.group(1))
						writer.writeAttribute(normalizedHeaders[colIndex]+"joursemaine", dayOfWeekFormat.format(parsedDate))
						writer.writeAttribute(normalizedHeaders[colIndex]+"mois", dateMatcher.group(2))
						writer.writeAttribute(normalizedHeaders[colIndex]+"annee", dateMatcher.group(3))
					}
				}
			}
		}
		writer.writeCharacters("\n")

		// Human readable copy of the metadata.
		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		if (row != null) {
			for (def colIndex : metadataColumnIndex) {
				writer.writeStartElement("item")
				writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
				writer.writeEndElement() // item
				writer.writeCharacters("\n")
			}
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		for (def colIndex : textColumnIndex) {

			writer.writeCharacters("    ")
			writer.writeStartElement(normalizedHeaders[colIndex])
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(headers[colIndex]+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			if (row != null) {
				def value = getCellValueAsString(row.getCell(colIndex))

				// Values made of "XXX:  text ;" entries are split into <descripteur> elements.
				if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
					def descMatcher = (value =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
					while (descMatcher.find()) {
						writer.writeStartElement("descripteur")
						writer.writeAttribute("type", descMatcher.group(1))
						writer.writeCharacters(descMatcher.group(2))
						writer.writeEndElement() // descripteur
					}
				} else {
					writer.writeCharacters(value)
				}
			}

			// Closed even when the row is missing, so that the elements stay properly nested.
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("  ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}

	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")
	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}

	// Normal path: close here so that a failure while flushing is reported below.
	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace()
	return false
} finally {
	// Error path: release whatever is still open, ignoring secondary failures.
	if (writer != null) {
		try { writer.close() } catch (Exception ignored) { }
	}
	if (output != null) {
		try { output.close() } catch (Exception ignored) { }
	}
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro2.groovy (revision 3210)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6

  
7
import org.txm.libs.msoffice.ReadExcel
8
import org.txm.rcpapplication.swt.widget.parameters.*
9
import org.txm.utils.*
10
import javax.xml.stream.*
11
import java.net.URL
12
import java.text.SimpleDateFormat
13

  
14
import org.apache.poi.ss.usermodel.*
15
import org.apache.poi.hssf.usermodel.*
16
import org.apache.poi.xssf.usermodel.*
17
import org.apache.poi.ss.usermodel.WorkbookFactory
18
import org.apache.poi.ss.util.*
19

  
20
// Builds a lower-case identifier from a column header: the header first goes through
// AsciiUtils.convertNonAscii, then AsciiUtils.buildAttributeId, and the result is lower-cased.
def stringToIndent = { str ->
	def asciiOnly = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def attributeId = org.txm.utils.AsciiUtils.buildAttributeId(asciiOnly)
	return attributeId.toLowerCase()
}
21

  
22
// Macro parameters. The fields below are handed to ParametersDialog.open(this) by the script;
// the "def" value of each option is its default.

// Excel file to convert.
@Field
@Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
File inputFile

// Name of the sheet to read.
@Field
@Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

// When true the result is wrapped in TEI/text elements.
@Field
@Option(name="EmbedInTEI", usage="embed xml content in  TEI text element", widget="Boolean", required=false, def="false")
def EmbedInTEI

// Name of the element wrapping all the records; no wrapper is written when empty.
@Field
@Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
def rootTag

// Name of the element created for each record; no element is written when empty.
@Field
@Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
def textTag

// Comma separated headers of the columns written as metadata.
@Field
@Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

// Comma separated headers of the metadata columns holding dates.
@Field
@Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

// Comma separated headers of the columns written as textual content.
@Field
@Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList

// NOTE(review): this list is split by the script but does not appear to be used afterwards.
@Field
@Option(name="prefixContentColumnList", usage="columns which content is a semi-colon separated prefixes list", widget="String", required=false, def="")
def prefixContentColumnList

// NOTE(review): this list is split by the script but does not appear to be used afterwards.
@Field
@Option(name="listContentColumnList", usage="columns which content is a semi-colon separated list", widget="String", required=false, def="")
def listContentColumnList
51

  
52

  
53
// Main script: converts the records read by ReadExcel from inputFile into an XML file written
// next to it (same base name, ".xml" extension). Returns true on success and false on error;
// returns nothing when the parameters dialog is cancelled.
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// Declared before the try block so that the finally clause can release them on every exit path.
def output = null
def writer = null

try {
	// Comma separated parameters become lists of trimmed column headers.
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }
	textColumnList = textColumnList.split(",").collect { it.trim() }
	prefixContentColumnList = prefixContentColumnList.split(",").collect { it.trim() }
	listContentColumnList = listContentColumnList.split(",").collect { it.trim() }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	ReadExcel excel = new ReadExcel(inputFile, sheetName)
	excel.readHeaders()

	def headers = excel.getHeaders()

	println "Headers: $headers"

	// Every metadata and text column must exist in the header row.
	def ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	println "metadataColumnList = "+metadataColumnList
	println "dateColumnList = "+dateColumnList
	println "textColumnList = "+textColumnList

	// Output file: input file name with its extension replaced by ".xml".
	def name = inputFile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	def factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formatting

	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}

	boolean hasRootTag = (rootTag != null && rootTag.length() > 0)
	boolean hasTextTag = (textTag != null && textTag.length() > 0)

	if (hasRootTag) {
		writer.writeStartElement(rootTag)
		writer.writeCharacters("\n")
	}

	// Formats shared by all records: input date format and day-of-week rendering.
	def dateParser = new SimpleDateFormat("dd/MM/yyyy")
	def dayOfWeekFormat = new SimpleDateFormat('EEEE')

	int pb_n = 1

	while (excel.readRecord()) {

		def record = excel.getRecord()
		if (record.isEmpty()) continue

		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formatting

		writer.writeCharacters("  ")
		if (hasTextTag) {
			writer.writeStartElement(textTag)

			for (def metadataName : metadataColumnList) { // build an attribute for each metadata
				String s = record.get(metadataName)
				if (s == null) s = ""

				def value = s.replaceAll("\n", ";").trim()
				def attributeName = stringToIndent(metadataName)
				writer.writeAttribute(attributeName, value)

				if (metadataName in dateColumnList) { // also split date attributes in day+month+year attributes
					def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
					// An empty or malformed date only loses its derived attributes
					// instead of aborting the whole conversion.
					if (dateMatcher.find()) {
						def parsedDate = dateParser.parse(dateMatcher.group(0))
						// The derived attributes use the same normalized name as the base
						// attribute: a raw header may contain characters that are not
						// allowed in an XML attribute name.
						writer.writeAttribute(attributeName+"-jour", dateMatcher.group(1))
						writer.writeAttribute(attributeName+"-joursemaine", dayOfWeekFormat.format(parsedDate))
						writer.writeAttribute(attributeName+"-mois", dateMatcher.group(2))
						writer.writeAttribute(attributeName+"-annee", dateMatcher.group(3))
					}
				}
			}
			writer.writeCharacters("\n")
		}

		// Human readable copy of the metadata.
		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		for (def metadataName : metadataColumnList) {
			// Same null handling as for the attributes above.
			String s = record.get(metadataName)
			if (s == null) s = ""
			writer.writeStartElement("item")
			writer.writeCharacters(metadataName+" : "+s.replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		for (def textColumnName : textColumnList) {

			writer.writeCharacters("    ")
			writer.writeStartElement(stringToIndent(textColumnName))
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(textColumnName+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			String value = record.get(textColumnName)
			if (value == null) value = ""

			// Values made of "XXX:  text ;" entries are split into <descripteur> elements.
			if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
				def descMatcher = (value =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
				while (descMatcher.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", descMatcher.group(1))
					writer.writeCharacters(descMatcher.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(value)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("  ")

		if (hasTextTag) {
			writer.writeEndElement() // textTag
			writer.writeCharacters("\n")
		}
	}

	if (hasRootTag) {
		writer.writeEndElement() // rootTag
		writer.writeCharacters("\n")
	}

	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}

	// Normal path: close here so that a failure while flushing is reported below.
	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace()
	return false
} finally {
	// Error path: release whatever is still open, ignoring secondary failures.
	if (writer != null) {
		try { writer.close() } catch (Exception ignored) { }
	}
	if (output != null) {
		try { output.close() } catch (Exception ignored) { }
	}
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Table2TextCorpusMacro.groovy (revision 3210)
1
package org.txm.macro.csv
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import org.txm.libs.msoffice.ReadExcel
11

  
12
// Macro parameters. The fields below are handed to ParametersDialog.open(this) by the script;
// the "def" value of each option is its default.

// Table file to read.
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
		File inputFile;

// Directory receiving one XML file per text.
@Field @Option(name="outputDirectory", usage="output directory", widget="Folder", required=false, def="directory")
		File outputDirectory;

// --- Texts ---
@Field @Option(name="textsSeparator", usage="Texts", widget="Separator", required=false, def="Texts")
		def textsSeparator

// Column holding the identifier of a text.
@Field @Option(name="textIDColumn", usage="<text> id column", widget="String", required=false, def="Identifiant de la notice")
		def textIDColumn;

// "column=value" test selecting the lines that describe a text.
@Field @Option(name="textSelector", usage="<text> lines selector format is column=value", widget="String", required=false, def="Type de notice=Notice sommaire")
		def textSelector;

// Comma separated columns written as metadata of a text.
@Field @Option(name="textMetadataColumnList", usage="<text> metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def textMetadataColumnList;

// Comma separated columns written as textual content of a text.
@Field @Option(name="textContentColumnList", usage="<text> textual content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def textContentColumnList;

// --- Structures ---
@Field @Option(name="structuresSeparator", usage="Structures", widget="Separator", required=false, def="Structures")
		def structuresSeparator

// Name of the element created for each structure line.
@Field @Option(name="structureTag", usage="structure element to create", widget="String", required=false, def="div")
		def structureTag;

// "column=regexp" test selecting the lines that describe a structure.
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
		def structureSelector;

// Column of a structure line pointing to the textIDColumn value of its text.
@Field @Option(name="textJoinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
		def textJoinColumn;

// Comma separated columns written as metadata of a structure.
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def structureMetadataColumnList;

// Comma separated columns written as textual content of a structure.
@Field @Option(name="structureContentColumnList", usage="structure textual content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def structureContentColumnList;

// --- Column types ---
@Field @Option(name="typesSeparator", usage="Columns types", widget="Separator", required=false, def="Columns types")
		def typesSeparator

// Comma separated metadata columns holding dates.
@Field @Option(name="dateColumnTypeList", usage="metadata columns of type=Date", widget="String", required=false, def="")
		def dateColumnTypeList

// Comma separated content columns holding prefixed entries.
@Field @Option(name="prefixesColumnTypeList", usage="metadata columns of type=Prefixes", widget="String", required=false, def="")
		def prefixesColumnTypeList

// Comma separated content columns holding lists.
@Field @Option(name="listColumnTypeList", usage="metadata columns of type=List semi-colon separated", widget="String", required=false, def="")
		def listColumnTypeList

// When true the script prints the identifier of each text it processes.
@Field @Option(name="debug", usage="Show debug messages", widget="Boolean", required=false, def="false")
		def debug
65
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
66
//		def structureOrderColumn;
67

  
68
// Main script: groups the records of inputFile by text and writes one TEI-like XML file per
// text in outputDirectory. The first record of a group describes the text itself, the following
// ones become structureTag elements inside it.
if (!ParametersDialog.open(this)) return;

// Turns a comma separated parameter into a list of trimmed column names.
def toColumnList = { raw -> raw.split(",").collect { it.trim() } }

// Splits a "column=regexp" parameter into [column, regexp]; [null, null] when there is no "=".
def parseSelector = { selector ->
	if (selector == null || !selector.contains("=")) return [null, null]
	def trimmed = selector.trim()
	int eq = trimmed.indexOf("=")
	return [trimmed.substring(0, eq).trim(), trimmed.substring(eq + 1).trim()]
}

textMetadataColumnList = toColumnList(textMetadataColumnList)
textContentColumnList = toColumnList(textContentColumnList)
structureMetadataColumnList = toColumnList(structureMetadataColumnList)
structureContentColumnList = toColumnList(structureContentColumnList)
dateColumnTypeList = toColumnList(dateColumnTypeList)
prefixesColumnTypeList = toColumnList(prefixesColumnTypeList)
listColumnTypeList = toColumnList(listColumnTypeList)
textSelector = parseSelector(textSelector)
structureSelector = parseSelector(structureSelector)

println "textIDColumn, textJoinColumn=$textIDColumn, $textJoinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag

// Try TableReader first and fall back to ReadExcel when it cannot be created.
def reader = null
try {
	reader = new TableReader(inputFile);
	println "Reading $inputFile with TableReader..."
} catch(Exception e) {
	reader = new ReadExcel(inputFile, null);
	println "Reading $inputFile with ReadExcel..."
}

if (!reader.readHeaders()) {
	println "** Error: no header"
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "$inputFile table column names: $headers"

// Every column named in the parameters must exist in the table.
def ok = true
def hash = ["selection":[textIDColumn, textJoinColumn], "textMetadataColumnList":textMetadataColumnList,
	"textContentColumnList":textContentColumnList, "structureMetadataColumnList":structureMetadataColumnList,
	"structureContentColumnList":structureContentColumnList]
for (def key : hash.keySet()) {
	for (def m : hash[key]) {
		if (!headers.contains(m)) {
			println "** Error: missing $key column: $m"
			ok = false
		}
	}
}

if (!ok) { return; }

// Trimmed value of a column of the current record; null when the column is not configured
// or the value is missing, instead of failing with a NullPointerException.
def cellOf = { column -> column == null ? null : reader.get(column)?.trim() }

boolean groupingSet = textIDColumn != null && textJoinColumn != null && textIDColumn.length() > 0 && textJoinColumn.length() > 0
boolean selectorsSet = textSelector[0] != null && structureSelector[0] != null

// group records by text
def texts = new LinkedHashMap()
def nRecord = 0
def nRecordToWrite = 0
while (reader.readRecord()) {
	nRecord++

	if (!groupingSet) { // no id & join parameter set -> regroup all records in one file
		if (!texts.containsKey("result.xml")) texts["result.xml"] = []
		texts["result.xml"].add(reader.getRecord())
		nRecordToWrite++
		continue
	}

	String id = cellOf(textIDColumn) ?: ""
	String join = cellOf(textJoinColumn) ?: ""

	if (selectorsSet) {
		String textSelectorValue = cellOf(textSelector[0])
		String structureSelectorValue = cellOf(structureSelector[0])

		if (textSelectorValue != null && textSelectorValue.matches(textSelector[1])) {
			// the text line always comes first in its group
			if (!texts.containsKey(id)) texts[id] = []
			texts[id].add(0, reader.getRecord())
		} else if (structureSelectorValue != null && structureSelectorValue.matches(structureSelector[1])) {
			if (!texts.containsKey(join)) texts[join] = []
			texts[join].add(reader.getRecord())
			nRecordToWrite++
		} else {
			// ignore record
		}
	} else { // no text&structure selector set, take the record if 'id' or 'join' are set
		if (id.length() > 0) {
			if (!texts.containsKey(id)) texts[id] = []
			texts[id].add(0, reader.getRecord())
		} else if (join.length() > 0) {
			if (!texts.containsKey(join)) texts[join] = []
			texts[join].add(reader.getRecord())
			nRecordToWrite++
		} else {
			// ignore record
		}
	}
}
// Everything needed is now in memory: close the reader once, on every path.
reader.close()

println "N records: "+nRecord
println "N records to write: "+nRecordToWrite
println "N texts to build: "+texts.size()
if (texts.size()  == 0) {
	println "No text found. Aborting."
	return
}
outputDirectory.mkdirs()

// Value of a column of a record, never null.
def valueOf = { rec, column -> rec.get(column) ?: "" }

for (def id : texts.keySet()) {
	def toWrite = texts[id]
	def text = toWrite[0]

	// The first record of the group must be a text line when a text selector is configured.
	String textSelectorValue = (textSelector[0] == null) ? null : text.get(textSelector[0])?.trim()
	boolean isTextLine = textSelector[0] == null || (textSelectorValue != null && textSelectorValue.matches(textSelector[1]))

	if (!isTextLine) {
		// error
		println "ERROR: '$id' text group with  no text line"
		continue
	}

	if (debug) println "Processing text: $id"

	File outputfile = new File(outputDirectory, id+".xml")
	XMLOutputFactory factory = XMLOutputFactory.newInstance()
	FileOutputStream output = new FileOutputStream(outputfile)
	XMLStreamWriter writer = null
	try {
		writer = factory.createXMLStreamWriter(output, "UTF-8")

		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n") // simple XML formatting
		writer.writeStartElement("TEI")
		writer.writeCharacters("\n") // simple XML formatting
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeCharacters("\n") // simple XML formatting
		writer.writeStartElement("text")
		writer.writeAttribute("id", id)

		for (String att : textMetadataColumnList) {
			if (att in dateColumnTypeList) {
				writeMetadataDate(text, att, writer)
			} else {
				writer.writeAttribute(AsciiUtils.buildAttributeId(att), valueOf(text, att)) // struct
			}
		}

		writer.writeCharacters("\n") // simple XML formatting

		// Human readable copy of the text metadata.
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")
		for (def att : textMetadataColumnList) {
			writer.writeStartElement("item")
			writer.writeCharacters(att+" : "+valueOf(text, att).replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		for (String att : textContentColumnList) {
			if (att in prefixesColumnTypeList) {
				writePrefixTextContent(text, att, writer)
			} else if (att in listColumnTypeList) {
				writeListTextContent(text, att, writer)
			} else {
				writer.writeStartElement("p");
				writer.writeAttribute("type", att.trim())
				writer.writeCharacters("\n") // simple XML formatting

				writer.writeStartElement("head")
				writer.writeStartElement("hi")
				writer.writeCharacters(att+" : ")
				writer.writeEndElement() // hi
				writer.writeEndElement() // head

				writer.writeCharacters(valueOf(text, att)) // get textColumnList content
				writer.writeEndElement() // p
				writer.writeCharacters("\n") // simple XML formatting
			}
		}

		// Remaining records of the group: one page break and one structure element each.
		int pb_n = 1;
		for (int i = 1 ; i < toWrite.size() ; i++) {
			def record = toWrite[i]

			writer.writeEmptyElement("pb") // <pb/>
			writer.writeAttribute("n", ""+pb_n++)
			writer.writeAttribute("type", "record")

			writer.writeStartElement(structureTag)

			for (String att : structureMetadataColumnList) {
				if (att in dateColumnTypeList) {
					writeMetadataDate(record, att, writer)
				} else {
					writer.writeAttribute(AsciiUtils.buildAttributeId(att), valueOf(record, att)) // struct
				}
			}
			writer.writeCharacters("\n")

			writer.writeStartElement("metadata")
			writer.writeStartElement("list")
			writer.writeAttribute("type", "unordered")
			writer.writeCharacters("\n")
			for (def att : structureMetadataColumnList) {
				writer.writeStartElement("item")
				writer.writeCharacters(att+" : "+valueOf(record, att).replaceAll("\n", ";"))
				writer.writeEndElement() // item
				writer.writeCharacters("\n")
			}
			writer.writeEndElement() // list
			writer.writeEndElement() // metadata
			writer.writeCharacters("\n")

			for (String att : structureContentColumnList) {
				// The content comes from the structure record itself (the text record was
				// passed here before, which repeated the text content in every structure).
				if (att in prefixesColumnTypeList) {
					writePrefixTextContent(record, att, writer)
				} else if (att in listColumnTypeList) {
					writeListTextContent(record, att, writer)
				} else {
					writer.writeStartElement("p");
					writer.writeAttribute("type", att.trim())
					writer.writeAttribute("id", valueOf(record, textIDColumn))
					writer.writeCharacters("\n") // simple XML formatting

					writer.writeStartElement("head")
					writer.writeStartElement("hi")
					writer.writeCharacters(att+" : ")
					writer.writeEndElement() // hi
					writer.writeEndElement() // head

					writer.writeCharacters(valueOf(record, att)) // get textColumnList content
					writer.writeEndElement() // p
					writer.writeCharacters("\n") // simple XML formatting
				}
			}

			writer.writeEndElement() // struct
			writer.writeCharacters("\n") // simple XML formatting
		}

		writer.writeEndElement() // text
		writer.writeCharacters("\n") // simple XML formatting
		writer.writeEndElement() // TEI
	} finally {
		// Release the file even when writing this text failed.
		try {
			if (writer != null) writer.close()
		} finally {
			output.close()
		}
	}
}
328

  
329
/**
 * Writes the content of a multi-line cell as a sequence of &lt;p&gt; elements.
 *
 * When the cell value contains at least one line break, each non-empty line is
 * written as &lt;p rend="list"&gt;. Otherwise the whole value is written in a
 * single &lt;p rend="no-list"&gt;.
 *
 * @param record the current record; only its get(att) method is used
 * @param att the name of the column to read in the record
 * @param writer the XMLStreamWriter receiving the elements
 */
def writeListTextContent(def record, def att, def writer) {
	writer.writeCharacters("\n")

	// a missing cell is handled like an empty one instead of raising a NullPointerException
	String value = record.get(att) ?: ""

	// Split on line breaks rather than matching "line + \n": the last line of the cell
	// usually has no trailing line break and must not be lost.
	def items = []
	if (value.contains("\n")) {
		items = value.split("\n").findAll { it.length() > 0 }
	}

	if (items.isEmpty()) {
		// no usable line break: keep the value as a single paragraph
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
		return
	}

	for (String item : items) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "list")
		writer.writeCharacters(item)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
352

  
353
/**
 * Writes the content of a "prefixed" cell as a &lt;list rend="prefixes" type="unordered"&gt;.
 *
 * Each entry of the cell matching "three uppercase letters, optional colon, text,
 * space(s), then ';' or a line break" becomes an &lt;item type="PREFIX"&gt; holding a
 * &lt;span&gt; with the prefix followed by the entry text. When no entry matches, the
 * raw cell value is written inside the list element.
 *
 * @param record the current record; only its get(att) method is used
 * @param att the name of the column to read in the record
 * @param writer the XMLStreamWriter receiving the elements
 */
def writePrefixTextContent(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	// a missing cell is handled like an empty one instead of raising a NullPointerException
	String value = record.get(att) ?: ""

	// single pass over the value: group 1 is the prefix, group 2 the entry text
	def entries = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean found = false // local on purpose: must not leak into the script binding
	while (entries.find()) {
		found = true
		String prefix = entries.group(1)
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", prefix)
		writer.writeStartElement("span")
		writer.writeCharacters(prefix+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(entries.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}

	if (!found) {
		// nothing looks like a prefixed entry: keep the raw content
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
380

  
381
/**
 * Splits a dd/MM/yyyy date cell into several attributes of the current element.
 *
 * The attribute names are the normalized column name followed by "jour" (day),
 * "joursemaine" (week day name), "mois" (month), "annee" (year) and "tri"
 * (yyyy-MM-dd sort key). Nothing is written, and a warning is printed, when the
 * cell contains no dd/MM/yyyy date.
 *
 * @param record the current record; only its get(att) method is used
 * @param att the name of the date column to read in the record
 * @param writer the XMLStreamWriter; must be positioned right after a start element
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att)
	String att_normalized = AsciiUtils.buildAttributeId(att)

	def dateMatcher = (value == null) ? null : (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (dateMatcher == null || !dateMatcher.find()) {
		// empty or malformed date cells must not abort the whole conversion
		println "WARNING: no dd/MM/yyyy date found in column '$att': '$value'"
		return
	}

	String day = dateMatcher.group(1)
	String month = dateMatcher.group(2)
	String year = dateMatcher.group(3)
	// parse only the matched date so that surrounding text in the cell cannot break the parsing
	Date date = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(date))
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", year+"-"+month+"-"+day)
}
391

  
392
/*
393
 String name = inputFile.getName()
394
 int idx = name.lastIndexOf(".")
395
 if (idx > 0) name = name.substring(0, idx)
396
 */
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLWithoutAPIMacro.groovy (revision 3210)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import java.text.SimpleDateFormat
11

  
12
import org.apache.poi.ss.usermodel.*
13
import org.apache.poi.hssf.usermodel.*
14
import org.apache.poi.xssf.usermodel.*
15
import org.apache.poi.ss.usermodel.WorkbookFactory
16
import org.apache.poi.ss.util.*
17

  
18
// Turns a column header into an identifier: the header goes through
// AsciiUtils.convertNonAscii, then AsciiUtils.buildAttributeId, and the result is lower-cased.
def stringToIndent = { str ->
	def asciiOnly = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def attributeId = org.txm.utils.AsciiUtils.buildAttributeId(asciiOnly)
	return attributeId.toLowerCase()
}
19

  
20
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi
21
// Returns the content of a POI cell as a String; never returns null.
//  - STRING cells: cell.toString()
//  - NUMERIC cells formatted as dates: the date formatted as dd/MM/yyyy
//  - other NUMERIC cells: the number, without decimal part when it is a whole number
//  - BOOLEAN cells: "true" or "false"
//  - null cells, BLANK cells and any other cell type: ""
def getCellValueAsString = { cell ->
	if (cell == null) return ""

	// local variable: the result must not leak into the script binding
	String text = null
	switch (cell.getCellType()) {
		case CellType.STRING:
			text = cell.toString()
			break
		case CellType.NUMERIC:
			if (DateUtil.isCellDateFormatted(cell)) {
				text = new SimpleDateFormat("dd/MM/yyyy").format(cell.getDateCellValue())
			} else {
				double number = cell.getNumericCellValue()
				if (number == Math.rint(number) && !Double.isInfinite(number)) {
					// whole numbers (identifiers, years...) are written without ".0"
					text = String.valueOf((long) number)
				} else if (Double.isNaN(number)) {
					text = "0" // same result as the former long conversion
				} else {
					// keep the decimals instead of silently truncating the value
					text = BigDecimal.valueOf(number).stripTrailingZeros().toPlainString()
				}
			}
			break
		case CellType.BOOLEAN:
			text = String.valueOf(cell.getBooleanCellValue())
			break
		case CellType.BLANK:
			text = ""
			break
	}
	return (text == null) ? "" : text
}
49

  
50
// Excel file to convert (mandatory)
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
		File inputFile

// Name of the sheet to read; the first sheet is used when empty
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
		def sheetName

// Name of the element wrapping all the units
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
		def rootTag

// Name of the element written for each row of the sheet
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
		def textTag

// Columns written as attributes of the unit element and as items of its metadata list
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
		def metadataColumnList

// Metadata columns holding dd/MM/yyyy dates, also split into day/month/year attributes
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
		def dateColumnList

// Columns written as textual content of the unit element
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
		def textColumnList

// When true, the result is wrapped in TEI/text elements (the usage text was a copy of the textColumnList one)
@Field @Option(name="EmbedInTEI", usage="embed xml content in TEI text element", widget="Boolean", required=false, def="false")
		def EmbedInTEI
73

  
74
// Script body: asks for the parameters, then writes one XML file next to the input file.
// Each data row of the sheet becomes a <pb/> followed by a textTag element whose attributes
// are the metadata columns, and whose content is a metadata list plus one element per text column.
// Returns true on success, false on error (nothing when the parameters dialog is cancelled).
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// declared before the try block so that the finally block can release them
def wb = null
def output = null
def writer = null

try {
	// column names are trimmed; empty names (e.g. from an empty parameter) are ignored
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textColumnList = textColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	wb = WorkbookFactory.create(inputFile)

	def ws = null
	if (sheetName == null || sheetName.length() == 0) {
		ws = wb.getSheetAt(0)
	} else {
		ws = wb.getSheet(sheetName)
		if (ws == null) {
			println "** Excel2XML: no '"+sheetName+"' sheet found. Aborting."
			return false
		}
	}

	if (ws == null) {
		println "** Excel2XML: no sheet found. Aborting."
		return false
	}

	def nRows = ws.getPhysicalNumberOfRows()
	println nRows+" rows."

	def firstRow = ws.getRow(0)
	if (firstRow == null || firstRow.getLastCellNum() <= 0) {
		println "** Excel2XML: no header row found. Aborting."
		return false
	}
	int colMax = firstRow.getLastCellNum()

	// exclusive range: stays empty-safe, unlike (0..colMax-1)
	def headers = (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) }

	println "Headers: $headers"

	def normalizedHeaders = headers.collect { stringToIndent(it) }

	boolean ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	def metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
	def dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
	def textColumnIndex = textColumnList.collect { headers.indexOf(it) }

	println "metadataColumnList = "+metadataColumnList
	println "metadataColumnIndex = "+metadataColumnIndex

	println "dateColumnList = "+dateColumnList
	println "dateColumnsIndex = "+dateColumnsIndex

	println "textColumnList = "+textColumnList
	println "textColumnIndex = "+textColumnIndex

	// the result file has the name of the input file with the ".xml" extension
	def name = inputFile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	def factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formating

	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}

	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")

	int pb_n = 1

	// Iterate up to the last row index: getPhysicalNumberOfRows() only counts the defined rows,
	// so rows located after a gap would be missed. A plain for loop is used because the
	// Groovy range (1..nRows-1) counts backwards (1, 0) when the sheet only has its header row.
	int lastRowIndex = ws.getLastRowNum()
	for (int rowIndex = 1; rowIndex <= lastRowIndex; rowIndex++) {

		def row = ws.getRow(rowIndex)
		if (row == null) continue // undefined row: nothing to write

		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formating

		writer.writeCharacters("  ")
		writer.writeStartElement(textTag)
		for (int colIndex : metadataColumnIndex) { // build an attribute for each metadata
			String s = getCellValueAsString(row.getCell(colIndex))
			if (s == null) s = ""
			String value = s.replaceAll("\n", ";").trim()
			writer.writeAttribute(normalizedHeaders[colIndex], value)

			if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
				def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
				if (dateMatcher.find()) {
					// parse only the matched date so that surrounding text cannot break the parsing
					Date date = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))
					writer.writeAttribute(normalizedHeaders[colIndex]+"jour", dateMatcher.group(1))
					writer.writeAttribute(normalizedHeaders[colIndex]+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(date))
					writer.writeAttribute(normalizedHeaders[colIndex]+"mois", dateMatcher.group(2))
					writer.writeAttribute(normalizedHeaders[colIndex]+"annee", dateMatcher.group(3))
				} else {
					// an empty or malformed date must not abort the whole conversion
					println "** Excel2XML: no dd/MM/yyyy date in column '"+headers[colIndex]+"' of row "+(rowIndex + 1)+": '"+value+"'"
				}
			}
		}
		writer.writeCharacters("\n")

		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		for (int colIndex : metadataColumnIndex) {
			writer.writeStartElement("item")
			writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		for (int colIndex : textColumnIndex) {

			writer.writeCharacters("    ")
			writer.writeStartElement(normalizedHeaders[colIndex])
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(headers[colIndex]+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			String value = getCellValueAsString(row.getCell(colIndex))

			if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
				// prefixed content: one <descripteur> per "XXX:  text ;" entry (group 1 = prefix, group 2 = text)
				def descriptors = (value =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
				while (descriptors.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", descriptors.group(1))
					writer.writeCharacters(descriptors.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(value)
			}
			// the elements opened above are always closed here, to keep the XML nesting consistent
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("  ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}

	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")
	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}

	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // printStackTrace() returns nothing: wrapping it in println printed "null"
	return false
} finally {
	// release whatever is still open after an error or an early return
	if (writer != null) {
		try { writer.close() } catch (Exception ignored) { /* already reporting the main error */ }
	}
	if (output != null) {
		try { output.close() } catch (Exception ignored) { /* already reporting the main error */ }
	}
	if (wb != null) {
		try { wb.close() } catch (Exception ignored) { /* the workbook was only read */ }
	}
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro.groovy (revision 3210)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6

  
7
import org.txm.libs.msoffice.ReadExcel
8
import org.txm.rcpapplication.swt.widget.parameters.*
9
import org.txm.utils.*
10
import javax.xml.stream.*
11
import java.net.URL
12
import java.text.SimpleDateFormat
13

  
14
import org.apache.poi.ss.usermodel.*
15
import org.apache.poi.hssf.usermodel.*
16
import org.apache.poi.xssf.usermodel.*
17
import org.apache.poi.ss.usermodel.WorkbookFactory
18
import org.apache.poi.ss.util.*
19

  
20
// Turns a column header into an identifier: the header goes through
// AsciiUtils.convertNonAscii, then AsciiUtils.buildAttributeId, and the result is lower-cased.
def stringToIndent = { str ->
	def asciiOnly = org.txm.utils.AsciiUtils.convertNonAscii(str)
	def attributeId = org.txm.utils.AsciiUtils.buildAttributeId(asciiOnly)
	return attributeId.toLowerCase()
}
21

  
22
// Excel file to convert (mandatory)
@Field
@Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
File inputFile

// Name of the sheet to read
@Field
@Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

// When true, the result is wrapped in TEI/text elements
@Field
@Option(name="EmbedInTEI", usage="embed xml content in  TEI text element", widget="Boolean", required=false, def="false")
def EmbedInTEI

// Name of the element wrapping all the units; no wrapping element is written when empty
@Field
@Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
def rootTag

// Name of the element written for each record; no unit element is written when empty
@Field
@Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
def textTag

// Columns written as attributes of the unit element and as items of its metadata list
@Field
@Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

// Metadata columns holding dd/MM/yyyy dates, also split into day/month/year attributes
@Field
@Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

// Columns written as textual content of the unit element
@Field
@Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList

// Columns whose content is a list of prefixed entries
@Field
@Option(name="prefixContentColumnList", usage="columns which content is a semi-colon separated prefixes list", widget="String", required=false, def="")
def prefixContentColumnList

// Columns whose content is a list of entries
@Field
@Option(name="listContentColumnList", usage="columns which content is a semi-colon separated list", widget="String", required=false, def="")
def listContentColumnList
51

  
52

  
53
// Script body: asks for the parameters, then writes one XML file next to the input file.
// Each non-empty record read by ReadExcel becomes a <pb/> followed by an optional textTag
// element (metadata columns as attributes), a metadata list and one element per text column.
// Returns true on success, false on error (nothing when the parameters dialog is cancelled).
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// declared before the try block so that the finally block can release them
def output = null
def writer = null

try {
	// column names are trimmed; empty names (e.g. from an empty parameter) are ignored
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textColumnList = textColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	// NOTE(review): the two lists below are parsed but not used by this script
	prefixContentColumnList = prefixContentColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	listContentColumnList = listContentColumnList.split(",").collect { it.trim() }.findAll { it.length() > 0 }
	textTag = textTag.trim()
	rootTag = rootTag.trim()

	// NOTE(review): ReadExcel is never released here; confirm whether it exposes a close method
	ReadExcel excel = new ReadExcel(inputFile, sheetName)
	excel.readHeaders()

	def headers = excel.getHeaders()

	println "Headers: $headers"

	boolean ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	println "metadataColumnList = "+metadataColumnList
	println "dateColumnList = "+dateColumnList
	println "textColumnList = "+textColumnList

	// the result file has the name of the input file with the ".xml" extension
	def name = inputFile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")

	def factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formating

	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}

	boolean hasRootTag = rootTag != null && rootTag.length() > 0
	boolean hasTextTag = textTag != null && textTag.length() > 0

	if (hasRootTag) {
		writer.writeStartElement(rootTag)
		writer.writeCharacters("\n")
	}

	int pb_n = 1

	while (excel.readRecord()) {

		def record = excel.getRecord()
		if (record.isEmpty()) continue

		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formating

		writer.writeCharacters("  ")
		if (hasTextTag) {
			writer.writeStartElement(textTag)

			for (def metadataName : metadataColumnList) { // build an attribute for each metadata
				String s = record.get(metadataName)
				if (s == null) s = ""

				String value = s.replaceAll("\n", ";").trim()
				String attributeName = stringToIndent(metadataName)
				writer.writeAttribute(attributeName, value)

				if (metadataName in dateColumnList) { // also split date attributes in day+month+year attributes
					def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
					if (dateMatcher.find()) {
						// The date attributes reuse the normalized name of the base attribute: the raw
						// header may contain characters that are not allowed in an XML attribute name.
						// parse only the matched date so that surrounding text cannot break the parsing
						Date date = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))
						writer.writeAttribute(attributeName+"-jour", dateMatcher.group(1))
						writer.writeAttribute(attributeName+"-joursemaine", new java.text.SimpleDateFormat('EEEE').format(date))
						writer.writeAttribute(attributeName+"-mois", dateMatcher.group(2))
						writer.writeAttribute(attributeName+"-annee", dateMatcher.group(3))
					} else {
						// an empty or malformed date must not abort the whole conversion
						println "** Excel2XML: no dd/MM/yyyy date in column '"+metadataName+"' of record "+(pb_n - 1)+": '"+value+"'"
					}
				}
			}
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		for (def metadataName : metadataColumnList) {
			// a missing cell is written as an empty value, like in the attributes above
			String content = record.get(metadataName)
			if (content == null) content = ""
			writer.writeStartElement("item")
			writer.writeCharacters(metadataName+" : "+content.replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		for (def textColumnName : textColumnList) {

			writer.writeCharacters("    ")
			writer.writeStartElement(stringToIndent(textColumnName))
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(textColumnName+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			String value = record.get(textColumnName)
			if (value == null) value = "" // missing cell: write an empty paragraph

			if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
				// prefixed content: one <descripteur> per "XXX:  text ;" entry (group 1 = prefix, group 2 = text)
				def descriptors = (value =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
				while (descriptors.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", descriptors.group(1))
					writer.writeCharacters(descriptors.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(value)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("  ")

		if (hasTextTag) {
			writer.writeEndElement() // textTag
			writer.writeCharacters("\n")
		}
	}

	if (hasRootTag) {
		writer.writeEndElement() // rootTag
		writer.writeCharacters("\n")
	}

	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}

	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // printStackTrace() returns nothing: wrapping it in println printed "null"
	return false
} finally {
	// release whatever is still open after an error or an early return
	if (writer != null) {
		try { writer.close() } catch (Exception ignored) { /* already reporting the main error */ }
	}
	if (output != null) {
		try { output.close() } catch (Exception ignored) { /* already reporting the main error */ }
	}
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/TableDir2XMLMacro.groovy (revision 3210)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
import org.apache.poi.ss.usermodel.*
12
import org.apache.poi.hssf.usermodel.*
13
import org.apache.poi.xssf.usermodel.*
14
import org.apache.poi.ss.usermodel.WorkbookFactory
15
import org.apache.poi.ss.util.*
16

  
17
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
18
		File inputFile;
19

  
20
@Field @Option(name="outputDirectory", usage="output directory", widget="Folder", required=false, def="directory")
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff