Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / table / Excel2XMLMacro.groovy @ 3058

History | View | Annotate | Download (8.8 kB)

1
package org.txm.macro.table
2

    
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcpapplication.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import java.text.SimpleDateFormat
11

    
12
import org.apache.poi.ss.usermodel.*
13
import org.apache.poi.hssf.usermodel.*
14
import org.apache.poi.xssf.usermodel.*
15
import org.apache.poi.ss.usermodel.WorkbookFactory
16
import org.apache.poi.ss.util.*
17

    
18
def stringToIndent = { str -> org.txm.utils.AsciiUtils.buildAttributeId(org.txm.utils.AsciiUtils.convertNonAscii(str)).toLowerCase() }
19

    
20
// from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi
21
def getCellValueAsString = { cell ->
22
        strCellValue = null
23
        if (cell != null) {
24
            switch (cell.getCellType()) {
25
            case CellType.STRING:
26
                strCellValue = cell.toString()
27
                break
28
            case CellType.NUMERIC:
29
                if (DateUtil.isCellDateFormatted(cell)) {
30
                    SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy")
31
                    strCellValue = dateFormat.format(cell.getDateCellValue())
32
                } else {
33
                    value = cell.getNumericCellValue()
34
                    longValue = value.longValue()
35
                    strCellValue = new String(longValue.toString())
36
                }
37
                break
38
            case CellType.BOOLEAN:
39
                strCellValue = new String(new Boolean(cell.getBooleanCellValue()).toString())
40
                break
41
            case CellType.BLANK:
42
                strCellValue = ""
43
                break
44
            }
45
        }
46
        if (strCellValue == null) strCellValue = ""
47
        return strCellValue
48
}
49

    
50
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
51
File inputFile
52

    
53
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
54
def sheetName
55

    
56
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
57
def rootTag
58

    
59
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
60
def textTag
61

    
62
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
63
def metadataColumnList
64

    
65
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
66
def dateColumnList
67

    
68
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
69
def textColumnList
70

    
71
@Field @Option(name="EmbedInTEI", usage="text columns list separated by comma", widget="Boolean", required=false, def="false")
72
def EmbedInTEI
73

    
74
if (!ParametersDialog.open(this)) return
75

    
76
if (!inputFile.exists()) {
77
        println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
78
        return false
79
}
80

    
81
if (!inputFile.canRead()) {
82
        println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
83
        return false
84
}
85

    
86
try {
87

    
88
metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
89
dateColumnList = dateColumnList.split(",").collect { it.trim() }
90
textColumnList = textColumnList.split(",").collect { it.trim() }
91
textTag = textTag.trim()
92
rootTag = rootTag.trim()
93

    
94
wb = WorkbookFactory.create(inputFile)
95

    
96
if (sheetName.length() == 0) {
97
        ws = wb.getSheetAt(0)
98
} else {
99
        ws = wb.getSheet(sheetName)
100
        if (ws == null) {
101
                println "** Excel2XML: no '"+sheetName+" found. Aborting."
102
                return false
103
        }
104
}
105

    
106
if (ws == null) {
107
        println "** Excel2XML: no sheet found. Aborting."
108
        return false
109
}
110

    
111
nRows = ws.getPhysicalNumberOfRows()
112
println nRows+" rows."
113
 
114
firstRow = ws.getRow(0)
115
colMax = firstRow.getLastCellNum()
116

    
117
headers = (0..colMax-1).collect { getCellValueAsString(firstRow.getCell(it)) }
118

    
119
println "Headers: $headers"
120

    
121
normalizedHeaders = headers.collect { stringToIndent(it) }
122

    
123
ok = true
124
metadataColumnList.each { m ->
125
        if (!headers.contains(m)) {
126
                println "** Excel2XML: missing metadataColumnList column: $m"
127
                ok = false
128
        }
129
}
130
textColumnList.each { t ->
131
        if (!headers.contains(t)) {
132
                println "** Excel2XML: missing textColumnList column: $t"
133
                ok = false
134
        }
135
}
136

    
137
if (!ok) { return false }
138

    
139
metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
140
dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
141
textColumnIndex = textColumnList.collect { headers.indexOf(it) }
142

    
143
println "metadataColumnList = "+metadataColumnList
144
println "metadataColumnIndex = "+metadataColumnIndex
145

    
146
println "dateColumnList = "+dateColumnList
147
println "dateColumnsIndex = "+dateColumnsIndex
148

    
149
println "textColumnList = "+textColumnList
150
println "textColumnIndex = "+textColumnIndex
151

    
152
name = inputFile.getName()
153
idx = name.lastIndexOf(".")
154

    
155
if (idx > 0) name = name.substring(0, idx)
156
outputFile = new File(inputFile.getParentFile(), name+".xml")
157

    
158
factory = XMLOutputFactory.newInstance()
159
output = new FileOutputStream(outputFile)
160
writer = factory.createXMLStreamWriter(output, "UTF-8")
161

    
162
writer.writeStartDocument("UTF-8","1.0")
163
writer.writeCharacters("\n") // simple XML formating
164

    
165
if (EmbedInTEI) {
166
        writer.writeStartElement("TEI")
167
        writer.writeStartElement("teiHeader")
168
        writer.writeEndElement() // teiHeader
169
        writer.writeStartElement("text")
170
        writer.writeCharacters("\n")
171
}
172

    
173
writer.writeStartElement(rootTag)
174
writer.writeCharacters("\n")
175

    
176
pb_n = 1
177

    
178
(1..nRows-1).each { rowIndex ->
179

    
180
        writer.writeCharacters("  ")
181
        writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
182
        writer.writeAttribute("n", ""+pb_n++)
183
        writer.writeCharacters("\n") // simple XML formating
184

    
185
        writer.writeCharacters("  ")
186
        writer.writeStartElement(textTag)
187
        metadataColumnIndex.each { colIndex -> // build an attribute for each metadata
188
                def row = ws.getRow(rowIndex)
189
                if (row != null) {
190
                String s = getCellValueAsString(row.getCell(colIndex));
191
                if (s == null) s ="";
192
                value = s.replaceAll("\n", ";").trim()
193
                writer.writeAttribute(normalizedHeaders[colIndex], value)
194
                if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
195
                        matches = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
196
                        writer.writeAttribute(normalizedHeaders[colIndex]+"jour", matches[0][1])
197
                        writer.writeAttribute(normalizedHeaders[colIndex]+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(Date.parse("dd/MM/yyyy", value)))
198
                        writer.writeAttribute(normalizedHeaders[colIndex]+"mois", matches[0][2])
199
                        writer.writeAttribute(normalizedHeaders[colIndex]+"annee", matches[0][3])
200
                }
201
                }
202
        }
203
        writer.writeCharacters("\n")
204

    
205
        writer.writeCharacters("    ")
206
        writer.writeStartElement("metadata")
207
        writer.writeStartElement("list")
208
        writer.writeAttribute("type", "unordered")
209
        writer.writeCharacters("\n")
210

    
211
         metadataColumnIndex.each { colIndex ->
212
                 writer.writeStartElement("item")
213
                 def row = ws.getRow(rowIndex)
214
                 if (row != null) {
215
                         writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
216
                        writer.writeEndElement() // item
217
                        writer.writeCharacters("\n")
218
                }
219
        }
220
        writer.writeCharacters("    ")
221
        writer.writeEndElement() // list
222
        writer.writeEndElement() // head
223
        writer.writeCharacters("\n")
224

    
225
           textColumnIndex.each { colIndex ->
226
                
227
            writer.writeCharacters("    ")
228
            writer.writeStartElement(normalizedHeaders[colIndex])
229
                writer.writeStartElement("p")
230
                writer.writeStartElement("head")
231
                writer.writeStartElement("hi")
232
                writer.writeCharacters(headers[colIndex]+" : ")
233
                writer.writeEndElement() // hi
234
                writer.writeEndElement() // head
235
                def row = ws.getRow(rowIndex)
236
                if (row != null) {
237
                        value = getCellValueAsString(row.getCell(colIndex))
238

    
239
                        if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
240
                                value.findAll( /(?s)[A-Z]{3}:  ([^;\n]+?) +[;\n]/ ).each { desc ->
241
                                        writer.writeStartElement("descripteur")
242
                                        matches = (desc =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
243
                                        writer.writeAttribute("type", matches[0][1])
244
                                        writer.writeCharacters(matches[0][2])
245
                                        writer.writeEndElement() // descripteur
246
                                }
247
                        } else {
248
                                writer.writeCharacters(value)
249
                        }
250
                        writer.writeEndElement() // p
251
                        writer.writeEndElement() // textColumn
252
                        writer.writeCharacters("\n")
253
                }
254
        }
255

    
256
        writer.writeCharacters("  ")
257
        writer.writeEndElement() // textTag
258
        writer.writeCharacters("\n")
259
}
260

    
261
writer.writeEndElement() // rootTag
262
writer.writeCharacters("\n")
263
if (EmbedInTEI) {
264
        writer.writeEndElement() // text
265
        writer.writeCharacters("\n")
266
        writer.writeEndElement() // TEI
267
        writer.writeCharacters("\n")
268
}
269
writer.close()
270
output.close()
271
println "Result file: $outputFile"
272

    
273
} catch (Exception e) {
274
        println "** Excel2XML: unable to read input file. Aborting."
275
        println e.getLocalizedMessage()
276
        println e.printStackTrace()
277
        return false
278
}
279

    
280
return true