Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / xml / GetXPathMacro.groovy @ 479

History | View | Annotate | Download (13.5 kB)

1
// Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author sheiden
4
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
5
// $LastChangedRevision: 2378 $
6
// $LastChangedBy: sheiden $
7

    
8
// lists all the XML elements matching a given XPath:
9
// XPath is given by the XPath parameter
10
// elements are searched in a file (srcFile parameter) or in the files of a directory (srcDirectory).
11
// files in the directory can be filtered (filterByFileExtension parameter) by a specific file extension (fileExtension parameter)
12
// results can be localized in the source files by line and column number (lineNumber parameter)
13
// results can be wrapped to 100 characters max per line (wrapLines parameter)
14

    
15
package org.txm.macro.xml
16

    
17
// STANDARD DECLARATIONS
18

    
19
import javax.xml.bind.helpers.DefaultValidationEventHandler
20
import javax.xml.xpath.XPathConstants
21
import javax.xml.namespace.NamespaceContext
22
import javax.xml.XMLConstants
23

    
24
import net.sf.saxon.dom.DocumentBuilderFactoryImpl
25
import net.sf.saxon.xpath.XPathFactoryImpl
26
import net.sf.saxon.s9api.Processor
27
import net.sf.saxon.s9api.Serializer
28
import net.sf.saxon.s9api.XdmNode
29

    
30
import org.kohsuke.args4j.Option
31
import groovy.transform.Field
32

    
33
import org.txm.rcpapplication.swt.widget.parameters.*
34
import org.txm.importer.XPathResult
35

    
36
import org.apache.commons.lang3.text.*
37

    
38
// BEGINNING OF PARAMETERS
39

    
40
// Declare each parameter here
41
// (available widget types: Query, File, Folder, String, Text, Boolean, Integer, Float and Date)
42

    
43
// Macro parameters: each @Field/@Option pair declares one entry of the
// parameters dialog opened below.

// XML file to search; only used when the source directory is not usable.
@Field @Option(name="srcFile", usage="XML source file", widget="File", required=false, def="")
def srcFile

// Directory whose files are searched.
@Field @Option(name="srcDirectory", usage="XML source directory", widget="Folder", required=false, def="")
def srcDirectory

// When true, only the directory files whose name ends with fileExtension are searched.
@Field @Option(name="filterByFileExtension", usage="filter by file extension", widget="Boolean", required=false, def="true")
def filterByFileExtension

// File name suffix used by the filter above.
@Field @Option(name="fileExtension", usage="file extension to filter (eg .xml)", widget="String", required=false, def=".xml")
def fileExtension

// XPath expression evaluated against every searched document.
@Field @Option(name="XPath", usage="XPath expression", widget="String", required=true, def="//tei:title/text()")
def XPath

// When true, each result is prefixed with its line and column number.
@Field @Option(name="lineNumber", usage="print line number", widget="Boolean", required=false, def="true")
def lineNumber

// When true, each result is flattened to one line and wrapped at 100 characters.
@Field @Option(name="wrapLines", usage="wrap output lines longer than 100 characters", widget="Boolean", required=false, def="true")
def wrapLines

// Disabled parameter, kept for reference:
//@Field @Option(name="interactive", usage="open parameters dialog box", widget="Boolean", required=true, def="true")
//def interactive

// Open the parameters input dialog box and stop the macro when it returns a false value.
def dialogAccepted = ParametersDialog.open(this)
if (!dialogAccepted) {
    println "** Abandon de la macro."
    return false
}

// END OF PARAMETERS

// Print widths of the line and column numbers, read by printValue().
// Deliberately assigned without 'def' so that they are stored in the script
// binding and stay visible from the script's methods.
lineNumberLen = 0
columnNumberLen = 0
77

    
78
// Creates a namespace-aware DOM document builder backed by Saxon.
//
// Possible extension (not implemented): resolve DTDs locally through an OASIS
// XML catalog (for example with org.apache.xml.resolver.tools.CatalogResolver,
// mapping public/system identifiers to a local dummy DTD) or through an
// EntityResolver. With a SAX-based parse, the EntityResolver has to be set on
// the XMLReader that is handed to a SAXSource (saxSource.setXMLReader()).
def newDocBuilder = { ->
    def saxonFactory = new DocumentBuilderFactoryImpl()
    saxonFactory.setNamespaceAware(true)
    return saxonFactory.newDocumentBuilder()
}
123

    
124
// Parses 'file' with the given document builder and returns the resulting document.
def newDoc = { builder, file -> builder.parse(file) }
128

    
129
// Creates a Saxon XPath evaluator bound to the given Saxon configuration, with
// the namespace prefixes tei, me, bfm, txm, xi, xhtml, fn and xml predeclared so
// that they can be used directly in XPath expressions.
//
// config: configuration handed to the Saxon XPath factory
// returns: the configured XPath object
def newXpath = { config ->

    def xfactory = new XPathFactoryImpl()
    xfactory.setConfiguration(config)
    def xpath = xfactory.newXPath()

    // The namespace context is installed once; a second identical
    // setNamespaceContext() call would only replace it with a copy of itself.
    xpath.setNamespaceContext(new NamespaceContext() {

        // prefix -> namespace URI bindings known to this context
        Map uriByPrefix = [
            "tei"                       : "http://www.tei-c.org/ns/1.0",
            "me"                        : "http://www.menota.org/ns/1.0",
            "bfm"                       : "http://bfm.ens-lsh.fr/ns/1.0",
            "txm"                       : "http://textometrie.org/1.0",
            "xi"                        : "http://www.w3.org/2001/XInclude",
            "xhtml"                     : "http://www.w3.org/1999/xhtml",
            "fn"                        : "http://www.w3.org/2005/xpath-functions",
            (XMLConstants.XML_NS_PREFIX): XMLConstants.XML_NS_URI
        ]

        // Returns the URI bound to 'prefix', or XMLConstants.NULL_NS_URI when
        // the prefix is not bound. Throws NullPointerException on a null prefix.
        public String getNamespaceURI(String prefix) {
            if (prefix == null) throw new NullPointerException("Null prefix")
            if (uriByPrefix.containsKey(prefix)) return uriByPrefix.get(prefix)
            return XMLConstants.NULL_NS_URI
        }

        // Returns the prefix bound to 'uri', or null when the URI is not bound
        // (the value the NamespaceContext contract specifies for unbound URIs).
        // Throws NullPointerException on a null URI.
        public String getPrefix(String uri) {
            if (uri == null) throw new NullPointerException("Null namespace URI")
            for (entry in uriByPrefix) {
                if (entry.value.equals(uri)) return entry.key
            }
            return null
        }

        // Not supported by this context.
        public Iterator getPrefixes(String uri) { throw new UnsupportedOperationException() }
    })

    return xpath
}
231

    
232
// Evaluates an XPath expression against a document with the shared evaluator
// 'xpathProc' and returns the result requested as XPathConstants.NODESET.
def getXPath = { expression, document ->
    xpathProc.evaluate(expression, document, XPathConstants.NODESET)
}
236

    
237
// Shared document builder and XPath evaluator, stored in the script binding
// (no 'def') so that the closures and the driver code below can reach them.
builder = newDocBuilder()
config = builder.getConfiguration()
// Line numbering is only switched on when positions have to be printed.
if (lineNumber) config.setLineNumbering(true)
xpathProc = newXpath(config)
243

    
244
// Serializes a result node to an indented XML string without XML declaration.
//
// node: a result node exposing getUnderlyingNodeInfo()
// returns: the serialized node as a String
def serializeNode(node) {
        // Declared with 'def': undeclared names would be stored in the script
        // binding and shared between all calls.
        def processor = new Processor(false)
        def serializer = processor.newSerializer()
        // Other properties found here: http://www.saxonica.com/html/documentation/javadoc/net/sf/saxon/s9api/Serializer.Property.html
        serializer.setOutputProperty(Serializer.Property.OMIT_XML_DECLARATION, "yes")
        serializer.setOutputProperty(Serializer.Property.INDENT, "yes")
        def xdmNode = new XdmNode(node.getUnderlyingNodeInfo())
        return serializer.serializeNodeToString(xdmNode)
}
253

    
254
// Prints one result node on the standard output.
//
// When 'lineNumber' is set, the output starts with "<line>, <column>: ", both
// numbers being right-aligned on lineNumberLen / columnNumberLen characters.
// When 'wrapLines' is set, line breaks and runs of spaces of the serialized
// node are collapsed to single spaces and the text is wrapped at 100 characters.
//
// All node types (element, attribute, others) are printed the same way, so no
// dispatch on the node type is needed.
def printValue(node) {

        if (lineNumber) {
                def info = node.getUnderlyingNodeInfo()
                print sprintf("%${lineNumberLen}d, %${columnNumberLen}d: ", info.getLineNumber(), info.getColumnNumber())
        }

        def xml = serializeNode(node)
        if (wrapLines) {
                def flattened = xml.replaceAll(/\r\n+/, " ").replaceAll(/\r+/, " ").replaceAll(/\n+/, " ").replaceAll(/ +/, " ")
                println WordUtils.wrap(flattened, 100)
        } else {
                println xml
        }
}
301

    
302
// from http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
303
// Counts the lines of a file and measures its longest line.
//
// file: the file to scan
// returns: a two-element list [lineCount, maxColumn]
//   - lineCount: number of newline bytes, plus one when the file ends with an
//     unterminated line
//   - maxColumn: length in bytes of the longest line; the terminating newline
//     byte is counted, and a final unterminated line is taken into account
// throws IOException when the file cannot be read
//
// Lengths are counted in bytes, not characters, so multi-byte encodings give an
// upper bound, which is sufficient for computing a padding width.
def countLines(File file) throws IOException {
    final int NEWLINE = 0x0A
    InputStream input = new BufferedInputStream(new FileInputStream(file))
    try {
        byte[] buffer = new byte[1024]
        int lineCount = 0
        int maxColumn = 0
        int columnCount = 0
        int readBytes = 0
        while ((readBytes = input.read(buffer)) != -1) {
            for (int i = 0; i < readBytes; ++i) {
                columnCount++
                if (buffer[i] == NEWLINE) {
                    ++lineCount
                    maxColumn = Math.max(maxColumn, columnCount)
                    columnCount = 0
                }
            }
        }
        // Bytes left after the last newline form a final, unterminated line:
        // it counts as a line and may be the longest one.
        if (columnCount > 0) {
            ++lineCount
            maxColumn = Math.max(maxColumn, columnCount)
        }
        //println "maxColumn = "+maxColumn
        return [lineCount, maxColumn]
    } finally {
        input.close()
    }
}
332

    
333
// Driver: searches either every (optionally filtered) file of srcDirectory or,
// when the directory is not usable, the single file srcFile.

if ((srcDirectory==null || srcDirectory.size() == 0) && (srcFile==null || srcFile.size() == 0)) {
        println "** GetXPathMacro: at least a source file or a source directory must be specified."
        return
}

// Number of characters needed to print any integer between 1 and n.
// Values below 1 are treated as 1 so that the result is always a valid
// width for a "%<width>d" format.
def digitCount = { n -> String.valueOf(Math.max(n as int, 1)).length() }

// Evaluates the XPath expression on one file and prints every matching node,
// or "No result." when nothing matches. Shared by the directory and the
// single-file cases.
def searchFile = { xmlFile ->

        if (lineNumber) {
                // Size the position columns from the dimensions of this file.
                def (maxLine, maxColumn) = countLines(xmlFile)
                lineNumberLen = digitCount(maxLine)
                columnNumberLen = digitCount(maxColumn)
        }

        def doc = newDoc(builder, xmlFile)
        def res = getXPath(XPath, doc)

        if (res.getLength() == 0) {
                println "No result."
        } else {
                res.each { printValue(it) }
        }
}

if (srcDirectory!=null && srcDirectory.size() > 0 && srcDirectory.exists()) {

        def files = srcDirectory.listFiles()
        if (files == null || files.size() == 0) {
                println "** GetXPathMacro: No files in $srcDirectory"
                return
        }
        files.sort()

        def noFileSearched = true

        println "GetXPath: input directory = '$srcDirectory', XPath = $XPath"

        for (def xmlFile : files) {

                String name = xmlFile.getName()

                if (filterByFileExtension && !name.endsWith(fileExtension)) { continue }

                println "\n-- $name"

                noFileSearched = false

                searchFile(xmlFile)
        }

        if (noFileSearched) { println "** GetXPath: no file searched." }

} else if (srcFile!=null && srcFile.exists()) {

        def xmlFile = srcFile

        println "GetXPath: file = '$xmlFile', XPath = $XPath"

        searchFile(xmlFile)

} else {
        // Neither input can be used: report it instead of ending silently.
        println "** GetXPathMacro: the given source directory or source file does not exist."
}