Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / DisplayXmlTags.groovy @ 479

History | View | Annotate | Download (4.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
25
// $LastChangedRevision: 2386 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.importer;
29

    
30
import java.util.HashMap;
31
import java.util.ArrayList;
32
import java.io.File;
33
import javax.xml.stream.*;
34
import java.net.URL;
35

    
36
// TODO: Auto-generated Javadoc
37
/**
 * Counts and displays the tags of one XML file, or of every file found
 * directly inside a directory.
 *
 * Each element is identified by its path from the root (for example
 * {@code /TEI/text/p}). For every path the class records how many times the
 * element was opened and how many trimmed text characters were read while
 * that element was the innermost open one.
 *
 * @author mdecorde
 */
class DisplayXmlTags {

    /** The paths of the tags, in order of first appearance. */
    ArrayList<String> paths = new ArrayList<String>();

    /** The number of start tags seen per path. */
    HashMap<String, Integer> counts = new HashMap<String, Integer>();

    /** The number of trimmed text characters seen per path. */
    HashMap<String, Integer> chars = new HashMap<String, Integer>();

    /** The path of the innermost element currently open in the parser. */
    String currentpath = "";

    /** The total number of trimmed text characters read from all files. */
    public int sum = 0;

    /**
     * Instantiates a new display xml tags and parses the input right away.
     *
     * @param infile the file to parse, or a directory whose direct children
     *               are all parsed
     */
    public DisplayXmlTags(File infile) {
        if (infile.isDirectory()) {
            // listFiles() returns null when the directory cannot be read.
            File[] children = infile.listFiles();
            if (children == null) {
                println("File "+infile+"\n"+"cannot list the directory content");
                return;
            }
            for (File f : children) {
                processxmlFile(f);
            }
        } else {
            processxmlFile(infile);
        }
    }

    /**
     * Parses one XML file and adds its tags to the counters.
     *
     * Parse errors are reported on standard output and do not propagate;
     * what was counted before the error is kept.
     *
     * @param xmlfile the xmlfile
     * @return true, if successful
     */
    private boolean processxmlFile(File xmlfile) {
        // Always start from the root: a previous file that failed in the
        // middle of the document would otherwise leave its partial path
        // behind and prefix every path of this file with it.
        currentpath = "";

        InputStream inputData = null;
        XMLStreamReader parser = null;
        try {
            URL url = xmlfile.toURI().toURL();
            inputData = url.openStream();
            XMLInputFactory factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        openElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        closeElement(parser.getLocalName());
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        addText(parser.getText());
                        break;
                }
            }
            return true;
        } catch (Exception e) {
            println("File "+xmlfile+"\n"+e);
            return false;
        } finally {
            // XMLStreamReader.close() does not close the underlying stream,
            // so both have to be released explicitly.
            if (parser != null) {
                try {
                    parser.close();
                } catch (XMLStreamException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            if (inputData != null) {
                try {
                    inputData.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /** Appends the tag to the current path and increments the path count. */
    private void openElement(String localName) {
        currentpath += "/" + localName;
        // counts holds exactly the registered paths, so the map lookup
        // replaces a linear scan of the paths list.
        if (!counts.containsKey(currentpath)) {
            paths.add(currentpath);
            counts.put(currentpath, 0);
            chars.put(currentpath, 0);
        }
        counts.put(currentpath, counts.get(currentpath) + 1);
    }

    /** Removes the closed tag, and its leading slash, from the current path. */
    private void closeElement(String localName) {
        currentpath = currentpath.substring(0, currentpath.length() - 1 - localName.length());
    }

    /** Adds the trimmed length of the text to the current path and to the total. */
    private void addText(String text) {
        int length = text.trim().length();
        Integer previous = chars.get(currentpath);
        // Text reported outside of any counted element has no entry; it
        // still contributes to the total.
        if (previous != null) {
            chars.put(currentpath, previous + length);
        }
        sum += length;
    }

    /**
     * Gets the tag hierarchy.
     *
     * @return the paths of the tags, in order of first appearance
     */
    public ArrayList<String> getTagHierarchy() {
        return paths;
    }

    /**
     * Returns the count of a tag.
     *
     * @param path the tag path (ex : /TEI/text/p)
     * @return the count, 0 if the path was never seen
     */
    public int getCount(String path) {
        Integer count = counts.get(path);
        return count == null ? 0 : count;
    }

    /**
     * Gets the counts.
     *
     * @return all the tags counts, indexed by path
     */
    public HashMap<String, Integer> getCounts() {
        return counts;
    }

    /**
     * Returns the count of chars of a tag.
     *
     * @param path the tag path (ex : /TEI/text/p)
     * @return the number of chars, 0 if the path was never seen
     */
    public int getChar(String path) {
        Integer count = chars.get(path);
        return count == null ? 0 : count;
    }

    /**
     * Gets the chars.
     *
     * @return all the tags char counts, indexed by path
     */
    public HashMap<String, Integer> getChars() {
        return chars;
    }

    /**
     * The main method: prints the sorted tag paths with their counts.
     *
     * @param args the arguments; the first one, when present, is the file or
     *             directory to analyse, otherwise the TXM manual located in
     *             the user home is used
     */
    public static void main(String[] args) {
        File input;
        if (args != null && args.length > 0) {
            input = new File(args[0]);
        } else {
            String userhome = System.getProperty("user.home");
            input = new File(userhome, "xml/manuelTXM/Manuel_TEI_FR_0_5.xml");
        }

        DisplayXmlTags diag = new DisplayXmlTags(input);

        // Sort a copy so that the order of first appearance kept by the
        // instance is not modified.
        ArrayList<String> sorted = new ArrayList<String>(diag.getTagHierarchy());
        Collections.sort(sorted);
        for (String s : sorted) {
            print(s+" : "+diag.getCount(s));
            if (diag.getChar(s) > 0) {
                print(" (chars "+diag.getChar(s)+")");
            }
            println("");
        }

        println("total chars : "+diag.sum);
    }
}