Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / DisplayXmlTags.groovy @ 1688

History | View | Annotate | Download (4.8 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
// $LastChangedRevision: 2386 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.scripts.importer;
29

    
30
import java.util.HashMap;
31
import java.util.ArrayList;
32
import java.io.File;
33
import javax.xml.stream.*;
34

    
35
import org.txm.utils.io.IOUtils
36

    
37
import java.net.URL;
38

    
39
// TODO: Auto-generated Javadoc
40
/**
 * Counts the elements of one XML file (or of every file of a directory) and
 * the amount of text they directly contain.
 *
 * Elements are identified by their path from the root, built from local
 * names, for example {@code /TEI/text/p}. For each path the class records how
 * many start tags were seen and the cumulated length of the trimmed text
 * events read while that path was the innermost open element.
 *
 * @author mdecorde
 */
class DisplayXmlTags {

    /** The element paths, in order of first appearance. */
    ArrayList<String> paths = new ArrayList<String>();

    /** The number of start tags seen per path. */
    HashMap<String, Integer> counts = new HashMap<String, Integer>();

    /** The cumulated trimmed text length per path. */
    HashMap<String, Integer> chars = new HashMap<String, Integer>();

    /** The path of the innermost element currently open. */
    String currentpath = "";

    /** The cumulated trimmed text length over all processed files. */
    public int sum = 0;

    /**
     * Instantiates a new display xml tags and immediately processes the input.
     *
     * @param infile the file to parse, or a directory whose direct children
     *               (filtered with {@code IOUtils.HIDDENFILE_FILTER}) are parsed
     */
    public DisplayXmlTags(File infile) {
        if (infile.isDirectory()) {
            File[] children = infile.listFiles(IOUtils.HIDDENFILE_FILTER);
            // listFiles returns null when the directory cannot be read
            if (children != null) {
                for (File f : children) {
                    processxmlFile(f);
                }
            }
        } else {
            processxmlFile(infile);
        }
    }

    /**
     * Parses one file and adds its tags and text lengths to the statistics.
     * Any error is printed and reported through the return value; the
     * statistics gathered before the error are kept.
     *
     * @param xmlfile the file to parse
     * @return true if the whole file was read, false if an error occurred
     */
    private boolean processxmlFile(File xmlfile) {
        // declared outside the try so that the finally block can close them
        InputStream inputData = null;
        XMLStreamReader parser = null;

        // a previous file that failed mid-parse must not leak its open path
        currentpath = "";
        try {
            inputData = xmlfile.toURI().toURL().openStream();
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        // append the current tag to the current path
                        currentpath += "/" + parser.getLocalName();
                        Integer seen = counts.get(currentpath);
                        if (seen == null) {
                            // first occurrence: register the path everywhere
                            paths.add(currentpath);
                            chars.put(currentpath, 0);
                            seen = 0;
                        }
                        counts.put(currentpath, seen + 1);
                        break;
                    case XMLStreamConstants.END_ELEMENT:
                        // remove "/" + tag name from the end of the path
                        currentpath = currentpath.substring(0, currentpath.length() - 1 - parser.getLocalName().length());
                        break;
                    case XMLStreamConstants.CHARACTERS:
                        int length = parser.getText().trim().length();
                        Integer previous = chars.get(currentpath);
                        // defensive: ignore text reported while no registered element is open
                        if (previous != null) {
                            chars.put(currentpath, previous + length);
                        }
                        sum += length;
                        break;
                }
            }
            return true;
        } catch (Exception e) {
            println("File "+xmlfile+"\n"+e);
            return false;
        } finally {
            // a failure while closing must not hide the parse result
            if (parser != null) {
                try {
                    parser.close();
                } catch (Exception ignored) {
                }
            }
            if (inputData != null) {
                try {
                    inputData.close();
                } catch (Exception ignored) {
                }
            }
        }
    }

    /**
     * Gets the tag hierarchy.
     *
     * @return the list of the element paths found so far
     */
    public ArrayList<String> getTagHierarchy() {
        return paths;
    }

    /**
     * Returns the number of occurrences of a tag.
     *
     * @param path the tag path (ex : "/TEI/text/p")
     * @return the count, 0 if the path was never seen
     */
    public int getCount(String path) {
        Integer n = counts.get(path);
        return n == null ? 0 : n;
    }

    /**
     * Gets the counts.
     *
     * @return the occurrence count of every path
     */
    public HashMap<String, Integer> getCounts() {
        return counts;
    }

    /**
     * Returns the cumulated trimmed text length of a tag.
     *
     * @param path the tag path (ex : "/TEI/text/p")
     * @return the number of chars, 0 if the path was never seen
     */
    public int getChar(String path) {
        Integer n = chars.get(path);
        return n == null ? 0 : n;
    }

    /**
     * Gets the chars.
     *
     * @return the cumulated trimmed text length of every path
     */
    public HashMap<String, Integer> getChars() {
        return chars;
    }

    /**
     * Prints the sorted tag paths of a file with their counts.
     *
     * @param args optional: the path of the file or directory to analyse;
     *             when absent, "xml/manuelTXM/Manuel_TEI_FR_0_5.xml" under the
     *             user home directory is used
     */
    public static void main(String[] args) {
        File input;
        if (args != null && args.length > 0) {
            input = new File(args[0]);
        } else {
            String userhome = System.getProperty("user.home");
            input = new File(userhome, "xml/manuelTXM/Manuel_TEI_FR_0_5.xml");
        }

        DisplayXmlTags diag = new DisplayXmlTags(input);
        // sort a copy so that the order kept by the instance is untouched
        ArrayList<String> sorted = new ArrayList<String>(diag.getTagHierarchy());
        Collections.sort(sorted);
        for (String s : sorted) {
            print s+" : "+diag.getCount(s);
            if (diag.getChar(s) > 0) {
                print " (chars "+diag.getChar(s)+")";
            }
            println ""
        }

        println "total chars : "+diag.sum;
    }
}