Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / DisplayXmlTags.groovy @ 1688

History | View | Annotate | Download (4.8 kB)

1 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 881 mdecorde
// Sophia Antipolis, University of Paris 3.
5 881 mdecorde
//
6 881 mdecorde
// The TXM platform is free software: you can redistribute it
7 881 mdecorde
// and/or modify it under the terms of the GNU General Public
8 881 mdecorde
// License as published by the Free Software Foundation,
9 881 mdecorde
// either version 2 of the License, or (at your option) any
10 881 mdecorde
// later version.
11 881 mdecorde
//
12 881 mdecorde
// The TXM platform is distributed in the hope that it will be
13 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 881 mdecorde
// PURPOSE. See the GNU General Public License for more
16 881 mdecorde
// details.
17 881 mdecorde
//
18 881 mdecorde
// You should have received a copy of the GNU General
19 881 mdecorde
// Public License along with the TXM platform. If not, see
20 881 mdecorde
// http://www.gnu.org/licenses.
21 881 mdecorde
//
22 881 mdecorde
//
23 881 mdecorde
//
24 881 mdecorde
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
25 881 mdecorde
// $LastChangedRevision: 2386 $
26 881 mdecorde
// $LastChangedBy: mdecorde $
27 881 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.importer;
29 881 mdecorde
30 881 mdecorde
import java.util.HashMap;
31 881 mdecorde
import java.util.ArrayList;
32 881 mdecorde
import java.io.File;
33 881 mdecorde
import javax.xml.stream.*;
34 1370 mdecorde
35 1370 mdecorde
import org.txm.utils.io.IOUtils
36 1370 mdecorde
37 881 mdecorde
import java.net.URL;
38 881 mdecorde
39 881 mdecorde
// TODO: Auto-generated Javadoc
40 881 mdecorde
/**
 * Counts the elements of one XML file (or of every file of a directory) and
 * the amount of text they directly contain.
 *
 * Each element is identified by its path from the root, built from local
 * names (ex: "/TEI/text/p"). Results accumulate in the instance: when a
 * directory is given, the figures of all its files are merged.
 *
 * @author mdecorde
 */
class DisplayXmlTags {

    /** The element paths, in order of first appearance. */
    ArrayList<String> paths = new ArrayList<String>();

    /** The number of occurrences of each element path. */
    HashMap<String, Integer> counts = new HashMap<String, Integer>();

    /** The number of characters (trimmed text chunks) found directly under each element path. */
    HashMap<String, Integer> chars = new HashMap<String, Integer>();

    /** The path of the element currently being read; empty outside the root element. */
    String currentpath = "";

    /** The total number of characters counted, all paths together. */
    public int sum = 0;

    /**
     * Instantiates a new display xml tags and parses the given input right away.
     *
     * @param infile the file to parse, or a directory whose files (as selected
     *               by IOUtils.HIDDENFILE_FILTER) are all parsed
     */
    public DisplayXmlTags(File infile) {
        if (infile.isDirectory()) {
            for (File f : infile.listFiles(IOUtils.HIDDENFILE_FILTER)) {
                processxmlFile(f);
            }
        } else {
            processxmlFile(infile);
        }
    }

    /**
     * Parses one XML file and adds its figures to the instance counters.
     *
     * A parsing or I/O error is printed and reported through the return value;
     * what was counted before the error is kept.
     *
     * @param xmlfile the xmlfile
     * @return true, if successful
     */
    private boolean processxmlFile(File xmlfile)
    {
        // start from the root: a previous file that failed in the middle of
        // its parsing would otherwise leave a stale prefix in the path
        currentpath = "";

        // declared outside the try block so that the finally block can see them
        InputStream inputData = null;
        XMLStreamReader parser = null;
        try
        {
            URL url = xmlfile.toURI().toURL();
            inputData = url.openStream();
            XMLInputFactory factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                switch (event)
                {
                    case XMLStreamConstants.START_ELEMENT:
                        // append the current tag to the current path
                        currentpath += "/"+parser.getLocalName();
                        // counts holds exactly the registered paths: same test
                        // as paths.contains() without scanning the list
                        if (!counts.containsKey(currentpath))
                        {
                            paths.add(currentpath);
                            counts.put(currentpath, 0);
                            chars.put(currentpath, 0);
                        }
                        counts.put(currentpath, counts.get(currentpath)+1); // increment path count
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        // remove the "/" + tag suffix from the path
                        currentpath = currentpath.substring(0, currentpath.length() - 1 - parser.getLocalName().length());
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        int length = parser.getText().trim().length();
                        Integer previous = chars.get(currentpath);
                        // text reported for an unregistered path (outside the
                        // root element) is only added to the total
                        if (previous != null)
                            chars.put(currentpath, previous + length);
                        sum += length;
                        break;
                }
            }
            return true;
        }
        catch (Exception e)
        {
            println("File "+xmlfile+"\n"+e);
            return false;
        }
        finally
        {
            // XMLStreamReader.close() does not close the underlying stream:
            // both are released, and a failure while closing must not hide
            // the result of the parsing
            if (parser != null) {
                try { parser.close(); } catch (XMLStreamException ignored) { }
            }
            if (inputData != null) {
                try { inputData.close(); } catch (IOException ignored) { }
            }
        }
    }

    /**
     * Gets the tag hierarchy.
     *
     * @return the paths of the tags, in order of first appearance (the
     *         internal list, not a copy)
     */
    public ArrayList<String> getTagHierarchy() {
        return paths;
    }

    /**
     * return the counts of a tag.
     *
     * @param path the tag path (ex: "/TEI/text/p")
     * @return the number of occurrences of the path, 0 if it was never seen
     */
    public int getCount(String path) {
        Integer count = counts.get(path);
        return count == null ? 0 : count;
    }

    /**
     * Gets the counts.
     *
     * @return the number of occurrences of every tag path (the internal map,
     *         not a copy)
     */
    public HashMap<String, Integer> getCounts() {
        return counts;
    }

    /**
     * return the counts of chars of a tag.
     *
     * @param path the tag path (ex: "/TEI/text/p")
     * @return the number of characters of the path, 0 if it was never seen
     */
    public int getChar(String path) {
        Integer count = chars.get(path);
        return count == null ? 0 : count;
    }

    /**
     * Gets the chars.
     *
     * @return the number of characters of every tag path (the internal map,
     *         not a copy)
     */
    public HashMap<String, Integer> getChars() {
        return chars;
    }

    /**
     * The main method: prints the figures of a file located in the user home
     * directory (hard-coded path).
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args)
    {
        String userhome = System.getProperty("user.home");
        DisplayXmlTags diag = new DisplayXmlTags(new File(userhome, "xml/manuelTXM/Manuel_TEI_FR_0_5.xml"));
        ArrayList<String> paths = diag.getTagHierarchy();
        Collections.sort(paths);
        for (String s : paths)
        {
            print s+" : "+diag.getCount(s);
            if (diag.getChar(s) > 0)
                print " (chars "+diag.getChar(s)+")";

            println ""
        }

        println "total chars : "+diag.sum;
    }
}