root / tmp / org.txm.core / src / java / org / txm / scripts / importer / DisplayXmlTags.groovy @ 1688
History | View | Annotate | Download (4.8 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 881 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 881 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 881 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 881 | mdecorde | //
|
22 | 881 | mdecorde | //
|
23 | 881 | mdecorde | //
|
24 | 881 | mdecorde | // $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
|
25 | 881 | mdecorde | // $LastChangedRevision: 2386 $
|
26 | 881 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 881 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.importer;
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import java.util.HashMap; |
31 | 881 | mdecorde | import java.util.ArrayList; |
32 | 881 | mdecorde | import java.io.File; |
33 | 881 | mdecorde | import javax.xml.stream.*; |
34 | 1370 | mdecorde | |
35 | 1370 | mdecorde | import org.txm.utils.io.IOUtils |
36 | 1370 | mdecorde | |
37 | 881 | mdecorde | import java.net.URL; |
38 | 881 | mdecorde | |
39 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * Counts the elements of one XML file (or of every file of a directory) and
 * the number of non-blank characters found directly under each element.
 *
 * Elements are identified by their slash-separated path of local names from
 * the root, e.g. {@code /TEI/text/p}. Results accumulate over all processed
 * files in {@link #paths}, {@link #counts}, {@link #chars} and {@link #sum}.
 *
 * @author mdecorde
 */
class DisplayXmlTags {

	/** The element paths, in order of first appearance. */
	ArrayList<String> paths = new ArrayList<String>();

	/** Number of occurrences of each element path. */
	HashMap<String, Integer> counts = new HashMap<String, Integer>();

	/** Number of trimmed text characters found directly under each element path. */
	HashMap<String, Integer> chars = new HashMap<String, Integer>();

	/** Path of the element currently being read; empty when no element is open. */
	String currentpath = "";

	/** Total number of trimmed text characters over all processed files. */
	public int sum = 0;

	/**
	 * Parses the given file, or each file of the given directory, and
	 * accumulates the element and character counts.
	 *
	 * @param infile the XML file to parse, or a directory whose files are parsed
	 */
	public DisplayXmlTags(File infile) {
		if (infile.isDirectory()) {
			// NOTE(review): HIDDENFILE_FILTER presumably skips hidden files - confirm in IOUtils
			File[] children = infile.listFiles(IOUtils.HIDDENFILE_FILTER);
			// listFiles() returns null when the directory cannot be read
			if (children != null) {
				for (File f : children) {
					processxmlFile(f);
				}
			}
		} else {
			processxmlFile(infile);
		}
	}

	/**
	 * Reads one XML file with a StAX parser and updates the counters.
	 *
	 * Any exception is printed to standard output together with the file name
	 * and turned into a {@code false} result; counts gathered before the
	 * failure are kept.
	 *
	 * @param xmlfile the XML file to read
	 * @return true if the whole file was read, false if an exception occurred
	 */
	private boolean processxmlFile(File xmlfile)
	{
		InputStream inputData = null;
		// declared outside the try block so that the cleanup code can reach it
		XMLStreamReader parser = null;

		// start from the root: a previous file that failed mid-parse must not
		// leave its open elements as a prefix of this file's paths
		currentpath = "";

		try {
			URL url = xmlfile.toURI().toURL();
			inputData = url.openStream();
			XMLInputFactory factory = XMLInputFactory.newInstance();
			parser = factory.createXMLStreamReader(inputData);

			while (parser.hasNext()) {
				int event = parser.next();
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						// append the current tag to the current path
						currentpath += "/" + parser.getLocalName();
						// counts has exactly the same keys as paths: use the map for the lookup
						if (!counts.containsKey(currentpath)) {
							paths.add(currentpath);
							counts.put(currentpath, 0);
							chars.put(currentpath, 0);
						}
						counts.put(currentpath, counts.get(currentpath) + 1); // increment path count
						break;

					case XMLStreamConstants.END_ELEMENT:
						// remove "/" + tag name from the end of the path
						currentpath = currentpath.substring(0, currentpath.length() - 1 - parser.getLocalName().length());
						break;

					case XMLStreamConstants.CHARACTERS:
						int textLength = parser.getText().trim().length();
						// text reported while no element is open has no path to be counted under
						if (currentpath.length() > 0) {
							chars.put(currentpath, chars.get(currentpath) + textLength);
							sum += textLength;
						}
						break;
				}
			}
			return true;
		} catch (Exception e) {
			println("File "+xmlfile+"\n"+e);
			return false;
		} finally {
			// XMLStreamReader.close() does not close the underlying stream,
			// so both must be released; failures while closing are best-effort
			if (parser != null) {
				try {
					parser.close();
				} catch (Exception ignored) {
					// nothing useful can be done if closing fails
				}
			}
			if (inputData != null) {
				try {
					inputData.close();
				} catch (Exception ignored) {
					// nothing useful can be done if closing fails
				}
			}
		}
	}

	/**
	 * Gets the tag hierarchy.
	 *
	 * @return the element paths, in order of first appearance (the internal list, not a copy)
	 */
	public ArrayList<String> getTagHierarchy() {
		return paths;
	}

	/**
	 * Returns the number of occurrences of an element path.
	 *
	 * @param path the tag path (ex : "/TEI/text/p")
	 * @return the count, 0 if the path was never seen
	 */
	public int getCount(String path) {
		Integer count = counts.get(path);
		return count == null ? 0 : count;
	}

	/**
	 * Gets the counts.
	 *
	 * @return the occurrence count of every element path (the internal map, not a copy)
	 */
	public HashMap<String, Integer> getCounts() {
		return counts;
	}

	/**
	 * Returns the number of trimmed text characters found directly under an element path.
	 *
	 * @param path the tag path (ex : "/TEI/text/p")
	 * @return the character count, 0 if the path was never seen
	 */
	public int getChar(String path) {
		Integer count = chars.get(path);
		return count == null ? 0 : count;
	}

	/**
	 * Gets the chars.
	 *
	 * @return the character count of every element path (the internal map, not a copy)
	 */
	public HashMap<String, Integer> getChars() {
		return chars;
	}

	/**
	 * Prints the sorted element paths of a hard-coded file of the user home
	 * directory with their counts, then the total character count.
	 *
	 * @param args the arguments (unused)
	 */
	public static void main(String[] args)
	{
		String userhome = System.getProperty("user.home");
		DisplayXmlTags diag = new DisplayXmlTags(new File(userhome, "xml/manuelTXM/Manuel_TEI_FR_0_5.xml"));
		ArrayList<String> paths = diag.getTagHierarchy();
		Collections.sort(paths);
		for (String s : paths) {
			print s+" : "+diag.getCount(s);
			if (diag.getChar(s) > 0) {
				print " (chars "+diag.getChar(s)+")";
			}
			println ""
		}

		println "total chars : "+diag.sum;
	}
}