Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / sql / XMLStatistics.groovy @ 1000

History | View | Annotate | Download (5.4 kB)

1
package org.txm.scripts.scripts
2

    
3
import java.text.Collator;
4
import java.util.Locale;
5

    
6
import org.txm.importer.StaxIdentityParser;
7

    
8
class XMLStatistics extends StaxIdentityParser {
9

    
10
        /** Regex of attribute local names that are left out of the key when attribute values are recorded. */
        static String UNICATTRS = "id|src|name|href"

        /** Key of the element currently being counted (element name or path, plus optional attribute text). */
        String path = ""

        /** Number of occurrences seen so far for each key. */
        def paths = [:]

        /** When true, keys are slash-separated element paths instead of bare element names. */
        boolean usePaths = false
        /** When true, attribute names are appended to each key. */
        boolean useAttributes = false
        /** When true (and useAttributes is true), attribute values are appended as well. */
        boolean useAttributeValues = false
15

    
16
        /**
         * Builds a counter for the XML document found at the given URL.
         *
         * @param url location handed unchanged to the StaxIdentityParser constructor
         */
        XMLStatistics(URL url) {
                super(url)
        }
19

    
20
        /**
         * Chooses between counting full element paths and counting bare element names.
         *
         * @param b true to build slash-separated paths, false to use the local name only
         */
        void setUsePaths(boolean b) {
                this.usePaths = b
        }
23

    
24
        /**
         * Chooses whether attribute names become part of the counted key.
         *
         * @param b true to append "@name" for the attributes of each element
         */
        void setUseAttributes(boolean b) {
                this.useAttributes = b
        }
27

    
28
        /**
         * Chooses whether attribute values become part of the counted key.
         * The flag is only consulted when attributes are enabled.
         *
         * @param b true to append "@name=value" instead of "@name"
         */
        void setUseAttributeValues(boolean b) {
                this.useAttributeValues = b
        }
31

    
32
        /**
         * Does nothing with character data.
         * The inherited implementation is deliberately not called; per the
         * original note its job is to write, which this class does not want.
         */
        void processCharacters() {
                // intentionally empty: no call to super.processCharacters()
        }
35
        
36
        /**
         * Leaves the current element: in path mode, drops the last path segment.
         * The inherited implementation is deliberately not called (nothing is written).
         */
        void processEndElement() {
                if (!usePaths) {
                        return
                }
                // NOTE(review): processStartElement appends attribute text to path, so a "/"
                // inside an attribute value would make this cut land inside that value.
                int cut = path.lastIndexOf("/")
                path = path.substring(0, cut)
        }
42

    
43
        /**
         * Builds the key for the element that just opened and counts it.
         * The key is the local name, or the running path extended with the local
         * name in path mode. With attributes enabled, "@name" is appended for every
         * attribute; with attribute values enabled, "@name=value" is appended
         * instead, skipping attributes whose local name matches UNICATTRS.
         * The inherited implementation is deliberately not called (nothing is written).
         */
        void processStartElement() {
                String elementName = this.parser.getLocalName()
                path = usePaths ? path + "/$elementName" : elementName

                if (useAttributes) {
                        int attrCount = parser.getAttributeCount()
                        for (int a = 0; a < attrCount; a++) {
                                String attrName = parser.getAttributeLocalName(a)
                                if (!useAttributeValues) {
                                        path += "@" + attrName
                                } else if (!attrName.matches(UNICATTRS)) {
                                        path += "@" + attrName + "=" + parser.getAttributeValue(a)
                                }
                        }
                }

                plusOne()
        }
66

    
67
        /**
         * Runs the parser over the document, sending its output to a throw-away
         * temporary file.
         * process(File) is not defined in this class; presumably it is inherited
         * from StaxIdentityParser.
         * The temporary file is removed even when parsing fails.
         */
        public void process() {
                File tmp = File.createTempFile("sdfs", "sdfsdf")
                try {
                        process(tmp)
                } finally {
                        // always clean up, otherwise a parse error leaves the file behind
                        tmp.delete()
                }
        }
72

    
73
        /** Adds one occurrence to the counter of the current key, creating it at 1 when unseen. */
        private void plusOne() {
                def seen = paths.get(path)
                paths.put(path, seen == null ? 1 : seen + 1)
        }
80

    
81
        /**
         * Prints one "key TAB count" line per counted key on standard output.
         *
         * @param sortByPath true to order lines by key with an en_US collator,
         *                   false to order them by decreasing count
         */
        void printCounts(boolean sortByPath) {
                def ordered = new ArrayList(paths.keySet())
                if (!sortByPath) {
                        // negated count gives the most frequent keys first
                        ordered.sort() { k -> -paths.get(k) }
                } else {
                        Collator usCollator = Collator.getInstance(new Locale("en", "US"))
                        usCollator.setStrength(Collator.TERTIARY)
                        Collections.sort(ordered, usCollator)
                }
                for (String k : ordered) {
                        println "$k\t"+paths.get(k)
                }
        }
94

    
95
        /**
         * Counts the elements of every matching file of a directory and writes a
         * tab-separated table: one row per key, one column per file, a final
         * TOTAL column and a final TOTAL row. Zero cells are written as a single
         * space. Rows are ordered by decreasing row total.
         *
         * @param srcdir directory whose direct children are scanned
         * @param outfile table destination, written in UTF-8
         * @param ext file name suffix to accept; null or empty falls back to ".xml"
         * @param usePaths see setUsePaths
         * @param useAttributes see setUseAttributes
         * @param useAttributeValues see setUseAttributeValues
         */
        public static void processDirectory(File srcdir, File outfile, String ext, boolean usePaths, boolean useAttributes, boolean useAttributeValues) {

                def results = [:] // file name -> (key -> count), in processing order
                def allkeys = new HashSet<String>()

                // ".xml" used to be hard-coded here; keep it as the fallback
                String suffix = ext ?: ".xml"

                // listFiles() returns null when srcdir is missing or is not a directory
                File[] candidates = srcdir.listFiles()
                if (candidates == null) candidates = new File[0]

                // counts
                for (File infile : candidates) {
                        if (!infile.isFile() || !infile.getName().endsWith(suffix)) continue

                        XMLStatistics diag = new XMLStatistics(infile.toURI().toURL())
                        diag.setUsePaths(usePaths)
                        diag.setUseAttributes(useAttributes)
                        diag.setUseAttributeValues(useAttributeValues)
                        diag.process()
                        results.put(infile.getName(), diag.getPaths())
                        allkeys.addAll(diag.getPaths().keySet())
                        println ""
                }

                def files = new ArrayList<String>(results.keySet())
                int totalCol = files.size() // index of the TOTAL column

                // table header
                def header = []
                header.addAll(files)
                header << "TOTAL"

                // Fill the rows and accumulate both margins in the same pass.
                // The column totals live in their own list, never in the table,
                // so they cannot be summed into themselves nor clash with an
                // element that happens to be named "TOTAL".
                def table = [:]
                def colmargins = new ArrayList(Collections.nCopies(totalCol + 1, 0))
                for (String key : allkeys) {
                        def row = []
                        int rowTotal = 0
                        for (int j = 0 ; j < totalCol ; j++) {
                                int c = results.get(files.get(j)).get(key) ?: 0
                                row << c
                                rowTotal += c
                                colmargins[j] += c
                        }
                        row << rowTotal
                        colmargins[totalCol] += rowTotal
                        table[key] = row
                }

                // sort the data rows only; the TOTAL row is always written last
                def rowkeys = new ArrayList<String>(allkeys)
                rowkeys.sort() { it -> -table[it][totalCol] }

                // print
                outfile.withWriter("UTF-8") { writer ->
                        def writeRow = { label, cells ->
                                writer.print label
                                for (def c : cells) {
                                        if (c == 0) { writer.print "\t " }
                                        else { writer.print "\t"+c }
                                }
                                writer.println ""
                        }

                        for (def h : header)
                                writer.print "\t"+h
                        writer.println ""
                        for (String key : rowkeys) {
                                writeRow(key, table[key])
                        }
                        writeRow("TOTAL", colmargins)
                }
        }
183
        
184
        /**
         * Command line entry point: writes one statistics table per option
         * combination for a directory of XML files.
         *
         * @param args optional; args[0] is the source directory and args[1] the
         *             output directory. Without arguments the historical
         *             hard-coded source directory is used, and the output
         *             directory defaults to a "stats" folder inside the source.
         */
        public static void main(String[] args) {
                File srcdir = new File(args.length > 0 ? args[0] : "/home/mdecorde/xml/temoignagesnice/corpus Matrice - fichiers xmlisés/")
                File outdir = args.length > 1 ? new File(args[1]) : new File(srcdir, "stats")

                // start from an empty output directory
                outdir.deleteDir()
                outdir.mkdirs()

                // Each letter is T or F for usePaths, useAttributes and
                // useAttributeValues, in that order; the name is also the
                // output file name. Combinations with attribute values but
                // without attributes are not generated.
                for (String flags : ["FFF", "FTF", "FTT", "TFF", "TTF", "TTT"]) {
                        processDirectory(srcdir, new File(outdir, flags + ".csv"), ".xml",
                                        flags[0] == "T", flags[1] == "T", flags[2] == "T")
                }

                // To process a single file instead:
                //   XMLStatistics diag = new XMLStatistics(xmlfile.toURI().toURL())
                //   diag.setUsePaths(true)
                //   diag.process()
                //   diag.printCounts(true)
        }
207
}
208