Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / swb / CountNumberOfChar.groovy @ 1688

History | View | Annotate | Download (2.5 kB)

1
package org.txm.scripts.scripts
2

    
3

    
4
import java.util.HashMap;
5
import java.util.ArrayList;
6
import java.io.File;
7
import javax.xml.stream.*;
8
import java.net.URL;
9

    
10
import org.txm.scripts.importer.*;
11

    
12
/**
 * Counts the characters of the whitespace-separated tokens found in the text
 * of a given XML element, over the XML files of a directory.
 *
 * Each matching start tag increments {@code nbWords}; the lengths of the
 * tokens read while inside a matching element are added to {@code total}.
 * Whitespace itself is never counted.
 *
 * An optional attribute filter can be set: when both {@code attribute} and
 * {@code regex} are non-null, only elements whose attribute value fully
 * matches the regex are counted.
 *
 * @author mdecorde
 */
class CountNumberOfChar {
    /** Sum of the lengths of all the tokens counted so far. */
    int total = 0;
    /** Number of matching elements encountered so far. */
    int nbWords = 0;
    /** Local name of the element whose text is counted. */
    String elem = "w";
    /** Name of the attribute used as filter, or null for no filter. */
    String attribute = null;
    /** Regex the attribute value must fully match, or null for no filter. */
    String regex = null;

    /**
     * Counts immediately in every file of {@code dir} accepted by
     * {@code ValidateXml.test}.
     *
     * @param dir a directory
     * @param elem the local name of the element to focus on
     * @param attribute the attribute to filter on, may be null
     * @param regex the regex the attribute value must match, may be null
     */
    public CountNumberOfChar(File dir, String elem, String attribute, String regex) {
        this.elem = elem;
        this.attribute = attribute;
        this.regex = regex;

        // listFiles() returns null when 'dir' is not a readable directory:
        // nothing to count in that case.
        File[] files = dir.listFiles();
        if (files == null) {
            return;
        }
        for (File f : files) {
            if (ValidateXml.test(f)) {
                countInXML(f)
            }
        }
    }

    /**
     * Parses one XML file and accumulates its counts into {@code total} and
     * {@code nbWords}. Errors are printed and the file is abandoned; counts
     * accumulated before the error are kept.
     *
     * @param f the XML file to read
     */
    protected void countInXML(File f) {
        println "count: "+f
        boolean startCount = false
        InputStream inputData = null
        XMLStreamReader parser = null
        try {
            inputData = f.toURI().toURL().openStream()
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData)
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        if (parser.getLocalName() == elem && matchesFilter(parser)) {
                            startCount = true
                            nbWords++
                        }
                        break
                    case XMLStreamConstants.END_ELEMENT:
                        // any closing tag of the element stops the counting,
                        // whether or not its start tag passed the filter
                        if (parser.getLocalName() == elem) {
                            startCount = false
                        }
                        break
                    case XMLStreamConstants.CHARACTERS:
                        if (startCount) {
                            total += countTokenChars(parser.getText())
                        }
                        break
                }
            }
        } catch (Exception e) {
            e.printStackTrace()
        } finally {
            // always release the parser and the stream, even after a parse error
            if (parser != null) {
                try {
                    parser.close()
                } catch (Exception e) {
                    e.printStackTrace()
                }
            }
            if (inputData != null) {
                try {
                    inputData.close()
                } catch (Exception e) {
                    e.printStackTrace()
                }
            }
        }
    }

    /** Tells if the current start element passes the attribute filter (always true when no filter is set). */
    private boolean matchesFilter(XMLStreamReader parser) {
        if (attribute == null || regex == null) {
            return true
        }
        String attrvalue = parser.getAttributeValue(null, attribute)
        return attrvalue != null && attrvalue.matches(regex)
    }

    /** Returns the summed length of the whitespace-separated tokens of {@code text}. */
    private static int countTokenChars(String text) {
        if (text == null) {
            return 0
        }
        int count = 0
        // split on whitespace runs: iterating the String directly would walk
        // it character by character and count the inner whitespace too
        for (String tok : text.trim().split("\\s+")) {
            count += tok.length()
        }
        return count
    }

    /**
     * @return the mean number of characters per counted element; NaN when no
     *         element has been counted
     */
    public float getMean() {
        return ((float)total)/(float)nbWords;
    }

    /** @return the number of matching elements encountered */
    public int getNbWords() {
        return nbWords;
    }

    /** @return the total number of token characters counted */
    public int getTotal() {
        return total;
    }

    /**
     * Sample run over the "form" elements of the bfm1 corpus found in the
     * user's TXM directory; prints the mean.
     */
    public static void main(def args) {
        CountNumberOfChar c = new CountNumberOfChar(new File(System.getProperty("user.home")+"/TXM/corpora/bfm1/txm"), "form", null, null)
        // with an attribute filter:
        // new CountNumberOfChar(new File(System.getProperty("user.home")+"/TXM/corpora/bfm1/txm"), "form", "type", "pon")
        System.out.println("Moyenne: "+c.getMean());
    }
}