Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / swb / CountNumberOfChar.groovy @ 1000

History | View | Annotate | Download (2.5 kB)

1 1000 mdecorde
package org.txm.scripts.scripts
2 321 mdecorde
3 321 mdecorde
4 321 mdecorde
import java.util.HashMap;
5 321 mdecorde
import java.util.ArrayList;
6 321 mdecorde
import java.io.File;
7 321 mdecorde
import javax.xml.stream.*;
8 321 mdecorde
import java.net.URL;
9 321 mdecorde
10 986 mdecorde
import org.txm.scripts.importer.*;
11 321 mdecorde
12 321 mdecorde
/**
 * Counts the characters of the whitespace-separated tokens found in the text
 * content of a given XML element, accumulated over the XML files of a directory.
 *
 * An optional attribute filter can be set: when both an attribute name and a
 * regular expression are given, only the elements whose attribute value fully
 * matches the expression are counted.
 *
 * @author mdecorde
 */
class CountNumberOfChar {
        /** Sum of the lengths of all the tokens counted so far (whitespace excluded). */
        int total = 0;
        /** Number of counted elements (NOT the number of whitespace-separated tokens). */
        int nbWords = 0;
        /** Local name of the element whose text is counted. */
        String elem = "w";
        /** Name of the attribute used as filter, or null for no filter. */
        String attribute = null;
        /** Regular expression the attribute value must fully match, or null for no filter. */
        String regex = null;

        /**
         * Processes every file of the directory that is accepted by ValidateXml.test(f).
         *
         * @param dir a directory
         * @param elem the local name of the element to focus on
         * @param attribute the attribute to filter on, may be null (no filter)
         * @param regex the regular expression the attribute value must match, may be null (no filter)
         */
        public CountNumberOfChar(File dir, String elem, String attribute, String regex) {
                this.elem = elem;
                this.attribute = attribute;
                this.regex = regex;

                // listFiles() returns null when 'dir' is not a directory or cannot be read:
                // nothing to count in that case
                File[] files = dir.listFiles()
                if (files == null) {
                        return
                }

                for (File f : files) {
                        if (ValidateXml.test(f)) {
                                countInXML(f)
                        }
                }
        }

        /**
         * Parses one XML file and adds its counts to 'total' and 'nbWords'.
         * Errors are reported on the standard error stream and do not stop the process.
         *
         * @param f the XML file to read
         */
        protected void countInXML(File f)
        {
                println "count: "+f
                boolean startCount = false
                def inputData = null
                XMLStreamReader parser = null
                try {
                        URL url = f.toURI().toURL();
                        inputData = url.openStream();
                        def factory = XMLInputFactory.newInstance();
                        parser = factory.createXMLStreamReader(inputData);
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event) {
                                        case XMLStreamConstants.START_ELEMENT:
                                                if (parser.getLocalName() == elem && acceptsCurrentElement(parser)) {
                                                        startCount = true;
                                                        nbWords++
                                                }
                                                break;
                                        case XMLStreamConstants.END_ELEMENT:
                                                if (parser.getLocalName() == elem) {
                                                        startCount = false
                                                }
                                                break;
                                        case XMLStreamConstants.CHARACTERS:
                                                if (startCount) {
                                                        // tokenize() splits on whitespace, so only the characters
                                                        // of the tokens are counted, not the separators
                                                        for (String tok : parser.getText().tokenize()) {
                                                                total += tok.length()
                                                        }
                                                }
                                                break;
                                }
                        }
                }
                catch(Exception e){e.printStackTrace();}
                finally {
                        // XMLStreamReader.close() does not close the underlying stream: close both
                        if (parser != null) {
                                try { parser.close() } catch (Exception ignored) { }
                        }
                        if (inputData != null) {
                                try { inputData.close() } catch (Exception ignored) { }
                        }
                }
        }

        /**
         * Tells if the element the parser is currently positioned on passes the attribute filter.
         *
         * @param parser a parser positioned on a START_ELEMENT event
         * @return true if no filter is set (attribute or regex is null) or if the
         *         attribute is present and its value fully matches the regex
         */
        private boolean acceptsCurrentElement(XMLStreamReader parser) {
                if (attribute == null || regex == null) {
                        return true
                }
                String attrvalue = parser.getAttributeValue(null, attribute)
                return attrvalue != null && attrvalue.matches(regex)
        }

        /**
         * @return the mean number of token characters per counted element;
         *         NaN if no element has been counted
         */
        public float getMean() {
                return ((float)total)/(float)nbWords;
        }

        /**
         * @return the number of counted elements
         */
        public int getNbWords() {
                return nbWords;
        }

        /**
         * @return the total number of token characters
         */
        public int getTotal() {
                return total;
        }

        /**
         * Sample usage: prints the mean for the "form" elements of the bfm1 corpus
         * located in the user's TXM home directory.
         */
        public static void main(def args)
        {
                CountNumberOfChar c = new CountNumberOfChar(new File(System.getProperty("user.home")+"/TXM/corpora/bfm1/txm"), "form", null, null)
                //CountNumberOfChar c = new CountNumberOfChar(new File(System.getProperty("user.home")+"/TXM/corpora/bfm1/txm"), "form", "type", "pon")
                System.out.println("Moyenne: "+c.getMean());
        }
}