Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / filters / Tokeniser / TEITokenizer.groovy @ 187

History | View | Annotate | Download (6.4 kB)

1
package filters.Tokeniser;
2

    
3
import java.net.URL;
4
import java.util.regex.Pattern;
5

    
6
import org.txm.importer.StaxIdentityParser;
7
import org.txm.tokenizer.*;
8

    
9
public class TEITokenizer extends StaxIdentityParser {
10

    
11
        /** Parser states; the current one decides how each parse event is handled. */
        enum STATES { WORD, TEXT, IGNORE, NOTE };

        // String constants. None of them is referenced inside this class except ENDOFLINE.
        static String PONTEST = "[.;:,!?]";
        static String TYPE = "type";
        static String ID = "id";
        static String PON = "pon";
        static String NUM = "num";
        /** Platform line separator, written after elements copied to the output. */
        static String ENDOFLINE = System.getProperty("line.separator");

        /** Trace level: diagnostic messages are printed to stdout while it is greater than 0. */
        int DEBUG = 0;

        /** Current state; parsing starts in IGNORE. */
        STATES state = STATES.IGNORE;
        /** Regex tested against element local names to enter and leave the TEXT state; reassigned when the first element is read. */
        String STARTTAG = ".*";
        /** Regex tested against element local names to enter and leave the NOTE state; becomes "note" when the root element is TEI. */
        String NOTETAG = "";

        /** Equals 1 until the first start element has been processed, then keeps growing. */
        int firstElement = 1;
        /** Base name of the input file, used as part of the generated word ids. */
        String filename = "";
        /** Number of words counted so far; also the source of the "n" attribute and of the id suffix. */
        int wordcount = 0;
        /** Local name of the last element whose start or end tag was processed. */
        String previousElem;

        /** Ordered tests applied to each text fragment (iterated as TTest in standardChecks). */
        def tests = TokenizerClasses.tests;

        // Settings copied from TokenizerClasses. Only word_tags is used in this class,
        // as a regex tested against element local names.
        def word_tags = TokenizerClasses.word_tags;
        def intraword_tags = TokenizerClasses.intraword_tags;
        def punct_strong = TokenizerClasses.punct_strong;
        def punct_all = TokenizerClasses.punct_all;
        def word_chars = TokenizerClasses.word_chars;
        def punct_quotes = TokenizerClasses.punct_quotes;

        /** Character data collected in the TEXT state, waiting to be tokenized. */
        StringBuffer buffer = new StringBuffer();
40

    
41
        /**
         * Builds a tokenizer for the document at the given URL.
         *
         * The URL is handed to the StaxIdentityParser constructor. The name of the
         * file, cut before its first dot (unless that dot is the first character),
         * is stored in {@code filename} and later used to build word ids.
         *
         * @param inputurl location of the document to tokenize
         */
        public TEITokenizer(URL inputurl) {
                super(inputurl);

                String name = new File(inputurl.getFile()).getName();
                int dot = name.indexOf(".");
                this.filename = (dot > 0) ? name.substring(0, dot) : name;
        }
49

    
50
        /**
         * Tokenizes the character data collected so far, then starts a new empty buffer.
         * Does nothing when the buffer is empty.
         */
        protected void processBuffer() {
                if (buffer.length() == 0) {
                        return;
                }
                String pending = buffer.toString();
                tokenize(pending);
                buffer = new StringBuffer();
        }
56

    
57
        /**
         * Handles a start tag according to the current state.
         *
         * On the very first element the start/note tag patterns are chosen: a TEI
         * root selects "text" and "note", any other root becomes the start tag itself.
         * Pending character data is tokenized before the tag is handled.
         */
        protected void processStartElement() {
                boolean firstSeen = (firstElement == 1);
                firstElement++;
                if (firstSeen) {
                        if (localname == "TEI") {
                                STARTTAG = "text";
                                NOTETAG = "note";
                        } else {
                                STARTTAG = localname;
                        }
                }

                processBuffer();

                if (state == STATES.IGNORE) {
                        // outside the text: copy the tag, and start tokenizing once the start tag shows up
                        super.processStartElement();
                        if (localname.matches(STARTTAG)) {
                                state = STATES.TEXT;
                        }
                } else if (state == STATES.TEXT) {
                        if (localname.matches(NOTETAG)) {
                                super.processStartElement();
                                state = STATES.NOTE;
                        } else if (localname.matches(word_tags)) {
                                state = STATES.WORD;
                                processStartWordElement();
                        } else {
                                // any other element inside the text is copied on a line of its own
                                super.processStartElement();
                                writer.writeCharacters(ENDOFLINE);
                        }
                } else {
                        // inside a note or a word: plain copy
                        super.processStartElement();
                }

                previousElem = localname;
        }
94

    
95
        /**
         * Handles an end tag according to the current state.
         *
         * Pending character data is tokenized first and the end tag is always copied
         * through the parent implementation. The state then falls back one level
         * when the tag closes the note, word or text element that opened it.
         */
        protected void processEndElement() {
                processBuffer();

                // the branch is chosen from the state as it was before the tag is copied
                STATES current = state;
                super.processEndElement();

                if (current == STATES.NOTE) {
                        if (localname.matches(NOTETAG)) {
                                writer.writeCharacters(ENDOFLINE);
                                state = STATES.TEXT;
                        }
                } else if (current == STATES.WORD) {
                        writer.writeCharacters(ENDOFLINE);
                        if (localname.matches(word_tags)) {
                                state = STATES.TEXT;
                        }
                } else if (current == STATES.TEXT) {
                        writer.writeCharacters(ENDOFLINE);
                        if (localname.matches(STARTTAG)) {
                                state = STATES.IGNORE;
                        }
                }
                // any other state (IGNORE): the copied end tag is all there is to do

                previousElem = localname;
        }
128

    
129
        /**
         * Copies the start tag of a word element already present in the input
         * (through the parent implementation) and counts it as one more word.
         */
        protected void processStartWordElement() {
                super.processStartElement();
                wordcount = wordcount + 1;
        }
133

    
134
        /**
         * Splits a run of character data into whitespace-separated fragments and
         * sends each fragment to {@link #iterate(String)}.
         *
         * @param text the character data to tokenize
         */
        protected void tokenize(String text) {
                if (DEBUG > 0) {println "buffer="+text}
                // Tabs, carriage returns, form feeds and line feeds are control characters too:
                // they must become spaces BEFORE the \p{C} clean-up, otherwise they are deleted
                // and the two words they separated are glued into a single token.
                text = text.replaceAll("\\s", " ");
                text = text.replaceAll("\\p{C}", "");                // remove the remaining ctrl characters
                for (String s : text.split("[\\p{Z}\\p{C}]+") ) {
                        // a leading separator yields an empty first fragment; iterate() ignores it
                        iterate(s.trim());
                }
        }
143

    
144
        /**
         * Consumes a text fragment piece by piece: standardChecks is called on the
         * fragment, then on whatever it returns, until nothing (null or an empty
         * string) is left.
         *
         * @param s the fragment to consume; null and "" are accepted and ignored
         * @return always null
         */
        protected iterate(String s)
        {
                for (String rest = s; rest != null && rest.length() > 0; rest = standardChecks(rest)) {
                        if (DEBUG > 0){println "  > $rest"}
                }
        }
157

    
158
        /**
         * Applies the tokenizer tests, in order, to a text fragment.
         *
         * For each test whose regex is found in the fragment:
         * the "before" capture group (if the test declares one) is tokenized through iterate,
         * the "hit" capture group (if any) is written as a {@code <w>} element carrying the
         * test type, and the "after" capture group (if any) is returned to the caller
         * as the text still to be processed.
         * When no test returns, the whole fragment is written as one {@code <w>} element.
         *
         * NOTE(review): a test that matches without an "after" group does not stop the
         * loop, so the remaining tests and the fallback still see the same fragment.
         * Confirm against the TTest definitions that this is intended.
         *
         * @param s the fragment to check, not null
         * @return the text left to process, or "" when the fragment has been fully written
         */
        protected standardChecks(String s)
        {
                if (DEBUG > 0){println "checks : "+s}
                def m;

                for (TTest test : tests) {
                        if ((m = s =~ test.regex)) {
                                if (DEBUG > 0){println "test : "+test.regex}
                                if (test.before > 0)
                                        iterate(m[0][test.before])

                                if (test.hit > 0) {
                                        // A stray "DEBUG--" used to sit here: it changed the trace level on
                                        // every hit and silently switched tracing off after a few words.
                                        wordcount++;
                                        writer.writeStartElement("w");
                                        writeWordAttributes();// id
                                        writer.writeAttribute("type",test.type);
                                        writer.writeCharacters(m[0][test.hit]);
                                        writer.writeEndElement();
                                        writer.writeCharacters("\n");
                                }
                                if (test.after > 0)
                                        return m[0][test.after];
                        }
                }

                // fallback: the fragment itself is the word
                if (DEBUG > 0) {println "Other : "+s}
                wordcount++;
                writer.writeStartElement("w");
                writeWordAttributes();// id
                // a fragment made of exactly one punctuation character is typed "pon"
                if (s.matches(/\p{P}/))
                        writer.writeAttribute("type","pon");
                else
                        writer.writeAttribute("type","w");
                writer.writeCharacters(s);
                writer.writeEndElement();
                writer.writeCharacters("\n");
                return "";
        }
209

    
210
        /**
         * Writes the identification attributes of the word element currently open
         * on the writer: "id" (w_ + file base name + _ + word number) and "n"
         * (the word number).
         *
         * @return nothing meaningful (the result of the last void call)
         */
        protected writeWordAttributes()
        {
                String number = String.valueOf(wordcount);
                writer.writeAttribute("id", "w_" + filename + "_" + number);
                writer.writeAttribute("n", number);
        }
220

    
221
        /**
         * Handles character data: in the TEXT state it is appended to the buffer
         * to be tokenized later; in every other state it is left to the parent
         * implementation.
         */
        protected void processCharacters()
        {
                if (state == STATES.TEXT) {
                        buffer.append(parser.getText());
                        return;
                }
                super.processCharacters();
        }
233

    
234
        /**
         * Developer test entry point: tokenizes the reference manual found under a
         * fixed directory, prints the outcome and the elapsed time in seconds, then
         * normalizes every file of the output directory (generated word ids removed,
         * xml:id renamed to id, bare {@code <w>} given a type).
         *
         * @param args ignored
         */
        public static void main(String[] args) {
                long start = System.currentTimeMillis();
                File dir = new File("/home/mdecorde/xml/txmrefman")
                File infile = new File(dir, "TEI/refman.xml");
                // the output directory is derived from dir once instead of being spelled out twice
                File outdir = new File(dir, "tokenization");
                File outfile = new File(outdir, "refman-t3.xml");

                TEITokenizer tokenizer = new TEITokenizer(infile.toURI().toURL());

                if (tokenizer.process(outfile))
                        println "Done - OK"
                else
                        println "Done - FAIL"

                println "time: "+(System.currentTimeMillis()-start)/1000

                File[] produced = outdir.listFiles();
                if (produced == null) {
                        // directory missing or unreadable: nothing to normalize
                        produced = new File[0];
                }
                for (File f : produced) {
                        if (!f.isFile()) continue; // sub-directories cannot be read as text

                        // All replacements are done before the file is reopened for writing,
                        // so a failure here cannot leave a truncated file behind.
                        String text = f.getText("UTF-8")
                        // [^"]+ instead of .+ : the match must stay inside one attribute value,
                        // otherwise two words on the same line are merged into a single deletion.
                        text = text.replaceAll(/ id="w_refman_[^"]+" n="[^"]+"/, "")
                        text = text.replaceAll("xml:id=", "id=")
                        text = text.replaceAll("<w>", "<w type=\"w\">")
                        f.write(text, "UTF-8")
                }
        }
264
}