Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / TokenizeEntities.groovy @ 1000

History | View | Annotate | Download (1.4 kB)

1
package org.txm.scripts.importer.transcriber
2

    
3
import org.txm.importer.StaxIdentityParser
4

    
5

    
6
/**
 * Wraps the content delimited by "entities" span events in "w" tags.
 *
 * When an {@code Event} element with {@code type="entities"} and
 * {@code extent="begin"} is read, a {@code w} start tag is written right after
 * that event element has been closed. When the {@code Event} with
 * {@code extent="end"} is read, the {@code w} element is closed just before the
 * event itself is processed by the parent parser.
 *
 * @author mdecorde
 */
class TokenizeEntities extends StaxIdentityParser {
    /** Counter reserved for word ids; unused while the id attribute is not written. */
    int wordcount = 1;
    /** True when a "begin" event was read and a "w" start tag must be written at the next end element. */
    boolean writeStartWord = false;
    /** True while at least one "w" element opened by this class is still waiting for its end tag. */
    boolean writeEndWord = false;
    /** Number of "w" elements opened by this class and not closed yet (entities may be nested). */
    private int openWords = 0;

    /**
     * @param url location of the XML file to process; handed to the parent parser
     */
    public TokenizeEntities(URL url) {
        super(url);
    }

    /**
     * Returns the "extent" attribute of the current element when it is an
     * {@code Event} of type "entities", or null for any other element.
     */
    private String entitiesExtent() {
        if (localname == "Event" && parser.getAttributeValue(null, "type") == "entities") {
            return parser.getAttributeValue(null, "extent");
        }
        return null;
    }

    /**
     * Closes the pending "w" element before an "end" entities event, delegates
     * to the parent parser, then schedules a new "w" after a "begin" event.
     */
    public void processStartElement() {
        // Close the previous word, but only if this class really opened one:
        // an unmatched "end" event must not close an unrelated enclosing element.
        if (entitiesExtent() == "end" && openWords > 0) {
            writer.writeEndElement();
            openWords--;
            writeEndWord = openWords > 0;
        }

        // presumably copies the current start element to the output — see StaxIdentityParser
        super.processStartElement();

        // Start a new word if necessary. The "w" tag itself is written in
        // processEndElement(), i.e. after the Event element has been closed.
        if (entitiesExtent() == "begin") {
            writeStartWord = true;
        }
    }

    /**
     * Delegates to the parent parser, then writes the "w" start tag that was
     * scheduled by a preceding "begin" entities event.
     */
    public void processEndElement() {
        super.processEndElement();

        if (writeStartWord) {
            writer.writeStartElement("w");
            //writer.writeAttribute("id", "w_entity_"+(wordcount++));
            writeStartWord = false;
            openWords++;
            writeEndWord = true;
        }
    }

    /**
     * Developer test entry point: processes "int05.trs" from a hard-coded
     * directory into "out.trs" and prints the value returned by process().
     */
    public static void main(String[] args) {
        File dir = new File ("C:\\Documents and Settings\\mdecorde\\xml\\minileman2011")
        File infile = new File(dir, "int05.trs")
        TokenizeEntities tokenizer = new TokenizeEntities(infile.toURI().toURL());
        println tokenizer.process(new File(dir, "out.trs"))
    }
}