Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / XMLStaxTokenizerAdapter.groovy @ 1000

History | View | Annotate | Download (2.1 kB)

1
package org.txm.scripts.tal
2

    
3
import org.txm.importer.StaxIdentityParser;
4

    
5
/**
 * Experiment of Tokenizer that targets XPath results text.
 *
 * While parsing, character data found inside elements whose path matches one
 * of {@code xpaths} is replaced by pre-computed tokens wrapped in {@code <w>}
 * elements; everything else is copied through unchanged by the
 * {@link StaxIdentityParser} superclass.
 *
 * @author mdecorde
 */
class XMLStaxTokenizerAdapter extends StaxIdentityParser {

	def xpaths; // element paths (as strings) whose text content is tokenized
	boolean debug = false // print trace messages when true
	boolean doTokenize = false // true while the parser is inside a matched element
	String str; // string to tokenize: text accumulated since tokenization started
	def tokenizeResults = [] // pre-computed token lists, one entry per segment
	def nsegments = 0 // segments written so far; indexes tokenizeResults

	/**
	 * Builds an adapter for the given XML file.
	 *
	 * @param infile the XML file to parse
	 * @param xpaths element paths whose text content must be tokenized
	 * @param tokenizeResults pre-computed tokens, one list per segment
	 *        (optional; defaults to an empty list — previously, omitting it
	 *        made 2-argument calls such as the one in {@code main} fail with
	 *        a MissingMethodException)
	 */
	public XMLStaxTokenizerAdapter(File infile, def xpaths, def tokenizeResults = []) {
		super(infile.toURI().toURL());
		this.xpaths = xpaths
		this.tokenizeResults = tokenizeResults
	}

	/**
	 * Starts accumulating text when a matched element opens; when a child
	 * element opens inside a matched one, flushes the pending text first.
	 */
	protected void processStartElement() {
		if (debug) println "start element $currentXPath"
		if (doTokenize) {
			// a child element interrupts the text run: flush what we have
			if (debug) println "Found text element $currentXPath"
			tokenizeCurrentText();
		} else {
			if (xpaths.contains(currentXPath.toString())) {
				doTokenize = true;
				str = "";
			}
		}
		super.processStartElement()
	}

	/**
	 * Flushes the accumulated text and stops tokenizing when a matched
	 * element closes.
	 */
	protected void processEndElement() {
		if (xpaths.contains(currentXPath.toString())) {
			tokenizeCurrentText()
			doTokenize = false;
			if (debug) println "Found text element: stop tokenizing"
		}

		super.processEndElement();
	}

	/**
	 * Buffers character data while tokenizing; otherwise lets the identity
	 * parser copy it through to the output.
	 */
	protected void processCharacters() {
		if (doTokenize) {
			str += parser.getText();
		} else {
			super.processCharacters();
		}
	}

	/**
	 * Writes the pre-computed tokens of the current segment as {@code <w>}
	 * elements (space-separated) and advances the segment counter.
	 */
	protected tokenizeCurrentText() {
		// guard: nothing accumulated yet (str is null before the first match)
		if (str == null || str.length() == 0) return;
		def tokens = tokenizeResults[nsegments]
		for (def token : tokens) {
			writer.writeStartElement("w");
			writer.writeCharacters(token);
			writer.writeEndElement();
			writer.writeCharacters(" ");
		}

		nsegments++;
		str = ""; // reset so the same text is not flushed a second time
	}

	/**
	 * Best tokenizer ever. Extend this function.
	 *
	 * @param str the text to tokenize
	 * @return the tokens (Groovy default whitespace split)
	 */
	protected List<CharSequence> tokenize(String str) {
		return str.tokenize();
	}

	public static void main(String[] args) {
		File inputXMLFile = new File("/home/mdecorde/xml/comere/cmrclavardage/cmr-getalp_org-actu-tei-v1.xml");
		File outputXMLFile = new File("/home/mdecorde/xml/comere/cmrclavardage/tmp.xml");
		// relies on the constructor's default for tokenizeResults
		new XMLStaxTokenizerAdapter(inputXMLFile, ["/TEI/text/body/div/post/p/"]).process(outputXMLFile)
	}
}