Statistics
| Revision:

root / tmp / org.txm.tokenizer.core / src / org / txm / tokenizer / SimpleStringTokenizer.groovy @ 3057

History | View | Annotate | Download (6.8 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
21

    
22
package org.txm.tokenizer;
23

    
24
import java.io.File;
25
import java.util.LinkedHashMap;
26
import java.util.regex.Matcher;
27
import java.util.regex.Pattern;
28

    
29
import org.txm.tokenizer.TokenizerClasses;
30

    
31
/**
 * Splits a text into tokens with the regular expressions declared in
 * {@link TokenizerClasses}.
 *
 * The text is first cut on whitespaces, then every chunk is repeatedly reduced
 * by {@link #standardChecks(ArrayList, String)} until nothing is left of it.
 * Clitic handling depends on the language given to the constructor.
 */
public class SimpleStringTokenizer implements StringTokenizer {

    public final static String WHITESPACE = " ";

    public final static String EMPTY = "";

    /** The punct_strong, copied from TokenizerClasses. */
    String punct_strong = TokenizerClasses.punct_strong;

    /** The punct_all, copied from TokenizerClasses. */
    String punct_all = TokenizerClasses.punct_all;

    /** The word_chars, copied from TokenizerClasses. */
    String word_chars = TokenizerClasses.word_chars;

    /** Regex source of the clitics matched at the end of a chunk; null = clitics not managed. */
    String fclitics = null;

    /** Regex source of the clitics matched at the start of a chunk; null = clitics not managed. */
    String pclitics = null;

    /** Matches one Unicode punctuation character; not used inside this class. */
    Pattern reg_punct_other = Pattern.compile("\\p{P}");

    /** When true, some matching steps are traced on the standard output. */
    public boolean DEBUG = false;

    /** The language given to the constructor. */
    String lang;

    /** Compiled elision regex, null when TokenizerClasses.regElision is null or empty. */
    def regElision = null;

    /** Compiled regex isolating the first "..." of a chunk. */
    def reg3pts = null;

    /** Compiled punctuation regex, left unset when TokenizerClasses.regPunct is null or empty. */
    def regPunct;

    /** Compiled form of {@link #fclitics}, null when there is none. */
    def regFClitics = null;

    /** Compiled form of {@link #pclitics}, null when there is none. */
    def regPClitics = null;

    /** Declared but never assigned in this class. */
    String whitespaces;

    /** TokenizerClasses.whitespaces compiled with the Groovy pattern operator. */
    Pattern regWhitespaces;

    /** TokenizerClasses.whitespaces compiled for splitting the text in processText. */
    Pattern regSplitWhiteSpaces;

    /**
     * Instantiates a new simple string tokenizer.
     *
     * Clitics are configured from the language prefix: "en" and "gl" set the
     * final clitics, "fr" sets both final and initial clitics, "it" sets the
     * initial clitics. Any other value (or null) leaves clitics unmanaged.
     *
     * @param lang the language code, may be null
     */
    public SimpleStringTokenizer(String lang) {
        this.lang = lang;

        if (lang != null) {
            if (lang.startsWith("en")) {
                fclitics = TokenizerClasses.FClitic_en;
            } else if (lang.startsWith("fr")) {
                fclitics = TokenizerClasses.FClitic_fr;
                pclitics = TokenizerClasses.PClitic_fr;
            } else if (lang.startsWith("gl")) {
                fclitics = TokenizerClasses.FClitic_gl;
            } else if (lang.startsWith("it")) {
                pclitics = TokenizerClasses.PClitic_it;
            }
        }

        // punct_strong, punct_all and word_chars are already set by their field initializers.

        String strRegElision = TokenizerClasses.regElision;
        if (strRegElision != null && strRegElision.length() > 0) {
            regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string  ^
        }
        reg3pts = ~/^(.*?)(\.\.\.)(.*)$/

        String strRegPunct = TokenizerClasses.regPunct;
        if (strRegPunct != null && strRegPunct.length() > 0) {
            regPunct = ~/^(.*?)($strRegPunct)(.*)$/
        }
        if (fclitics != null && fclitics.length() > 0) {
            regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $
        }
        if (pclitics != null && pclitics.length() > 0) {
            // compiled once here (the pattern operator was missing), like the other regexes
            regPClitics = ~/^($pclitics)(.*)/ // the test must start with the start of string  ^
        }

        String strWhitespaces = TokenizerClasses.whitespaces;
        if (strWhitespaces != null && strWhitespaces.length() > 0) {
            regWhitespaces = ~strWhitespaces;
            regSplitWhiteSpaces = Pattern.compile(strWhitespaces);
        }
    }

    /**
     * Tokenizes a text.
     *
     * The text is split with the whitespace pattern when one is configured,
     * otherwise it is processed as a single chunk.
     *
     * @param text the text to tokenize; null yields an empty list
     * @return the tokens, in text order
     */
    public ArrayList<String> processText(String text) {
        ArrayList<String> result = new ArrayList<String>();
        if (text == null) {
            // Pattern.split rejects null; answer like the non-splitting branch does
            return result;
        }

        if (regSplitWhiteSpaces == null) {
            result.addAll(iterate(text));
            return result;
        }

        for (String chunk : regSplitWhiteSpaces.split(text)) { // separate with unicode white spaces
            result.addAll(iterate(chunk));
        }
        return result;
    }

    /**
     * Tokenizes one whitespace-free chunk by applying
     * {@link #standardChecks(ArrayList, String)} until the remainder is null or empty.
     *
     * @param s the chunk, may be null or empty
     * @return the tokens found in the chunk, possibly none
     */
    protected ArrayList<String> iterate(String s) {
        ArrayList<String> result = new ArrayList<String>();
        while (s != null && s.length() > 0) {
            s = standardChecks(result, s);
        }
        return result;
    }

    /**
     * Applies the first matching rule to the chunk, appends the tokens it
     * produces to the result and returns what is left to process.
     *
     * Rules are tried in this order: the TokenizerClasses tests, final clitics,
     * initial clitics, elision, "...", punctuation. When nothing matches the
     * whole chunk becomes one token.
     *
     * @param result the list receiving the tokens
     * @param s the chunk to analyse
     * @return the part of the chunk still to be processed, "" when there is none
     */
    public String standardChecks(ArrayList<String> result, String s) {
        Matcher m;

        for (TTest test : TokenizerClasses.tests) {
            if ((m = s =~ test.regex)) {
                if (DEBUG) {println "test : "+test.regex}
                if (test.before > 0) {
                    result.addAll(iterate(m[0][test.before]))
                }
                if (test.hit > 0) {
                    result.add(m[0][test.hit]);
                }
                if (test.after > 0) {
                    return m[0][test.after];
                }
                // NOTE(review): a matching test without an 'after' group does not stop
                // the analysis; the following rules still see the whole chunk. Confirm
                // this is intended against the TTest definitions.
            }
        }

        // The compiled regex is checked too: it stays null when the clitic string is empty.
        if (fclitics != null && regFClitics != null && (m = s =~ regFClitics)) {
            result.addAll(iterate(m.group(1)))
            result.add(m.group(2));
            return EMPTY;
        }

        if (pclitics != null && regPClitics != null && (m = s =~ regPClitics)) {
            if (DEBUG) println "PCLITIC found: $s ->"+ m
            result.add(m.group(1));
            result.addAll(iterate(m.group(2)))
            return EMPTY;
        }

        if (regElision != null && (m = s =~ regElision)) {
            // the elided form is kept as a single token, the rest is tokenized
            result.add(m.group(1));
            result.addAll(iterate(m.group(2)))
            return EMPTY;
        }

        if (reg3pts != null && (m = s =~ reg3pts)) {
            result.addAll(iterate(m.group(1)))
            result.add("...");
            return m.group(3);
        }

        if (regPunct != null && (m = s =~ regPunct)) {
            if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
            result.addAll(iterate(m.group(1)))
            result.add(m.group(2));
            return m.group(3);
        }

        result.add(s);
        return EMPTY;
    }

    /**
     * Tokenizes a few French and English sample sentences and prints the result.
     *
     * @param args the arguments, not used
     */
    public static void main(String[] args) {
        def tests = [
            ["fr", "c'est un test."],
            ["fr", "C'est un autre test."],
            [
                "fr",
                "C'est une version 0.8.1 un 01:12:12 test vers http://un.site.web.fr, fin."
            ],
            ["en", "This is a test."],
            ["en", "It's a test."]
        ]

        for (def d : tests) {
            String lang = d[0]
            String text = d[1]
            SimpleStringTokenizer tokenizer = new SimpleStringTokenizer(lang)
            println "Process: $text"
            println "Result : "+tokenizer.processText(text).collect{"<"+it+">"}
        }
    }
}