Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / TTTokenizer.groovy @ 1000

History | View | Annotate | Download (8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28

    
29

    
30
package org.txm.scripts.filters.Tokeniser
31
import java.io.File;
32
import java.util.regex.Pattern;
33
import java.util.regex.Matcher;
34

    
35
// TODO: Auto-generated Javadoc
36
/**
37
########################################################################
38
#                                                                      #
39
#  tokenization script for tagger preprocessing                        #
40
#  Author: Helmut Schmid, IMS, University of Stuttgart                 #
41
#          Serge Sharoff, University of Leeds                          #
42
#  Groovy Translation: mdecorde                                        #
43
#  Description:                                                        #
44
#  - splits input text into tokens (one token per line)                #
45
#  - cuts off punctuation, parentheses etc.                            #
46
#  - disambiguates periods                                             #
47
#  - preserves SGML markup                                             #
48
#                                                                      #
49
########################################################################
50
*/
51
class TTTokenizer {

	/** Matches a segment that consists of a single SGML/XML tag. */
	private static final Pattern SGML_TAG = Pattern.compile('^<.*>$');

	/** Matches one blank located inside an SGML/XML tag (group 1 = before, group 2 = after). */
	private static final Pattern BLANK_IN_TAG = Pattern.compile('(<[^<> ]*) ([^<>]*>)');

	/** Matches abbreviations of the form "A." or "U.S.A.". */
	private static final Pattern ABBREVIATION = Pattern.compile('^([A-Za-z-]\\.)+$');

	/** Matches a number followed by a period (e.g. "12."), which must not be split. */
	private static final Pattern NUMBER_WITH_PERIOD = Pattern.compile('^[0-9]+\\.$');

	/**
	 * Characters which have to be cut off at the beginning of a word.
	 * The value is inserted verbatim inside a regex character class.
	 */
	def PChar="\\[{(\\`\"‚„†‡‹‘’“”•–—›";

	/**
	 * Characters which have to be cut off at the end of a word.
	 * The value is inserted verbatim inside a regex character class.
	 */
	def FChar="\\]}'`\"),;:!?%‚„…†‡‰‹‘’“”•–—›";

	/**
	 * Regex alternation of the character sequences which have to be cut off
	 * at the beginning of a word; empty means "no proclitics".
	 */
	def PClitic = "";

	/**
	 * Regex alternation of the character sequences which have to be cut off
	 * at the end of a word; empty means "no enclitics".
	 */
	def FClitic = "";

	/** Explicitly listed tokens (abbreviations) that must never be split; only the keys are used. */
	def Token = [:];

	/** Legacy scratch property, kept only for backward compatibility; no longer used internally. */
	def m;

	/**
	 * Command line entry point.
	 *
	 * Usage: [input [output [lang [abbreviationFile]]]]. Every argument is
	 * optional; missing input/output fall back to the historical test paths.
	 *
	 * @param args the arguments
	 */
	public static void main(String[] args)
	{
		File input = new File(args.length > 0 ? args[0] : "/home/mdecorde/xml/tokenizer/test.xml");
		File output = new File(args.length > 1 ? args[1] : "/home/mdecorde/xml/tokenizer/test-t.xml");
		String lang = args.length > 2 ? args[2] : "";
		File abbr = args.length > 3 ? new File(args[3]) : null;

		def tokenizer = new TTTokenizer(lang, abbr);
		tokenizer.run(input, output)
	}

	/**
	 * Instantiates a new tokenizer.
	 *
	 * @param lang language code selecting the clitic rules: "en", "it" or "fr";
	 *             any other value (including null) disables clitic splitting
	 * @param abbr optional file listing one unsplittable token per line; lines
	 *             starting with '#' and blank lines are ignored. May be null.
	 */
	public TTTokenizer(String lang, File abbr) {
		switch(lang) {
		case "en": // English
			FClitic = "'(s|re|ve|d|m|em|ll)|n\'t";
			break;
		case "it": // Italian
			PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\''
			break;
		case "fr": // French
			PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
			// NOTE(review): "-mmes?" looks like "-mêmes?" with a lost accent - confirm before changing.
			FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-l';
			break;
		}

		if(abbr != null && abbr.canRead()) {
			// NOTE(review): the file is read with the platform default charset, as before.
			BufferedReader reader = new BufferedReader(new FileReader(abbr));
			try {
				String line;
				while((line = reader.readLine()) != null) {
					// trim leading, then trailing whitespace
					line = line.replaceAll('^[ \t\r\n]+', "");
					line = line.replaceAll('[ \t\r\n]+$', "");
					if(line.length() == 0 || line.startsWith("#")) // ignore blank lines and comments
						continue;
					this.Token.put(line, 1);
				}
			} finally {
				reader.close();
			}
		}
	}

	/**
	 * Tokenizes the input file and writes one token per line to the output file.
	 * SGML/XML tags are preserved and written on their own line.
	 *
	 * @param input the file to tokenize
	 * @param output the file to write (its directory must be writable)
	 * @return true if the output was written, false if the input cannot be
	 *         read or the output directory is not writable
	 */
	public boolean run(File input, File output) {
		if(!input.canRead()){
			println "Can't read input file: "+input
			return false;
		}
		// resolve against the working directory so a bare file name has a parent
		File outputDir = output.getAbsoluteFile().getParentFile();
		if(outputDir == null || !outputDir.canWrite()){
			println "Can't write out file: "+output
			return false;
		}

		// compiled here (not in the constructor) so that properties changed
		// after construction are still taken into account
		Map<String, Pattern> rx = compilePatterns();

		// NOTE(review): both files use the platform default charset, as before.
		Reader reader = new BufferedReader(new FileReader(input));
		try {
			Writer writer = new BufferedWriter(new FileWriter(output));
			try {
				String line;
				while((line = reader.readLine()) != null) {
					for(String segment : splitSegments(line)) {
						if(SGML_TAG.matcher(segment).find()) { // SGML tag: copy as is
							writer.write(segment);
							writer.write("\n");
							continue;
						}
						// add a blank at the beginning and the end of each segment
						String text = " " + segment + " ";
						// insert missing blanks after punctuation
						text = text.replaceAll('(\\.\\.\\.)', " ... ");
						text = text.replaceAll('([;\\!\\?])([^ ])', '$1 $2');
						text = text.replaceAll('([.,:])([^ 0-9.])', '$1 $2');

						for(String word : text.split(" ")) {
							for(String token : splitWord(word, rx)) {
								writer.write(token);
								writer.write("\n");
							}
						}
					}
				}
			} finally {
				writer.close();
			}
		} finally {
			reader.close();
		}
		return true;
	}

	/** Compiles the regular expressions that depend on the PChar/FChar/PClitic/FClitic properties. */
	private Map<String, Pattern> compilePatterns() {
		Map<String, Pattern> rx = new HashMap<String, Pattern>();
		rx.put("leading", Pattern.compile('^([' + this.PChar + '])(.*)$'));
		rx.put("trailing", Pattern.compile('^(.*)([' + this.FChar + '])$'));
		rx.put("punctPeriod", Pattern.compile('([' + this.FChar + '])\\.$'));
		if(this.PClitic != null && this.PClitic.length() > 0) {
			// at least one character must remain after the proclitic
			rx.put("proclitic", Pattern.compile('^(' + this.PClitic + ')(.+)$'));
		}
		if(this.FClitic != null && this.FClitic.length() > 0) {
			// reluctant, non-empty stem: the longest enclitic wins ("-t-il" rather than "-il")
			rx.put("enclitic", Pattern.compile('^(.+?)(' + this.FClitic + ')$'));
		}
		return rx;
	}

	/** Cuts a line into segments that are either one complete SGML tag or plain text. */
	private String[] splitSegments(String line) {
		// replace newlines and tab characters with blanks
		line = line.replaceAll("\t", " ");
		line = line.replaceAll("\n", " ");

		// protect blanks within SGML tags; repeated until stable because each
		// pass only protects one blank per tag
		String masked = BLANK_IN_TAG.matcher(line).replaceAll('$1ÿ$2');
		while(masked != line) {
			line = masked;
			masked = BLANK_IN_TAG.matcher(line).replaceAll('$1ÿ$2');
		}

		line = line.replaceAll(" ", "þ"); // replace whitespace with a special character
		line = line.replaceAll("ÿ", " "); // restore the blanks inside SGML tags
		line = line.replaceAll("þ", " ÿ"); // former whitespace becomes a segment boundary

		// isolate SGML tags as segments of their own
		line = line.replaceAll('(<[^<>]*>)', 'ÿ$1ÿ');
		line = line.replaceAll('^ÿ', "");
		line = line.replaceAll('ÿ$', "");
		line = line.replaceAll('ÿÿÿ*', "ÿ");

		return line.split("ÿ");
	}

	/** Splits one whitespace-free word into its tokens (punctuation, clitics, stem, periods) in output order. */
	private List<String> splitWord(String word, Map<String, Pattern> rx) {
		List<String> tokens = new ArrayList<String>();   // tokens preceding the stem
		LinkedList<String> suffix = new LinkedList<String>(); // tokens following the stem

		// separate punctuation and parentheses from words
		boolean changed = true;
		while(changed) {
			changed = false;

			// cut off preceding punctuation
			Matcher matcher = rx.get("leading").matcher(word);
			if(matcher.find()) {
				tokens.add(matcher.group(1));
				word = matcher.group(2);
				changed = true;
			}

			// cut off trailing punctuation
			matcher = rx.get("trailing").matcher(word);
			if(matcher.find()) {
				word = matcher.group(1);
				suffix.addFirst(matcher.group(2));
				changed = true;
			}

			// cut off trailing periods if punctuation precedes
			matcher = rx.get("punctPeriod").matcher(word);
			if(matcher.find()) {
				String punct = matcher.group(1);
				word = word.substring(0, matcher.start());
				suffix.addFirst(".");
				if(word.length() == 0)
					word = punct; // nothing left: let the next pass handle the punctuation
				else
					suffix.addFirst(punct);
				changed = true;
			}
		}

		if(word.length() > 0) {
			// explicitly listed tokens and abbreviations of the form A. or U.S.A. are kept whole
			boolean keepWhole = this.Token.containsKey(word) || ABBREVIATION.matcher(word).matches();

			// disambiguate periods: split a final period unless the word is "..." or a number
			if(!keepWhole && word.length() > 1 && word.endsWith(".")
					&& word != "..." && !NUMBER_WITH_PERIOD.matcher(word).matches()) {
				word = word.substring(0, word.length() - 1);
				suffix.addFirst(".");
				keepWhole = this.Token.containsKey(word);
			}

			// cut off clitics
			if(!keepWhole) {
				Pattern proclitic = rx.get("proclitic");
				if(proclitic != null) {
					Matcher matcher = proclitic.matcher(word);
					while(matcher.find()) {
						tokens.add(matcher.group(1));
						word = matcher.group(2);
						matcher = proclitic.matcher(word);
					}
				}
				Pattern enclitic = rx.get("enclitic");
				if(enclitic != null) {
					Matcher matcher = enclitic.matcher(word);
					while(matcher.find()) {
						// enclitics follow the stem in the output
						suffix.addFirst(matcher.group(2));
						word = matcher.group(1);
						matcher = enclitic.matcher(word);
					}
				}
			}
			tokens.add(word);
		}

		tokens.addAll(suffix);
		return tokens;
	}
}