Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / Tokeniser / TTTokenizer.groovy @ 1000

History | View | Annotate | Download (8 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 321 mdecorde
// $LastChangedDate:$
25 321 mdecorde
// $LastChangedRevision:$
26 321 mdecorde
// $LastChangedBy:$
27 321 mdecorde
//
28 321 mdecorde
29 321 mdecorde
30 1000 mdecorde
package org.txm.scripts.filters.Tokeniser
31 321 mdecorde
import java.io.File;
32 321 mdecorde
import java.util.regex.Pattern;
33 321 mdecorde
import java.util.regex.Matcher;
34 321 mdecorde
35 321 mdecorde
// TODO: Auto-generated Javadoc
36 321 mdecorde
/**
37 321 mdecorde
########################################################################
38 321 mdecorde
#                                                                      #
39 321 mdecorde
#  tokenization script for tagger preprocessing                        #
40 321 mdecorde
#  Author: Helmut Schmid, IMS, University of Stuttgart                 #
41 321 mdecorde
#          Serge Sharoff, University of Leeds                          #
42 321 mdecorde
#  Groovy Translation: mdecorde                                        #
43 321 mdecorde
#  Description:                                                        #
44 321 mdecorde
#  - splits input text into tokens (one token per line)                #
45 321 mdecorde
#  - cuts off punctuation, parentheses etc.                            #
46 321 mdecorde
#  - disambiguates periods                                             #
47 321 mdecorde
#  - preserves SGML markup                                             #
48 321 mdecorde
#                                                                      #
49 321 mdecorde
########################################################################
50 321 mdecorde
*/
51 321 mdecorde
class TTTokenizer {

        /**
         * Characters which have to be cut off at the beginning of a word.
         * The value is interpolated into a regex character class, hence the escapes.
         */
        def PChar="\\[{(\\`\"‚„†‡‹‘’“”•–—›";

        /**
         * Characters which have to be cut off at the end of a word.
         * The value is interpolated into a regex character class, hence the escapes.
         */
        def FChar="\\]}'`\"),;:!?%‚„…†‡‰‹‘’“”•–—›";

        /**
         * Regex alternation of character sequences (proclitics) which have to be
         * cut off at the beginning of a word; empty when the language has none.
         */
        def PClitic = "";

        /**
         * Regex alternation of character sequences (enclitics) which have to be
         * cut off at the end of a word; empty when the language has none.
         */
        def FClitic = "";

        /**
         * Command line entry point.
         *
         * Optional arguments, in order: input file, output file, language code
         * ("en", "it", "fr") and abbreviation file. Missing arguments fall back
         * to the historical hard-coded test files, no language and no
         * abbreviation file.
         *
         * @param args the arguments
         */
        public static void main(String[] args)
        {
                int count = (args == null) ? 0 : args.length;
                File input = new File(count > 0 ? args[0] : "/home/mdecorde/xml/tokenizer/test.xml");
                File output = new File(count > 1 ? args[1] : "/home/mdecorde/xml/tokenizer/test-t.xml");
                String lang = count > 2 ? args[2] : "";
                File abbr = count > 3 ? new File(args[3]) : null;

                def tokenizer = new TTTokenizer(lang, abbr);
                tokenizer.run(input, output)
        }

        /** Explicitly listed tokens (abbreviations) that must never be split; only the keys are used. */
        def Token = [:];

        /**
         * Instantiates a new tokenizer.
         *
         * @param lang language code selecting the clitic patterns: "en", "it" or
         *             "fr"; any other value leaves both clitic patterns empty
         * @param abbr optional file listing one token per line that must be kept
         *             as is; lines starting with '#' and blank lines are ignored.
         *             May be null or unreadable, in which case no list is loaded
         */
        public TTTokenizer(String lang, File abbr) {
                switch(lang) {
                case "en": // English
                          FClitic = "'(s|re|ve|d|m|em|ll)|n\'t";
                          break;
                case "it": // Italian
                          PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\''
                          break;
                case "fr": // French
                          PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
                          // "-mêmes?" and "-là" had lost their accented letters ("-mmes?", "-l")
                          FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-là';
                          break;
                }

                if(abbr != null && abbr.canRead()) {
                        // NOTE(review): the file is decoded with the platform default charset, as before
                        BufferedReader reader = new BufferedReader(new FileReader(abbr));
                        try {
                                String line = reader.readLine();
                                while(line != null) {
                                        // strip leading, then trailing whitespace
                                        line = line.replaceAll("^[ \t\r\n]+", "");
                                        line = line.replaceAll("[ \t\r\n]+\$", "");
                                        // ignore comments and blank lines
                                        if(line.length() > 0 && !line.startsWith("#"))
                                        {
                                                Token.put(line, 1);
                                        }
                                        line = reader.readLine();
                                }
                        } finally {
                                reader.close();
                        }
                }
        }

        /**
         * Scratch matcher property. Kept so the class keeps exposing it, but
         * tokenization now uses local matchers and no longer assigns it.
         */
        def m;

        /**
         * Tokenizes the input file and writes one token per line to the output
         * file. SGML tags are written unchanged on their own line.
         *
         * @param input the file to tokenize
         * @param output the file to write; its parent directory must be writable
         * @return true if the file was tokenized, false if the input cannot be
         *         read or the output directory cannot be written
         */
        public boolean run(File input, File output) {
                if(!input.canRead()){
                        println "Can't read input file: "+input
                        return false;
                }
                // resolve against the working directory so a bare file name has a parent
                File outputDir = output.getAbsoluteFile().getParentFile();
                if(outputDir == null || !outputDir.canWrite()){
                        println "Can't write out file: "+output
                        return false;
                }

                // NOTE(review): both files use the platform default charset, as before
                BufferedReader reader = new BufferedReader(new FileReader(input));
                try {
                        Writer writer = new BufferedWriter(new FileWriter(output));
                        try {
                                String line = reader.readLine();
                                while(line != null)
                                {
                                        line = line.replaceAll("\t"," "); // replace newlines and tab characters with blanks
                                        line = line.replaceAll("\n"," "); // replace newlines and tab characters with blanks

                                        // replace blanks within SGML tags, one blank per tag per pass
                                        String replace = line.replaceAll(/(<[^<> ]*) ([^<>]*>)/, '$1ÿ$2');
                                        while(replace != line)
                                        {
                                                line = replace;
                                                replace = replace.replaceAll(/(<[^<> ]*) ([^<>]*>)/, '$1ÿ$2');
                                        }
                                        line = replace;

                                        line = line.replaceAll(/ /, "þ"); // replace whitespace with a special character
                                        line = line.replaceAll(/ÿ/, " "); // restore blanks inside SGML tags
                                        line = line.replaceAll(/þ/, " ÿ"); // former blanks become segment separators

                                        // prepare SGML-Tags for tokenization
                                        line = line.replaceAll(/(<[^<>]*>)/, 'ÿ$1ÿ');
                                        line = line.replaceAll(/^ÿ/,"");
                                        line = line.replaceAll(/ÿ$/,"");
                                        line = line.replaceAll(/ÿÿÿ*/,"ÿ");

                                        for(String s : line.split("ÿ"))
                                        {
                                                if(s =~ /^<.*>$/) // SGML tag
                                                {
                                                        writer.write("$s\n");
                                                }
                                                else {
                                                        // add a blank at the beginning and the end of each segment
                                                        String padded = " " + s + " ";
                                                        // insert missing blanks after punctuation
                                                        padded = padded.replaceAll(/(\.\.\.)/, " ... ");
                                                        padded = padded.replaceAll(/([;\!\?])([^ ])/, '$1 $2');
                                                        padded = padded.replaceAll(/([.,:])([^ 0-9.])/, '$1 $2');

                                                        for(String s2 : padded.split(" "))
                                                        {
                                                                writeWordTokens(s2, writer);
                                                        }
                                                }
                                        }
                                        line = reader.readLine();
                                }
                        } finally {
                                writer.close();
                        }
                } finally {
                        reader.close();
                }
                return true;
        }

        /**
         * Splits one blank-delimited word into tokens and writes them, one per
         * line and in reading order: leading punctuation and proclitics, the
         * word itself, then enclitics and trailing punctuation.
         *
         * @param word the word to split; may be empty, in which case nothing is written
         * @param writer the writer receiving the tokens
         */
        private void writeWordTokens(String word, Writer writer) {
                String s2 = word;
                // tokens that follow the word, already newline-terminated
                String suffix = "";

                // separate punctuation and parentheses from words
                boolean finish = false;
                while(!finish)
                {
                        finish = true;

                        // cut off preceding punctuation
                        Matcher leading = (s2 =~ /^([$PChar])(.*)$/);
                        if(leading.find())
                        {
                                writer.write(leading.group(1)+"\n")
                                s2 = leading.group(2);
                                finish = false;
                        }

                        // cut off trailing punctuation
                        Matcher trailing = (s2 =~ /^(.*)([$FChar])$/);
                        if(trailing.find())
                        {
                                suffix = trailing.group(2) + "\n" + suffix;
                                s2 = trailing.group(1);
                                finish = false;
                        }

                        // cut off trailing periods if punctuation precedes
                        Matcher punctPeriod = (s2 =~ /([$FChar])\.$/);
                        if(punctPeriod.find()) {
                                String punct = punctPeriod.group(1);
                                // strip the punctuation character and the period, keep the rest of the word
                                s2 = s2.substring(0, s2.length() - punct.length() - 1);
                                suffix = ".\n" + suffix;
                                if(s2.length() == 0)
                                {
                                        // nothing else left: the punctuation character becomes the current token
                                        s2 = punct;
                                }
                                else
                                {
                                        suffix = punct + "\n" + suffix;
                                }
                                finish = false;
                        }
                }// end while

                // the word was punctuation only (or empty): just flush what was collected
                if(s2.length() == 0)
                {
                        writer.write(suffix);
                        return;
                }

                // handle explicitly listed tokens
                if(Token.containsKey(s2))
                {
                        writer.write(s2 + "\n" + suffix);
                        return;
                }

                // abbreviations of the form A. or U.S.A.
                if(s2 =~ /^([A-Za-z-]\.)+$/)
                {
                        writer.write(s2 + "\n" + suffix);
                        return;
                }

                // disambiguate periods; "..." and numbers such as "12." keep their period
                Matcher period = (s2 =~ /^(..*)\.$/);
                if(period.find() &&
                                (s2 != "...") &&
                                !(s2 ==~ /^[0-9]+\.$/))
                {
                        s2 = period.group(1);
                        suffix = ".\n" + suffix;
                        if(Token.containsKey(s2))
                        {
                                writer.write(s2 + "\n" + suffix);
                                return;
                        }
                }

                // cut off clitics at the beginning of the word
                if(PClitic.length() > 0)
                {
                        Matcher proclitic = (s2 =~ /^($PClitic)(.*)/);
                        while(proclitic.find())
                        {
                                writer.write(proclitic.group(1) + "\n")
                                s2 = proclitic.group(2);
                                proclitic = (s2 =~ /^($PClitic)(.*)/);
                        }
                }

                // cut off clitics at the end of the word; they are emitted after the word
                if(FClitic.length() > 0)
                {
                        Matcher enclitic = (s2 =~ /(.*)($FClitic)$/);
                        while(enclitic.find())
                        {
                                suffix = enclitic.group(2) + "\n" + suffix;
                                s2 = enclitic.group(1);
                                enclitic = (s2 =~ /(.*)($FClitic)$/);
                        }
                }

                if(s2.length() > 0)
                        writer.write(s2 + "\n");
                // always flush the suffix, even when the clitics consumed the whole word
                writer.write(suffix);
        }
}