// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate:$
// $LastChangedRevision:$
// $LastChangedBy:$
//

package org.txm.scripts.filters.Tokeniser

import java.io.File;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

// TODO: Auto-generated Javadoc
/**
 ########################################################################
 #
 # tokenization script for tagger preprocessing
 # Author: Helmut Schmid, IMS, University of Stuttgart
 #         Serge Sharoff, University of Leeds
 # Groovy Translation: mdecorde
 #
 # Description:
 # - splits input text into tokens (one token per line)
 # - cuts off punctuation, parentheses etc.
 # - disambiguates periods
 # - preserves SGML markup
 #
 ########################################################################
 */
class TTTokenizer {

	// characters which have to be cut off at the beginning of a word

	/** The P char. */
	def PChar="\\[{(\\`\"‚„†‡‹‘’“”•–—›";

	// characters which have to be cut off at the end of a word

	/** The F char. */
	def FChar="\\]}'`\"),;:!?%‚„…†‡‰‹‘’“”•–—›";

	// character sequences which have to be cut off at the beginning of a word

	/** The P clitic. */
	def PClitic = "";

	// character sequences which have to be cut off at the end of a word

	/** The F clitic. */
	def FClitic = "";
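
	// Illustration (not part of the original script): with the character classes
	// above, run() turns an input word such as "(example)." into the four tokens
	// "(", "example", ")" and ".", one per output line.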

	/**
	 * The main method.
	 *
	 * @param args the arguments
	 */
	public static void main(String[] args)
	{
		File input = new File("/home/mdecorde/xml/tokenizer/test.xml");
		File output = new File("/home/mdecorde/xml/tokenizer/test-t.xml");

		def tokenizer = new TTTokenizer("", null);
		tokenizer.run(input, output)
	}


	/** Abbreviations that must be kept as single tokens (filled from the abbreviation file). */
	def Token = [:];

	/**
	 * Instantiates a new TTTokenizer.
	 *
	 * @param lang the language code used to select the clitic rules ("en", "it" or "fr")
	 * @param abbr an optional file listing abbreviations to keep as single tokens (may be null)
	 */
	public TTTokenizer(String lang, File abbr) {
		switch(lang) {
			case "en": // English
				FClitic = "'(s|re|ve|d|m|em|ll)|n\'t";
				break;
			case "it": // Italian
				PClitic = '[dD][ae]ll\'|[nN]ell\'|[Aa]ll\'|[lLDd]\'|[Ss]ull\'|[Qq]uest\'|[Uu]n\'|[Ss]enz\'|[Tt]utt\'';
				break;
			case "fr": // French
				PClitic = '[dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\'';
				FClitic = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m\'|-moi|-nous|-on|-toi|-tu|-t\'|-vous|-en|-y|-ci|-là';
				break;
		}
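
		// Example (illustration only): with lang = "fr", "l'arbre" is split into
		// "l'" + "arbre" and "dit-il" into "dit" + "-il" by the clitic rules above.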

		if(abbr != null && abbr.canRead()) {
			Reader reader = new FileReader(abbr);
			String line = reader.readLine();
			while(line != null) {
				line = line.replaceAll("^[ \t\r\n]+", ""); // strip leading whitespace
				line = line.replaceAll("[ \t\r\n]+\$", ""); // strip trailing whitespace
				if(!(line.startsWith("#") || line.length() == 0)) // ignore comments and empty lines
				{
					Token.put(line, 1);
				}
				line = reader.readLine();
			}
			reader.close();
		}
	}
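
	// Note (illustration, not from the original source): the abbreviation file read
	// in the constructor above is expected to contain one abbreviation per line, e.g.
	//   # comment line, ignored
	//   etc.
	//   M.
	// Blank lines and lines starting with '#' are skipped.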

	/** Holds the most recent regex match result. */
	def m;

	/**
	 * Runs the tokenizer.
	 *
	 * @param input the input file
	 * @param output the output file (one token per line)
	 * @return true, if successful
	 */
	public boolean run(File input, File output) {
		if(!input.canRead()){
			println "Can't read input file: "+input
			return false;
		}
		if(!output.getParentFile().canWrite()){
			println "Can't write output file: "+output
			return false;
		}

		Reader reader = new FileReader(input);
		Writer writer = new FileWriter(output);
		String line = reader.readLine();
		while(line != null)
		{
			println "line: "+line
			line = line.replaceAll("\t"," "); // replace tab characters with blanks
			line = line.replaceAll("\n"," "); // replace newlines with blanks

			String replace = line.replaceAll(/(<[^<> ]*) ([^<>]*>)/, '$1ÿ$2');
			while(replace != line) // protect blanks within SGML tags with 'ÿ'
			{
				line = replace;
				replace = replace.replaceAll(/(<[^<> ]*) ([^<>]*>)/, '$1ÿ$2');
			}
			line = replace;

			line = line.replaceAll(/ /, "þ"); // replace the remaining blanks with a special character
			// tr/\377\376/ \377/; // 377=ÿ , 376=þ
			line = line.replaceAll(/ÿ/, " "); // restore the blanks inside SGML tags
			line = line.replaceAll(/þ/, " ÿ"); // former whitespace becomes the segment separator ÿ
			//line = line.replaceAll(/ÿ/," ");

			// prepare SGML tags for tokenization: isolate each tag between 'ÿ' separators
			line = line.replaceAll(/(<[^<>]*>)/, 'ÿ$1ÿ');
			line = line.replaceAll(/^ÿ/,"");
			line = line.replaceAll(/ÿ$/,"");
			line = line.replaceAll(/ÿÿÿ*/,"ÿ");

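			// Illustration (not in the original script): after the preparation above, a line
			// such as
			//   <w id="1">Hello world</w>
			// has become
			//   <w id="1">ÿHello ÿworldÿ</w>
			// so the split("ÿ") below yields the tag, the text segments and the closing tag.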
			String[] split1 = line.split("ÿ");
			boolean finish = true;
			String suffix = "";
			for(String s : split1)
			{
				if(s =~ /^<.*>$/) // SGML tag
				{
					writer.write("$s\n");
				}
				else { // add a blank at the beginning and the end of each segment
					s = " $s ";
					// insert missing blanks after punctuation
					s = s.replaceAll(/(\.\.\.)/, " ... ");
					s = s.replaceAll(/([;\!\?])([^ ])/, '$1 $2');
					s = s.replaceAll(/([.,:])([^ 0-9.])/, '$1 $2');
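					// e.g. (illustration) "Yes!No" becomes "Yes! No"; "3.14" is left untouched
					// because digits following the period are excluded above.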

					for(String s2 : s.split(" "))
					{

						finish = false;
						suffix = "";
						// separate punctuation and parentheses from words
						while(!finish)
						{
							//println "suffix: "+suffix
							finish = true;
							// cut off preceding punctuation
							if((m = (s2 =~ /^([$PChar])(.*)$/)))
							{
								s2 = m[0][2]
								writer.write(m[0][1]+"\n")
								finish = false;
							}

							// cut off trailing punctuation
							if((m = (s2 =~ /^(.*)([$FChar])$/)))
							{
								s2 = m[0][1];
								suffix = m[0][2] + "\n$suffix";
								finish = false;

								if(s2.length() == 0)
									writer.write(suffix)
							}

							// cut off trailing periods if punctuation precedes
							if((m = (s2 =~ /^(.*)([$FChar])\.$/))) {
								suffix = ".\n$suffix";
								if(m[0][1].length() == 0)
								{
									s2 = m[0][2];
								}
								else
								{
									suffix = m[0][2]+"\n$suffix"
									s2 = m[0][1];
								}
								finish = false;
							}
						}// end while

						// handle explicitly listed tokens
						if(Token.containsKey(s2))
						{
							writer.write("$s2\n$suffix");
							continue;
						}

						// abbreviations of the form A. or U.S.A.
						if(s2 =~ /^([A-Za-z-]\.)+$/)
						{
							writer.write("$s2\n$suffix");
							continue;
						}

						// disambiguate periods
						if((m = (s2 =~ /^(..*)\.$/)) &&
							(s2 != "...") &&
							!(s2 ==~ /^[0-9]+\.$/))
						{
							s2 = m[0][1];
							suffix = ".\n$suffix";
							if(Token.containsKey(s2))
							{
								writer.write("$s2\n$suffix");
								continue;
							}
						}
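
						// Example (illustration): "etc." is written as a single token only when the
						// full form is listed in the abbreviation file; otherwise it is split into
						// "etc" and ".", while a purely numeric token like "1984." keeps its period.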

						// cut off clitics
						if(PClitic.length() > 0)
						{
							while((m = (s2 =~ /^($PClitic)(.*)/)))
							{
								s2 = m[0][2];
								writer.write(m[0][1] + "\n")
							}
						}

						if(FClitic.length() > 0)
						{
							while((m = (s2 =~ /(.*)($FClitic)$/)))
							{
								s2 = m[0][1];
								suffix = m[0][2] + "\n$suffix"; // queue enclitics so they follow the word
							}
						}
						if(s2.length() > 0)
							writer.write("$s2\n$suffix");
					}
				}
			}
			line = reader.readLine();
		}
		writer.close()
		reader.close()
		return true;
	}
}
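
// Usage sketch (hypothetical paths, not part of the original file):
//   def tok = new TTTokenizer("fr", new File("/path/to/french-abbreviations.txt"))
//   tok.run(new File("input.xml"), new File("input-tokenized.xml"))
// run() writes one token per line to the output file and returns true on success.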