root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / ReunitBrokenWords / ReunitBrokenWords.groovy @ 1000
History | View | Annotate | Download (6.1 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | //
|
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | //
|
24 | 321 | mdecorde | // $LastChangedDate:$
|
25 | 321 | mdecorde | // $LastChangedRevision:$
|
26 | 321 | mdecorde | // $LastChangedBy:$
|
27 | 321 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.filters.ReunitBrokenWords;
|
29 | 321 | mdecorde | //Pre-processing extra-word tags (2)
|
30 | 1000 | mdecorde | import org.txm.importer.scripts.filters.*; |
31 | 321 | mdecorde | import org.txm.tokenizer.*; |
32 | 321 | mdecorde | |
/**
 * Filter that consumes the current line tag by tag and re-emits it so that
 * "extra-word" tags end up on their own output lines while word-level and
 * intra-word tags stay inline with the surrounding text.
 *
 * The tag classes are regex fragments (alternations of tag names) that are
 * interpolated into the patterns used by {@link #filter()}. Their defaults come
 * from {@code TokenizerClasses} and can be replaced through
 * {@link #SetUsedParam(Object)}.
 *
 * NOTE(review): {@code line} and {@code output} are not declared in this class;
 * they are presumably provided by the {@code Filter} base class - confirm.
 */
class ReunitBrokenWords extends Filter
{

    /** Regex fragment matching any tag; used as the last-resort branch of {@link #filter()}. */
    def tag_all = TokenizerClasses.tag_all;

    /** Division tag names. Configurable, but not read by {@link #filter()}. */
    def div_tags = TokenizerClasses.div_tags;

    /** Quotation tag names. Configurable, but not read by {@link #filter()}. */
    def q_tags = TokenizerClasses.q_tags;

    /** Names of the tags emitted on their own output line (opening and closing forms). */
    def extraword_tags = TokenizerClasses.extraword_tags;

    /** Correction tag names without segmentation. Configurable, but not read by {@link #filter()}. */
    def corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;

    /** Correction tag names; inline when the tag carries "word_part", otherwise on their own line. */
    def corr_tags = TokenizerClasses.corr_tags;

    /** Word tag names. */
    def word_tags = TokenizerClasses.word_tags;

    /** Names of the tags that are always kept inline. */
    def intraword_tags = TokenizerClasses.intraword_tags;

    /** Full text of the tag most recently consumed by {@link #filter()}. */
    def tag;

    /** Never assigned by this class. */
    def tag_end;

    /** Name of the closing tag most recently consumed by {@link #filter()}. */
    def tag_name;

    /** Stack of the currently open correction tags that carried "word_part". */
    List<String> word_part_tags = [];

    /** Tags that only matched the generic {@code tag_all} pattern; reported by {@link #after()}. */
    Set<String> unknowned = new HashSet<String>();

    /** Current nesting level: "extraword" or "intraword". */
    private String level = "extraword";

    /**
     * Replaces the tag classes with the values found in the given map.
     *
     * A key that is absent (or mapped to null) leaves the corresponding field
     * unchanged instead of overwriting it with null, which would otherwise be
     * interpolated into the regexes as the literal text "null".
     *
     * Any exception (e.g. {@code args} is null or has no {@code get} method) is
     * reported on stderr and swallowed; fields assigned before the failure keep
     * their new value.
     *
     * @param args a Map with the keys div_tags, q_tags, extraword_tags,
     *             corr_tags_no_seg, corr_tags, word_tags, intraword_tags
     * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
     */
    void SetUsedParam(Object args)
    {
        try
        {
            div_tags = paramOrCurrent(args, "div_tags", div_tags);
            q_tags = paramOrCurrent(args, "q_tags", q_tags);
            extraword_tags = paramOrCurrent(args, "extraword_tags", extraword_tags);
            corr_tags_no_seg = paramOrCurrent(args, "corr_tags_no_seg", corr_tags_no_seg);
            corr_tags = paramOrCurrent(args, "corr_tags", corr_tags);
            word_tags = paramOrCurrent(args, "word_tags", word_tags);
            intraword_tags = paramOrCurrent(args, "intraword_tags", intraword_tags);
        }
        catch(Exception e)
        {
            System.err.println(e);
            System.err.println("ReunitBrokenWords needs 1 Map with args :\n div_tags, q_tags, extraword_tags, " +
                    "corr_tags_no_seg, corr_tags, word_tags, intraword_tags")
        }
    }

    /** Returns {@code args.get(key)}, or {@code current} when that value is null. */
    private static Object paramOrCurrent(Object args, String key, Object current)
    {
        def value = args.get(key);
        return (value != null) ? value : current;
    }

    /**
     * Does nothing; the body is empty and no value is explicitly returned.
     *
     * @see org.txm.importer.filters.Filter#before()
     */
    boolean before(){ }

    /**
     * Prints the set of unknown tags collected by {@link #filter()}, if any.
     *
     * @see org.txm.importer.filters.Filter#after()
     */
    void after()
    {
        if (!unknowned.isEmpty())
        {
            println "Unknown tags: "+unknowned;
        }
    }

    /**
     * Consumes {@code line} from the left until only whitespace remains.
     *
     * On each iteration the first matching rule wins (the order matters):
     * plain text, extra-word tag, opening correction tag, opening word tag,
     * opening intra-word tag, closing correction/word/intra-word tag, any
     * other tag. Anything else is reported with "Error in ..." and dropped.
     *
     * @see org.txm.importer.filters.Filter#filter()
     */
    void filter()
    {
        def m;

        while (!(line ==~ /\A\s*\Z/)) // while the line still holds non-blank characters
        {
            if ((m = line =~ /^([^<]*)$/)) // the line contains no '<' at all: plain text
            {
                output.print(line);
                line = " "
            }
            else if ((m = line =~ /\A\s*(<\/?($extraword_tags)( [^>]*|\/)?>)(.*)\Z/)) // extra-word tag, opening or closing
            {
                tag = m[0][1]; // the whole tag, brackets included
                line = m[0][4];
                level = "extraword";
                output.print("\n$tag \n");
            }
            else if ((m = line =~ /\A\s*(<($corr_tags)( [^>]*|\/)?>)(.*)\Z/)) // opening correction tag
            {
                tag = m[0][1];
                // Group 3 is the attribute part (or "/"); null when the tag has neither.
                // Kept in a local on purpose: the tag_end field is left untouched.
                String attrPart = m[0][3];
                line = m[0][4];
                if (attrPart != null && attrPart.find("word_part"))
                {
                    // The correction covers a part of a word: keep it inline and
                    // remember it so the matching close can restore the level.
                    word_part_tags.add(m[0][2]);
                    level = "intraword";
                    output.print(tag);
                }
                else
                {
                    level = "extraword";
                    output.print("\n$tag\n");
                }
            }
            else if ((m = line =~ /^\s*(<($word_tags)( [^>]*|\/)?>)(.*)$/)) // opening word tag
            {
                tag = m[0][1];
                line = m[0][4];
                // NOTE(review): the tag itself is not written; the remainder of the
                // line is written here AND left in `line`, so it is processed again
                // by the next iteration - confirm this is intended.
                output.write(line)
                level = "intraword";
            }
            else if ((m = line =~ /^\s*(<($intraword_tags)( [^>]*|\/)?>)(.*)$/)) // opening intra-word tag
            {
                tag = m[0][1];
                line = m[0][4];
                output.print(tag);
                level = "intraword";
            }
            else if ((m = line =~ /^\s*(<\/($corr_tags|$word_tags|$intraword_tags)>)\s*(.*)$/)) // closing tag
            {
                tag = m[0][1];
                tag_name = m[0][2];
                line = m[0][3];
                if (level ==~ /extraword/)
                {
                    output.print("\n");
                }

                output.print(tag);
                if (tag_name ==~ /^($corr_tags|$word_tags)$/)
                {
                    if (level ==~ /extraword/)
                    {
                        output.print("\n");
                    }
                    // Pop the innermost open word-part tag, if any; once none is
                    // left open we are back outside of a word.
                    if (word_part_tags.size() > 0)
                    {
                        word_part_tags.remove(word_part_tags.size()-1)
                    }
                    if (word_part_tags.size() == 0)
                    {
                        level = "extraword";
                    }
                }
            }
            else if ((m = line =~ /^\s*($tag_all)(.*)$/)) // any other tag
            {
                tag = m[0][1];
                line = m[0][2];
                output.print("\n$tag\n");
                unknowned.add(tag) // a Set: adding an already known tag is a no-op
            }
            else
            {
                println "Error in "+ line;
                line = " ";
            }
        }
    }
}