Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / ReunitBrokenWords / ReunitBrokenWords.groovy @ 1000

History | View | Annotate | Download (6.1 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
//
22 321 mdecorde
//
23 321 mdecorde
//
24 321 mdecorde
// $LastChangedDate:$
25 321 mdecorde
// $LastChangedRevision:$
26 321 mdecorde
// $LastChangedBy:$
27 321 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.filters.ReunitBrokenWords;
29 321 mdecorde
//Pre-processing extra-word tags (2)
30 1000 mdecorde
import org.txm.importer.scripts.filters.*;
31 321 mdecorde
import org.txm.tokenizer.*;
32 321 mdecorde
33 321 mdecorde
/**
 * Line filter that re-emits the current line tag by tag.
 *
 * Tags are classified with the regular-expression fragments taken from
 * {@code TokenizerClasses} (or overridden through {@link #SetUsedParam(Object)}):
 * <ul>
 * <li>extra-word tags and unknown tags are written on their own line;</li>
 * <li>intra-word tags, and correction tags whose attributes mention
 * {@code word_part}, are written inline so the surrounding text stays on
 * the same output line;</li>
 * <li>unknown tags are additionally collected and reported by {@link #after()}.</li>
 * </ul>
 *
 * NOTE(review): {@code line} and {@code output} are not declared in this class;
 * they are presumably inherited from {@code Filter} - confirm.
 */
class ReunitBrokenWords extends Filter
{
    /** Regex fragment used by the fallback branch of filter() to recognise any tag. */
    def tag_all = TokenizerClasses.tag_all;

    /** Regex fragment for division tags (settable, but not read by filter() in this class). */
    def div_tags = TokenizerClasses.div_tags;

    /** Regex fragment for quotation tags (settable, but not read by filter() in this class). */
    def q_tags = TokenizerClasses.q_tags;

    /** Regex fragment (alternation of tag names) for extra-word tags. */
    def extraword_tags = TokenizerClasses.extraword_tags;

    /** Regex fragment for non-segmenting correction tags (settable, but not read by filter() in this class). */
    def corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;

    /** Regex fragment (alternation of tag names) for correction tags. */
    def corr_tags = TokenizerClasses.corr_tags;

    /** Regex fragment (alternation of tag names) for word tags. */
    def word_tags = TokenizerClasses.word_tags;

    /** Regex fragment (alternation of tag names) for intra-word tags. */
    def intraword_tags = TokenizerClasses.intraword_tags;

    /** Text of the last tag matched by filter(), e.g. {@code <foo bar="x">}. */
    def tag;

    /** Never assigned by this class; kept because it is part of the class' properties. */
    def tag_end;

    /** Name of the last closing tag matched by filter(). */
    def tag_name;

    /** Stack of the names of the currently open correction tags that carry "word_part". */
    List<String> word_part_tags = [];

    /** Distinct tags that matched none of the known categories. */
    Set<String> unknowned = new HashSet<String>();

    /** Current nesting level: "extraword" or "intraword". */
    private String level = "extraword";

    /**
     * Overrides the tag regex fragments with the values found in a map.
     *
     * Recognised keys: div_tags, q_tags, extraword_tags, corr_tags_no_seg,
     * corr_tags, word_tags, intraword_tags. A key that is missing (or mapped
     * to null) leaves the corresponding default untouched, so that the
     * patterns built in filter() never end up containing the text "null".
     *
     * @param args a Map (or any object with a {@code get(String)} method);
     *             any failure while reading it is reported on System.err
     * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
     */
    void SetUsedParam(Object args)
    {
        try
        {
            def value;
            if ((value = args.get("div_tags")) != null) div_tags = value;
            if ((value = args.get("q_tags")) != null) q_tags = value;
            if ((value = args.get("extraword_tags")) != null) extraword_tags = value;
            if ((value = args.get("corr_tags_no_seg")) != null) corr_tags_no_seg = value;
            if ((value = args.get("corr_tags")) != null) corr_tags = value;
            if ((value = args.get("word_tags")) != null) word_tags = value;
            if ((value = args.get("intraword_tags")) != null) intraword_tags = value;
        }
        catch(Exception e)
        {
            System.err.println(e);
            System.err.println("ReunitBrokenWords needs 1 Map with args  :\n div_tags, q_tags, extraword_tags, " +
                    "corr_tags_no_seg, corr_tags, word_tags, intraword_tags")
        }
    }

    /**
     * Hook run before filtering; this filter has nothing to prepare.
     *
     * NOTE(review): the body is empty, so no explicit value is returned;
     * check how Filter interprets the result of before().
     *
     * @see org.txm.importer.filters.Filter#before()
     */
    boolean before(){        }

    /**
     * Hook run after filtering: prints the unknown tags met, if any.
     *
     * @see org.txm.importer.filters.Filter#after()
     */
    void after()
    {
        if(unknowned.size() > 0)
            println "Unknown tags: "+unknowned;
    }

    /**
     * Consumes {@code line} from left to right and writes it to {@code output}.
     *
     * Each iteration matches what is at the start of the remaining line, emits
     * it, and keeps the rest of the line for the next iteration; the loop ends
     * when only whitespace is left. The order of the branches matters because
     * the tag categories are tested one after the other.
     *
     * @see org.txm.importer.filters.Filter#filter()
     */
    void filter()
    {
        def m;

        while(!(line ==~ /\A\s*\Z/)) // as long as the line holds something other than whitespace
        {
            if ((m = line =~ /^([^<]*)$/)) // the remaining line contains no tag at all
            {
                output.print(line); // plain text: write it as is
                line = " " // nothing left to process
            }
            else if ((m = line =~ /\A\s*(<\/?($extraword_tags)( [^>]*|\/)?>)(.*)\Z/)) // extra-word tag (opening or closing)
            {
                tag = m[0][1]; // the whole tag text
                line = m[0][4];
                level = "extraword"; // remember that we are outside of a word
                output.print("\n$tag \n");
            }
            else if ((m = line =~ /\A\s*(<($corr_tags)( [^>]*|\/)?>)(.*)\Z/)) // opening correction tag
            {
                tag = m[0][1];
                // group 3 is the part between the tag name and '>' (attributes or '/'), may be null
                String attributes = m[0][3];
                line = m[0][4];
                if(attributes != null && attributes.find("word_part")) // the correction is a part of a word
                {
                    word_part_tags.add(m[0][2]);
                    level = "intraword";
                    output.print(tag); // stay on the same output line
                }
                else {
                    level = "extraword";
                    output.print("\n$tag\n");
                }
            }
            else if ((m = line =~ /^\s*(<($word_tags)( [^>]*|\/)?>)(.*)$/)) // opening word tag
            {
                tag = m[0][1];
                line = m[0][4];
                // NOTE(review): the tag itself is not written, and the rest of the
                // line is written here while still being kept in `line` for the
                // next iterations - confirm that this is intended.
                output.write(line)
                level = "intraword";
            }
            else if ((m = line =~ /^\s*(<($intraword_tags)( [^>]*|\/)?>)(.*)$/)) // opening intra-word tag
            {
                tag = m[0][1];
                line = m[0][4];
                output.print(tag);
                level = "intraword";
            }
            else if ((m = line =~ /^\s*(<\/($corr_tags|$word_tags|$intraword_tags)>)\s*(.*)$/)) // closing corr/word/intra-word tag
            {
                tag = m[0][1];
                tag_name = m[0][2];
                line = m[0][3];
                if (level == "extraword")
                {
                    output.print("\n");
                }

                output.print(tag);
                if (tag_name ==~ /^($corr_tags|$word_tags)$/)
                {
                    if (level == "extraword")
                    {
                        output.print("\n");
                    }
                    if(word_part_tags.size() > 0)
                    {
                        // leave one "word_part" nesting level; the tag name is not
                        // compared with the one that was pushed
                        word_part_tags.remove(word_part_tags.size()-1)
                        if(word_part_tags.size() == 0 )
                        {
                            level = "extraword";
                        }
                    }
                    else
                        level = "extraword"
                }
            }
            else if ((m = line =~ /^\s*($tag_all)(.*)$/)) // any other tag
            {
                tag = m[0][1];
                line = m[0][2];
                output.print("\n$tag\n");
                if(!unknowned.contains(tag))
                {
                    unknowned.add(tag) // reported once by after()
                }

            }
            else{
                // nothing matched (e.g. text followed by a tag): report and drop the rest
                println "Error in "+ line;
                line = " ";
            }
        }
    }
}