Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / ReunitBrokenWords / ReunitBrokenWords.groovy @ 1000

History | View | Annotate | Download (6.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.scripts.filters.ReunitBrokenWords;
29
//Pre-processing extra-word tags (2)
30
import org.txm.importer.scripts.filters.*;
31
import org.txm.tokenizer.*;
32

    
33
/**
 * Line-oriented filter that peels the leading XML tag off the current
 * {@code line}, one tag per loop iteration, and writes it to {@code output}.
 *
 * Tags are classified by matching them against regular-expression fragments
 * (the {@code *_tags} fields). Depending on the family and on the current
 * {@code level}, a tag is written either surrounded by line breaks
 * ("extraword") or inline with the surrounding text ("intraword").
 *
 * {@code line} and {@code output} are not declared in this class; they are
 * presumably inherited from {@code Filter} (to be confirmed against it).
 */
class ReunitBrokenWords extends Filter
{

    /** Regex fragment used as the catch-all for tags that matched no other family. */
    def tag_all = TokenizerClasses.tag_all;

    /** Regex fragment for division tags; overridable, not read by filter(). */
    def div_tags = TokenizerClasses.div_tags;

    /** Regex fragment for quotation tags; overridable, not read by filter(). */
    def q_tags = TokenizerClasses.q_tags;

    /** Regex fragment for tag names that are always written on their own line. */
    def extraword_tags = TokenizerClasses.extraword_tags;

    /** Regex fragment for non-segmenting correction tags; overridable, not read by filter(). */
    def corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;

    /** Regex fragment for correction tag names. */
    def corr_tags = TokenizerClasses.corr_tags;

    /** Regex fragment for word tag names. */
    def word_tags = TokenizerClasses.word_tags;

    /** Regex fragment for tag names that are written inline. */
    def intraword_tags = TokenizerClasses.intraword_tags;

    /** Full text of the tag most recently peeled off the line. */
    def tag;

    /** Never assigned in this class (filter() uses a local for the attribute text). */
    def tag_end;

    /** Name of the most recently matched closing tag. */
    def tag_name;

    /** Stack of the names of the currently open correction tags that carry "word_part". */
    List<String> word_part_tags = [];

    /** Distinct tags that only matched the catch-all pattern; reported by after(). */
    Set<String> unknowned = new HashSet<String>();

    /** Current output mode: "extraword" (tags on their own line) or "intraword" (inline). */
    private String level = "extraword";

    /**
     * Overrides the tag-family regex fragments from a map-like argument.
     *
     * Recognised keys: div_tags, q_tags, extraword_tags, corr_tags_no_seg,
     * corr_tags, word_tags, intraword_tags. A key that is absent (or mapped to
     * null) leaves the current value untouched, so the TokenizerClasses default
     * is not replaced by null and later interpolated into a regex as the text
     * "null".
     *
     * Any exception raised while reading the argument (for instance a null
     * argument) is reported on stderr and otherwise ignored.
     *
     * @param args object exposing {@code get(String)}, typically a Map
     * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
     */
    void SetUsedParam(Object args)
    {
        try
        {
            div_tags = paramOrCurrent(args, "div_tags", div_tags);
            q_tags = paramOrCurrent(args, "q_tags", q_tags);
            extraword_tags = paramOrCurrent(args, "extraword_tags", extraword_tags);
            corr_tags_no_seg = paramOrCurrent(args, "corr_tags_no_seg", corr_tags_no_seg);
            corr_tags = paramOrCurrent(args, "corr_tags", corr_tags);
            word_tags = paramOrCurrent(args, "word_tags", word_tags);
            intraword_tags = paramOrCurrent(args, "intraword_tags", intraword_tags);
        }
        catch (Exception e)
        {
            System.err.println(e);
            System.err.println("PreProcess2 needs 1 Map with args  :\n div_tags, q_tags, extraword_tags, " +
                    "corr_tags_no_seg, corr_tags, word_tags, intraword_tags")
        }
    }

    /**
     * Looks up {@code key} in {@code args}; returns {@code current} when the
     * lookup yields null. The null test is explicit (not Groovy truth) so that
     * an empty-string override is still honoured.
     */
    private static Object paramOrCurrent(Object args, String key, Object current)
    {
        def supplied = args.get(key);
        return supplied != null ? supplied : current;
    }

    /**
     * No setup is performed. The body is intentionally left empty, so the
     * method yields Groovy's implicit result for a boolean method.
     * NOTE(review): confirm which value Filter expects from before().
     *
     * @see org.txm.importer.filters.Filter#before()
     */
    boolean before() { }

    /**
     * Prints the set of tags that only matched the catch-all pattern, if any.
     *
     * @see org.txm.importer.filters.Filter#after()
     */
    void after()
    {
        if (!unknowned.isEmpty())
        {
            println "Unknown tags: "+unknowned;
        }
    }

    /**
     * Consumes {@code line} from the left until only whitespace remains.
     * Each iteration tries the tag families in a fixed order (plain text,
     * extraword, opening correction, opening word, opening intraword, closing
     * correction/word/intraword, catch-all) and writes the matched piece to
     * {@code output}. Input that matches nothing is reported with
     * "Error in ..." and dropped.
     *
     * @see org.txm.importer.filters.Filter#filter()
     */
    void filter()
    {
        def found;

        while (!(line ==~ /\A\s*\Z/)) // as long as the line holds something other than whitespace
        {
            if ((found = line =~ /^([^<]*)$/)) // the line contains no tag at all
            {
                output.print(line); // plain text is copied through unchanged
                line = " "
            }
            else if ((found = line =~ /\A\s*(<\/?($extraword_tags)( [^>]*|\/)?>)(.*)\Z/)) // extraword tag (opening or closing)
            {
                tag = found[0][1]; // full tag text
                line = found[0][4]; // remainder still to process
                level = "extraword";
                output.print("\n$tag \n");
            }
            else if ((found = line =~ /\A\s*(<($corr_tags)( [^>]*|\/)?>)(.*)\Z/)) // opening correction tag
            {
                tag = found[0][1];
                String attributes = found[0][3]; // text between the tag name and '>', null when absent
                line = found[0][4];
                if (attributes != null && attributes.find("word_part"))
                {
                    // the correction wraps a part of a word: remember it and stay inline
                    word_part_tags.add(found[0][2]);
                    level = "intraword";
                    output.print(tag);
                }
                else
                {
                    level = "extraword";
                    output.print("\n$tag\n");
                }
            }
            else if ((found = line =~ /^\s*(<($word_tags)( [^>]*|\/)?>)(.*)$/)) // opening word tag
            {
                tag = found[0][1];
                line = found[0][4];
                // NOTE(review): this writes the remainder of the line, not the tag, and the
                // remainder is then processed again by the loop; the opening word tag itself
                // is never written. Behaviour kept as found - confirm whether
                // output.print(tag) was intended.
                output.write(line)
                level = "intraword";
            }
            else if ((found = line =~ /^\s*(<($intraword_tags)( [^>]*|\/)?>)(.*)$/)) // opening intraword tag
            {
                tag = found[0][1];
                line = found[0][4];
                output.print(tag);
                level = "intraword";
            }
            else if ((found = line =~ /^\s*(<\/($corr_tags|$word_tags|$intraword_tags)>)\s*(.*)$/)) // closing tag
            {
                tag = found[0][1];
                tag_name = found[0][2];
                line = found[0][3];

                // level is not modified before the second use below, so test it once
                boolean outsideWord = (level ==~ /extraword/);
                if (outsideWord)
                {
                    output.print("\n");
                }
                output.print(tag);

                if (tag_name ==~ /^($corr_tags|$word_tags)$/)
                {
                    if (outsideWord)
                    {
                        output.print("\n");
                    }
                    // pop the innermost open word_part correction, if there is one
                    if (!word_part_tags.isEmpty())
                    {
                        word_part_tags.remove(word_part_tags.size() - 1)
                    }
                    // once no word_part correction is open any more, leave inline mode
                    if (word_part_tags.isEmpty())
                    {
                        level = "extraword";
                    }
                }
            }
            else if ((found = line =~ /^\s*($tag_all)(.*)$/)) // any other tag
            {
                tag = found[0][1];
                line = found[0][2];
                output.print("\n$tag\n");
                unknowned.add(tag) // a Set: duplicates are ignored
            }
            else
            {
                println "Error in "+ line;
                line = " ";
            }
        }
    }
}