Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / filters / WordInternalElement / WordInternalElement.groovy @ 1000

History | View | Annotate | Download (2.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.scripts.filters.WordInternalElement;
29

    
30
import org.txm.importer.scripts.filters.*;
31
import java.util.regex.*;
32
import org.txm.tokenizer.*;
33

    
34
// TODO: Auto-generated Javadoc
35
/**
 * Line filter that wraps "word_part" elements in a
 * {@code <seg type="word_part">} element.
 *
 * For every input line, each element whose name matches
 * {@link #corr_tags_no_seg}, whose opening tag carries attributes containing
 * the text {@code word_part}, and whose content is plain text (no nested
 * markup) is rewritten as
 * {@code <seg type="word_part"><tag ...>text</tag></seg>}; whitespace
 * surrounding the text content is removed. Lines are then written to the
 * filter output, whether they were modified or not.
 */
class WordInternalElement extends Filter {

        /** Number of lines modified by {@link #filter()} since {@link #before()} was called. */
        int counter;

        /** The current line as it was before the substitution performed in {@link #filter()}. */
        def old;

        /**
         * Regular-expression fragment matching the element names to process.
         * Defaults to TokenizerClasses.corr_tags_no_seg and can be overridden
         * through {@link #SetUsedParam(Object)}.
         * NOTE(review): presumably an alternation of tag names ("a|b|c") without
         * capturing groups; a capturing group in it would shift the group
         * numbers used by filter() - verify against TokenizerClasses.
         */
        String corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;

        /**
         * Reads the filter parameters.
         *
         * @param args expected to be a Map holding the key "corr_tags_no_seg";
         *        when the key is absent (or its value is null) the current
         *        value is kept. Any failure while reading the argument is
         *        reported on System.err and otherwise ignored.
         *
         * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
         */
        void SetUsedParam(Object args)
        {
                try
                {
                        def tags = args.get("corr_tags_no_seg");
                        // Keep the default rather than replacing it with null,
                        // which would end up as the text "null" in the regex.
                        if (tags != null)
                                corr_tags_no_seg = tags;
                }
                catch(Exception e)
                {
                        System.err.println(e);
                        System.err.println("wordinternal needs 1 Map with arg  :\n corr_tags_no_seg")
                }
        }

        /**
         * Resets the modified-line counter and prints a start message.
         *
         * NOTE(review): the method is declared boolean but has no explicit
         * return statement, so the result is whatever Groovy derives from the
         * trailing println call. Confirm what Filter expects here before
         * adding an explicit return value.
         *
         * @see org.txm.importer.filters.Filter#before()
         */
        boolean before()
        {
                counter = 0;
                println "begin wordinternal \n";
        }

        /**
         * Prints the number of lines that were modified.
         *
         * @see org.txm.importer.filters.Filter#after()
         */
        void after()
        {
                println "Deleted $counter wordinternalspaces \n";
        }

        /**
         * Processes the current line: wraps every matching word_part element
         * in a seg element, counts the line if it changed, and writes it out.
         *
         * Uses line, output and lineSeparator, which are not declared in this
         * class (presumably provided by Filter).
         *
         * @see org.txm.importer.filters.Filter#filter()
         */
        void filter()
        {
                old = line;

                // ${corr_tags_no_seg} is interpolated so that the configured tag
                // names are actually used (the previous pattern contained the
                // literal text "corr_tags_no_seg").
                // Groups: 1 = opening tag, 2 = tag name, 3 = text content
                // (surrounding whitespace excluded), 4 = closing tag.
                def wordPart = /(<(${corr_tags_no_seg}) [^>]*word_part[^>]*>)\s*([^<]*?)\s*(<\/\2>)/;
                line = (line =~ wordPart).replaceAll("<seg type=\"word_part\">\$1\$3\$4<\\/seg>");

                // Counts modified lines, not individual replacements.
                if (old != line)
                        counter++;

                output.write(line+lineSeparator);
        }
}