Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / filters / ReunitBrokenTags / ReunitBrokenTags.groovy @ 187

History | View | Annotate | Download (2.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.ReunitBrokenTags;
29

    
30
//Pre-processing extra-word tags (1)
31
import org.txm.importer.filters.*;
32
import java.util.regex.*;
33

    
34
// TODO: Auto-generated Javadoc
35
/**
 * Line filter that re-joins tags which were broken across several input
 * lines.
 *
 * Each call to {@link #filter()} consumes the current {@code line} piece by
 * piece. A piece that ends inside an unclosed tag ({@code <...} without
 * {@code >}) is written WITHOUT a trailing line separator, so whatever is
 * written for the following input line continues the same output line.
 *
 * NOTE(review): {@code line}, {@code output} and {@code lineSeparator} are
 * not declared in this class; presumably they are inherited from
 * {@code Filter} -- confirm against the base class.
 */
class ReunitBrokenTags extends Filter {

        /**
         * Number of segments that ended inside an unclosed tag. Reset by
         * {@link #before()}, incremented by {@link #filter()} and reported by
         * {@link #after()}.
         */
        int counter;

        /**
         * Number of tag-free segments written by {@link #filter()}. It is only
         * incremented in this class and never read here.
         */
        int printcounter = 0;

        /**
         * This filter takes no parameters; the argument is ignored.
         *
         * @param args unused
         * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
         */
        void SetUsedParam(Object args) {

        }

        /**
         * Resets the reunited-tag counter and prints a start message.
         *
         * NOTE(review): the method is declared {@code boolean} but has no
         * explicit {@code return}; the value handed back is whatever Groovy
         * derives from the last statement (a void call). Confirm what the
         * {@code Filter} base class expects before adding an explicit return.
         *
         * @see org.txm.importer.filters.Filter#before()
         */
        boolean before() {
                counter = 0;
                System.out.println("begin reunitbroken");
        }

        /**
         * Prints how many broken tags were counted by {@link #filter()}.
         *
         * @see org.txm.importer.filters.Filter#after()
         */
        void after()
        {
                println "$counter Reunited tags \n";
        }

        /**
         * Processes the current {@code line}: normalises some characters to a
         * space, then repeatedly cuts the leading piece off the remaining
         * segment and writes it to {@code output} until only whitespace is left.
         *
         * The order of the branches matters: each pattern is only tried when
         * all the previous ones failed.
         *
         * @see org.txm.importer.filters.Filter#filter()
         */
        void filter()
        {
                def m;

                // Two normalisation passes. The second pass must start from the
                // result of the first one; previously it restarted from `line`,
                // which silently discarded the first replacement.
                // NOTE(review): the characters inside these two patterns are
                // reproduced exactly as found in the file; confirm they are the
                // intended ones (e.g. tab / non-breaking space) -- if they are
                // plain spaces, both replacements are no-ops.
                def segment = (line =~ / /).replaceAll(" ");
                segment = (segment =~ / | | /).replaceAll(" ");

                // Loop until the remaining segment is empty or whitespace only.
                while (!(segment ==~ /\A\s*\Z/))
                {
                        if (segment ==~ /^[^<>]*$/) {
                                // No angle bracket at all: plain text. Collapse the
                                // leading whitespace to one space and end the output line.
                                segment = (segment =~ /^\s+/).replaceFirst(" ");
                                printcounter++;
                                output.write(segment + lineSeparator);
                                segment = " ";
                        }
                        else if (segment ==~ /^\s*([^<]*<[^>]+)$/) {
                                // The segment ends inside a tag that is opened but not
                                // closed. Collapse leading/trailing whitespace and write
                                // it without a line separator, so the next write
                                // continues on the same output line.
                                segment = (segment =~ /^\s+/).replaceFirst(" ");
                                segment = (segment =~ /\s+$/).replaceFirst(" ");
                                output.write(segment);
                                segment = " ";
                                counter++;
                        }
                        else if ((m = (segment =~ /^\s+([^<]*<[^>]+>)(.*)$/))) {
                                // Leading whitespace, then text up to and including a
                                // complete tag: emit it after a single space and keep
                                // going with the rest.
                                segment = m[0][2];
                                output.write(" " + m[0][1]);
                        }
                        else if ((m = (segment =~ /^([^< ]*<[^>]+>)(.*)$/))) {
                                // Same as above but with no whitespace before the tag:
                                // emit it as is.
                                segment = m[0][2];
                                output.write(m[0][1]);
                        }
                        else if ((m = (segment =~ /^\s*([^>]*)>(.*)$/))) {
                                // The segment starts with the tail of a tag (everything
                                // up to the first '>'): emit that tail after a single
                                // space and keep going with the rest.
                                segment = m[0][2];
                                output.write(" " + m[0][1] + ">");
                        }
                        else {
                                // None of the expected shapes matched: write the segment
                                // unchanged, report the problem and drop the remainder so
                                // the loop terminates.
                                output.write(segment + lineSeparator);
                                println "ERROR in tag restitution segment\n";
                                segment = " ";
                        }
                }
        }
}