Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / CleanFile.groovy @ 625

History | View | Annotate | Download (4.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
// 
38
// 
39
// 
40
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
41
// $LastChangedRevision: 3451 $
42
// $LastChangedBy: mdecorde $ 
43
//
44
package org.txm.importer;
45

    
46
import java.io.BufferedWriter;
47
import java.io.File;
48

    
49
import javax.xml.stream.XMLInputFactory;
50
import javax.xml.stream.XMLStreamReader
51

    
52
import org.txm.utils.io.IOUtils;
53

    
54
// TODO: Auto-generated Javadoc
55
/**
 * Text-cleaning helpers: remove control characters and characters outside
 * the Basic Multilingual Plane ("surrogate" characters) from strings, and
 * rewrite a file in place with every line cleaned.
 */
class CleanFile {

    /**
     * Cleans a file in place. Each line of {@code infile} is read with the
     * given encoding, passed through {@link #clean(String)}, and written
     * (followed by {@code "\n"}) as UTF-8 to a temporary file created next to
     * {@code infile}; the temporary file then replaces {@code infile}.
     *
     * @param infile the file to process; it is replaced by its cleaned copy
     * @param encoding the charset name used to read {@code infile}
     * @throws IOException if the file cannot be read, the temporary copy
     *         cannot be written, or the original cannot be replaced
     */
    public CleanFile(File infile, String encoding) {
        File outfile = null;
        boolean replaced = false;
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(infile), encoding));
            try {
                // Go through the absolute path so that a bare relative file name
                // still gets its temporary copy in the same directory (a null
                // parent would send the copy to the system temp directory).
                outfile = File.createTempFile("filecleaner", "123456", infile.getAbsoluteFile().getParentFile());
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        writer.write(clean(line));
                        writer.write("\n");
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }

            // Replace the original only once the cleaned copy is complete.
            // Unlike delete() + renameTo(), this reports failure with an
            // exception instead of silently losing the original file.
            java.nio.file.Files.move(outfile.toPath(), infile.toPath(),
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
            replaced = true;
        } finally {
            // Do not leave a half-written temporary file behind on failure.
            if (!replaced && outfile != null) {
                outfile.delete();
            }
        }
    }

    /** Regex matching every control character except newline and tab. */
    def static ctrl_reg = /[\p{Cntrl}&&[^\n\t]]/

    /**
     * Regex source matching non-punctuation characters outside
     * {@code \u0001-\uFFFF}. The negation of {@code \p{P}} is nested in its
     * own class on purpose: a leading {@code ^} directly before {@code &&}
     * is applied to the whole intersection on Java 9 and later, which would
     * make the pattern match almost every character.
     */
    def static surrogate = "[[^\\p{P}]&&[^\\u0001-\\uFFFF]]"
    def static surrogate_reg = /$surrogate/
    /** Regex source matching punctuation characters outside {@code \u0001-\uFFFF}. */
    def static surrogate_punct = "[\\p{P}&&[^\\u0001-\\uFFFF]]"
    def static surrogate_punct_reg = /$surrogate_punct/
    /** Replacement for non-punctuation characters matched by {@code surrogate_reg}. */
    def static SURROGATE = "_"
    /** Replacement for punctuation characters matched by {@code surrogate_punct_reg}. */
    def static SURROGATE_PUNCT = ""
    def static EMPTY = ""
    def static SPACE = " "
    def static TAB = "\t"

    /**
     * Removes control characters (except newline and tab) and replaces
     * characters outside {@code \u0001-\uFFFF}; tabs are kept as they are.
     *
     * @param str the string to clean
     * @return the cleaned string
     */
    public static String cleanAllButTabs(String str) {
        str = str.replaceAll(ctrl_reg, EMPTY); // remove all ctrl chars but \n and \t
        str = str.replaceAll(surrogate_reg, SURROGATE); // non-punctuation surrogates become "_"
        str = str.replaceAll(surrogate_punct_reg, SURROGATE_PUNCT); // punctuation surrogates are dropped
        return str;
    }

    /**
     * Replaces tabs by spaces, removes control characters (except newline)
     * and replaces characters outside {@code \u0001-\uFFFF}.
     *
     * @param str the string to clean
     * @return the cleaned string
     */
    public static String clean(String str) {
        str = str.replace(TAB, SPACE); // replace \t by ' '
        str = str.replaceAll(ctrl_reg, EMPTY); // remove all remaining ctrl chars but \n
        str = str.replaceAll(surrogate_reg, SURROGATE); // non-punctuation surrogates become "_"
        str = str.replaceAll(surrogate_punct_reg, SURROGATE_PUNCT); // punctuation surrogates are dropped
        return str;
    }

    /**
     * Reads an XML file using the encoding reported by an XML stream reader,
     * applies {@link #clean(String)} to its whole text and writes the result
     * to {@code outputFile}.
     *
     * @param file the XML file to read
     * @param outputFile the file that receives the cleaned text
     */
    public static void removeSurrogateFromXmlFile(File file, File outputFile) {
        String fileEncoding = null;
        Reader input = new FileReader(file);
        try {
            XMLStreamReader xmlStreamReader = XMLInputFactory.newInstance().createXMLStreamReader(input);
            try {
                fileEncoding = xmlStreamReader.getEncoding();
                if (fileEncoding == null) {
                    // The input encoding may be unknown for a Reader source;
                    // fall back to the encoding declared in the XML declaration.
                    fileEncoding = xmlStreamReader.getCharacterEncodingScheme();
                }
            } finally {
                // XMLStreamReader.close() does not close the underlying reader,
                // which is closed separately below.
                xmlStreamReader.close();
            }
        } finally {
            input.close();
        }

        String text = IOUtils.getText(file, fileEncoding)
        text = CleanFile.clean(text);
        IOUtils.write(outputFile, text)
    }

    /**
     * The main method. Cleans one file in place.
     *
     * @param args optional: {@code args[0]} is the path of the file to clean,
     *        {@code args[1]} its encoding; the historical test path and
     *        CP1252 are used when they are not given
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : "C:\\Documents and Settings\\H\\xml\\cleaner\\test.txt"
        String encoding = (args != null && args.length > 1) ? args[1] : "CP1252";
        new CleanFile(new File(path), encoding);
    }
}