Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / importer / CleanFile.groovy @ 966

History | View | Annotate | Download (4.6 kB)

1 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 881 mdecorde
// Sophia Antipolis, University of Paris 3.
5 881 mdecorde
//
6 881 mdecorde
// The TXM platform is free software: you can redistribute it
7 881 mdecorde
// and/or modify it under the terms of the GNU General Public
8 881 mdecorde
// License as published by the Free Software Foundation,
9 881 mdecorde
// either version 2 of the License, or (at your option) any
10 881 mdecorde
// later version.
11 881 mdecorde
//
12 881 mdecorde
// The TXM platform is distributed in the hope that it will be
13 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 881 mdecorde
// PURPOSE. See the GNU General Public License for more
16 881 mdecorde
// details.
17 881 mdecorde
//
18 881 mdecorde
// You should have received a copy of the GNU General
19 881 mdecorde
// Public License along with the TXM platform. If not, see
20 881 mdecorde
// http://www.gnu.org/licenses.
21 881 mdecorde
22 881 mdecorde
//
23 881 mdecorde
// This file is part of the TXM platform.
24 881 mdecorde
//
25 881 mdecorde
// The TXM platform is free software: you can redistribute it and/or modify
26 881 mdecorde
// it under the terms of the GNU General Public License as published by
27 881 mdecorde
// the Free Software Foundation, either version 3 of the License, or
28 881 mdecorde
// (at your option) any later version.
29 881 mdecorde
//
30 881 mdecorde
// The TXM platform is distributed in the hope that it will be useful,
31 881 mdecorde
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32 881 mdecorde
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33 881 mdecorde
// GNU General Public License for more details.
34 881 mdecorde
//
35 881 mdecorde
// You should have received a copy of the GNU General Public License
36 881 mdecorde
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37 881 mdecorde
//
38 881 mdecorde
//
39 881 mdecorde
//
40 881 mdecorde
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
41 881 mdecorde
// $LastChangedRevision: 3451 $
42 881 mdecorde
// $LastChangedBy: mdecorde $
43 881 mdecorde
//
44 881 mdecorde
package org.txm.importer;
45 881 mdecorde
46 881 mdecorde
import java.io.BufferedWriter;
47 881 mdecorde
import java.io.File;
48 881 mdecorde
49 881 mdecorde
import javax.xml.stream.XMLInputFactory;
50 881 mdecorde
import javax.xml.stream.XMLStreamReader
51 881 mdecorde
52 881 mdecorde
import org.txm.utils.io.IOUtils;
53 881 mdecorde
54 881 mdecorde
// TODO: Auto-generated Javadoc
55 881 mdecorde
/**
 * Utilities to strip unwanted characters from text: control characters,
 * characters outside the Basic Multilingual Plane (stored as surrogate pairs
 * in Java strings) and, for {@link #clean(String)}, tabulations.
 *
 * Instantiating the class cleans a file in place (see the constructor).
 */
class CleanFile {

	/**
	 * Cleans a text file in place: every line is passed through
	 * {@link #clean(String)} and written to a temporary file created in the
	 * same directory, which then replaces the original file.
	 *
	 * The result is always written in UTF-8 with "\n" line endings, whatever
	 * the encoding and line endings of the input were.
	 *
	 * @param infile the file to process; replaced by its cleaned version
	 * @param encoding the encoding used to read the file
	 * @throws IOException if the file cannot be read, the temporary file cannot
	 *         be written, or the original file cannot be replaced
	 */
	public CleanFile(File infile, String encoding) {
		File outfile = null;
		boolean copied = false;
		// Open the raw streams first and close them in finally blocks so that
		// nothing leaks if the encoding is unsupported or an I/O error occurs.
		InputStream input = new FileInputStream(infile);
		try {
			outfile = File.createTempFile("filecleaner", "123456", infile.getParentFile());
			OutputStream output = new FileOutputStream(outfile);
			try {
				BufferedReader reader = new BufferedReader(new InputStreamReader(input, encoding));
				Writer writer = new BufferedWriter(new OutputStreamWriter(output, "UTF-8"));
				String line = reader.readLine();
				while (line != null) {
					writer.write(clean(line) + "\n");
					line = reader.readLine();
				}
				// push buffered characters to the stream before it is closed
				writer.flush();
				copied = true;
			} finally {
				output.close();
			}
		} finally {
			input.close();
			// do not leave a half-written temporary file behind on failure
			if (!copied && outfile != null) {
				outfile.delete();
			}
		}

		// Replace the original by the cleaned copy; fail loudly instead of
		// silently losing data when one of the two steps does not succeed.
		if (!infile.delete()) { // erase old file
			outfile.delete();
			throw new IOException("Could not delete " + infile + " to replace it with its cleaned copy");
		}
		if (!outfile.renameTo(infile)) { // rename the copy
			throw new IOException("Could not rename " + outfile + " to " + infile + "; the cleaned content is in " + outfile);
		}
	}

	/** All control characters except \n and \t. */
	def static ctrl_reg = /[\p{Cntrl}&&[^\n\t]]/

	// Characters that are not punctuation and lie outside \u0001-\uFFFF.
	// Written with \P{P} rather than a leading '^' so that the meaning does not
	// depend on how the regex engine scopes a negation over the '&&' operator.
	def static surrogate = "[\\P{P}&&[^\\u0001-\\uFFFF]]"
	def static surrogate_reg = /$surrogate/
	// Punctuation characters that lie outside \u0001-\uFFFF.
	def static surrogate_punct = "[\\p{P}&&[^\\u0001-\\uFFFF]]"
	def static surrogate_punct_reg = /$surrogate_punct/
	/** Replacement for non-punctuation characters matched by surrogate_reg. */
	def static SURROGATE = "_"
	/** Replacement for punctuation characters matched by surrogate_punct_reg. */
	def static SURROGATE_PUNCT = ""
	def static EMPTY = ""
	def static SPACE = " "
	def static TAB = "\t"

	/**
	 * Removes control characters (except \n and \t) and replaces the characters
	 * matched by the surrogate patterns: non-punctuation ones become "_",
	 * punctuation ones are dropped. Tabulations are kept.
	 *
	 * @param str the string to clean
	 * @return the cleaned string
	 */
	public static String cleanAllButTabs(String str) {
		str = str.replaceAll(ctrl_reg, EMPTY); // remove all ctrl chars
		str = str.replaceAll(surrogate_reg, SURROGATE); // replace all surrogate chars
		str = str.replaceAll(surrogate_punct_reg, SURROGATE_PUNCT); // replace all surrogate punct chars
		return str;
	}

	/**
	 * Same as {@link #cleanAllButTabs(String)}, but every tabulation is first
	 * replaced by a single space.
	 *
	 * @param str the string to clean
	 * @return the cleaned string
	 */
	public static String clean(String str) {
		str = str.replace(TAB, SPACE); // replace \t by ' '
		return cleanAllButTabs(str);
	}

	/**
	 * Reads an XML file using the encoding reported by an XML stream reader,
	 * applies {@link #clean(String)} to its whole content (so tabulations and
	 * control characters are processed too, not only surrogates) and writes
	 * the result to another file.
	 *
	 * @param file the XML file to read
	 * @param outputFile the file receiving the cleaned text
	 */
	public static void removeSurrogateFromXmlFile(File file, File outputFile) {
		String fileEncoding = null;
		// Hand the parser the raw bytes (not a Reader) so that it can detect the
		// encoding itself, and release both the parser and the stream afterwards.
		InputStream input = new FileInputStream(file);
		try {
			XMLStreamReader xmlStreamReader = XMLInputFactory.newInstance().createXMLStreamReader(input);
			try {
				fileEncoding = xmlStreamReader.getEncoding();
				if (fileEncoding == null) {
					// fall back to the encoding declared in the XML declaration
					fileEncoding = xmlStreamReader.getCharacterEncodingScheme();
				}
			} finally {
				xmlStreamReader.close();
			}
		} finally {
			input.close();
		}
		if (fileEncoding == null) {
			// XML default when nothing is declared or detected
			fileEncoding = "UTF-8";
		}
		String text = IOUtils.getText(file, fileEncoding)
		text = CleanFile.clean(text);
		IOUtils.write(outputFile, text)
	}

	/**
	 * Command line entry point: cleans one file in place.
	 *
	 * @param args optional: args[0] is the path of the file to clean, args[1]
	 *        the encoding used to read it; the historical test values are used
	 *        for whichever argument is missing
	 */
	public static void main(String[] args) {
		String path = (args != null && args.length > 0) ? args[0] : "C:\\Documents and Settings\\H\\xml\\cleaner\\test.txt"
		String encoding = (args != null && args.length > 1) ? args[1] : "CP1252";
		File infile = new File(path)
		new CleanFile(infile, encoding);
	}
}