root / tmp / org.txm.core / src / java / org / txm / scripts / importer / CleanFile.groovy @ 2473
History | View | Annotate | Download (4.8 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
|
22 |
//
|
23 |
// This file is part of the TXM platform.
|
24 |
//
|
25 |
// The TXM platform is free software: you can redistribute it and/or modify
|
26 |
// it under the terms of the GNU General Public License as published by
|
27 |
// the Free Software Foundation, either version 3 of the License, or
|
28 |
// (at your option) any later version.
|
29 |
//
|
30 |
// The TXM platform is distributed in the hope that it will be useful,
|
31 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 |
// GNU General Public License for more details.
|
34 |
//
|
35 |
// You should have received a copy of the GNU General Public License
|
36 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 |
//
|
38 |
//
|
39 |
//
|
40 |
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
|
41 |
// $LastChangedRevision: 3451 $
|
42 |
// $LastChangedBy: mdecorde $
|
43 |
//
|
44 |
package org.txm.scripts.importer;
|
45 |
|
46 |
import java.io.BufferedWriter; |
47 |
import java.io.File; |
48 |
|
49 |
import javax.xml.stream.XMLInputFactory; |
50 |
import javax.xml.stream.XMLStreamReader |
51 |
|
52 |
import org.txm.utils.io.IOUtils; |
53 |
|
54 |
// TODO: Auto-generated Javadoc
|
55 |
/**
 * Cleans text so that it can be processed safely by later import steps:
 * control characters are removed, tabulations are (optionally) replaced by
 * spaces, and characters outside the U+0001-U+FFFF range (stored as surrogate
 * pairs in Java strings) are replaced by placeholder characters.
 *
 * Instantiating the class cleans a whole file in place; the static methods
 * clean a single String.
 */
class CleanFile {

	/**
	 * Cleans the content of a file in place, line by line.
	 *
	 * The file is read with the given encoding, each line goes through
	 * {@link #clean(String)}, and the result is written in UTF-8 to a temporary
	 * file created next to the original. The temporary file then replaces the
	 * original. Every output line is terminated by "\n".
	 *
	 * @param infile the file to process; it is overwritten by its cleaned version
	 * @param encoding the encoding used to read the file
	 * @throws IOException if the file cannot be read, written or replaced; the
	 *         original file is left untouched in that case
	 */
	public CleanFile(File infile, String encoding) {
		Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(infile), encoding));
		File outfile = null;
		boolean replaced = false;
		try {
			outfile = File.createTempFile("filecleaner", "123456", infile.getParentFile());
			Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
			try {
				String line = reader.readLine();
				while (line != null) {
					writer.write(clean(line) + "\n");
					line = reader.readLine();
				}
			} finally {
				// always flush and release the output, even if a read or write failed
				writer.close();
			}
			// release the input before replacing it (needed on platforms that lock open files)
			reader.close();

			// Replace the original in one step instead of delete() + renameTo():
			// the original is only lost once the cleaned copy has taken its place,
			// and a failure is reported as an IOException instead of being ignored.
			java.nio.file.Files.move(outfile.toPath(), infile.toPath(),
					java.nio.file.StandardCopyOption.REPLACE_EXISTING);
			replaced = true;
		} finally {
			reader.close(); // harmless if already closed
			if (!replaced && outfile != null) {
				outfile.delete(); // do not leave a stray temporary file behind
			}
		}
	}

	/** Matches every control character except line feed (\n) and tabulation (\t). */
	def static ctrl_reg = /[\p{Cntrl}&&[^\n\t]]/

	// The non-punctuation class is written with \P{P} rather than a leading '^':
	// the combination of '^' and '&&' inside one character class is interpreted
	// differently by Java 8 and Java 9+, and on Java 9+ the former pattern
	// matched almost every character.
	/** Character class: non-punctuation characters outside the U+0001-U+FFFF range. */
	def static surrogate = "[\\P{P}&&[^\\u0001-\\uFFFF]]"
	/** Regular expression built from {@link #surrogate}. */
	def static surrogate_reg = /$surrogate/
	/** Character class: punctuation characters outside the U+0001-U+FFFF range. */
	def static surrogate_punct = "[\\p{P}&&[^\\u0001-\\uFFFF]]"
	/** Regular expression built from {@link #surrogate_punct}. */
	def static surrogate_punct_reg = /$surrogate_punct/
	/** Replacement for characters matched by {@link #surrogate_reg}. */
	def static SURROGATE = "_"
	/** Replacement for characters matched by {@link #surrogate_punct_reg}. */
	def static SURROGATE_PUNCT = "☒"
	def static EMPTY = ""
	def static SPACE = " "
	def static TAB = "\t"

	/**
	 * Removes control characters and replaces surrogate characters in a String,
	 * keeping tabulations and line feeds.
	 *
	 * Non-punctuation characters outside U+0001-U+FFFF become {@link #SURROGATE},
	 * punctuation characters outside that range become {@link #SURROGATE_PUNCT}.
	 *
	 * @param str the string to clean
	 * @return the cleaned string
	 */
	public static String cleanAllButTabs(String str) {
		str = str.replaceAll(ctrl_reg, EMPTY); // remove all ctrl chars but \n and \t
		str = str.replaceAll(surrogate_reg, SURROGATE); // replace all surrogate chars
		str = str.replaceAll(surrogate_punct_reg, SURROGATE_PUNCT); // replace all surrogate punct chars
		return str;
	}

	/**
	 * Replaces tabulations by spaces, then removes control characters and
	 * replaces surrogate characters as {@link #cleanAllButTabs(String)} does.
	 *
	 * @param str the string to clean
	 * @return the cleaned string
	 */
	public static String clean(String str) {
		// replace \t by ' ' first, the remaining steps are shared with cleanAllButTabs
		return cleanAllButTabs(str.replace(TAB, SPACE));
	}

	/**
	 * Reads an XML file, cleans its whole content with {@link #clean(String)}
	 * (so tabulations and control characters are affected too, not only
	 * surrogate characters) and writes the result to another file.
	 *
	 * The encoding used to read the file is the one reported by the XML parser
	 * (detected from the byte stream or declared in the XML declaration);
	 * "UTF-8" is used when the parser reports none.
	 *
	 * @param file the XML file to read
	 * @param outputFile the file receiving the cleaned text, written with
	 *        IOUtils.write(File, String)
	 */
	public static void removeSurrogateFromXmlFile(File file, File outputFile) {
		String fileEncoding = null;
		// Give the parser the raw bytes: with a Reader the bytes are already decoded
		// (with the platform charset) and the parser cannot detect the encoding.
		InputStream input = new FileInputStream(file);
		try {
			XMLStreamReader xmlStreamReader = XMLInputFactory.newInstance().createXMLStreamReader(input);
			try {
				fileEncoding = xmlStreamReader.getEncoding();
				if (fileEncoding == null) {
					// fall back to the encoding declared in the XML declaration
					fileEncoding = xmlStreamReader.getCharacterEncodingScheme();
				}
			} finally {
				xmlStreamReader.close();
			}
		} finally {
			// XMLStreamReader.close() does not close the underlying stream
			input.close();
		}

		if (fileEncoding == null) fileEncoding = "UTF-8"; //$NON-NLS-1$

		String text = IOUtils.getText(file, fileEncoding)
		text = CleanFile.clean(text);
		IOUtils.write(outputFile, text)
	}

	/**
	 * Command line entry point: cleans one file in place.
	 *
	 * @param args optional arguments: args[0] is the path of the file to clean,
	 *        args[1] the encoding used to read it; the historical test values
	 *        are used for missing arguments
	 */
	public static void main(String[] args) {
		String path = "C:\\Documents and Settings\\H\\xml\\cleaner\\test.txt";
		String encoding = "CP1252";
		if (args != null && args.length > 0) path = args[0];
		if (args != null && args.length > 1) encoding = args[1];
		new CleanFile(new File(path), encoding);
	}
}