Revision 479 tmp/org.txm.groovy.core/src/groovy/org/txm/importer/CleanFile.groovy

CleanFile.groovy (revision 479)
37 37
// 
38 38
// 
39 39
// 
40
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun., 06 mai 2013) $
41
// $LastChangedRevision: 2386 $
40
// $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
41
// $LastChangedRevision: 3451 $
42 42
// $LastChangedBy: mdecorde $ 
43 43
//
44 44
package org.txm.importer;
45 45

  
46 46
import java.io.BufferedWriter;
47 47
import java.io.File;
48

  
49
import javax.xml.stream.XMLInputFactory;
50
import javax.xml.stream.XMLStreamReader
51

  
52
import org.txm.utils.io.IOUtils;
48 53

  
49 54
// TODO: Auto-generated Javadoc
50 55
/**
......
59 64
		File outfile = File.createTempFile("filecleaner","123456", infile.getParentFile());
60 65
		Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
61 66
		String line = reader.readLine();
62
		while(line != null)
63
		{
67
		while(line != null) {
64 68
			writer.write(clean(line) +"\n")
65 69
			line = reader.readLine();
66 70
		}
......
70 74
		infile.delete(); // erase old file
71 75
		outfile.renameTo(infile); // rename the copy
72 76
	}
73

  
77

  
78
	// Pattern matching any single control character (POSIX class \p{Cntrl}).
	def static ctrl_reg = /\p{Cntrl}/
	// Character class: code points that are NOT punctuation and lie outside \u0001-\uFFFF.
	// Spelled with \P{P} (the complement of \p{P}) instead of a leading '^' so that the
	// meaning does not depend on how class negation combines with the '&&' intersection.
	// NOTE(review): JDK 9 changed how '^' applies to nested character classes, so the
	// previous "[^\p{P}&&[...]]" spelling may not select the same set on every runtime - confirm.
	def static surrogate = "[\\P{P}&&[^\\u0001-\\uFFFF]]"
	def static surrogate_reg = /$surrogate/
	// Character class: punctuation code points that lie outside \u0001-\uFFFF.
	def static surrogate_punct = "[\\p{P}&&[^\\u0001-\\uFFFF]]"
	def static surrogate_punct_reg = /$surrogate_punct/
	// Replacement written in place of each character matched by surrogate_reg.
	def static SURROGATE = "_"
	// Replacement written in place of each character matched by surrogate_punct_reg.
	def static SURROGATE_PUNCT = "☒"
	def static EMPTY = ""
	def static SPACE = " "
	def static TAB = "\t"
88
	
74 89
	/**
	 * Strips control characters and substitutes surrogate characters in a String.
	 *
	 * The steps run in this order: tabs become spaces, characters matching
	 * ctrl_reg are dropped, characters matching surrogate_reg become SURROGATE,
	 * and characters matching surrogate_punct_reg become SURROGATE_PUNCT.
	 *
	 * @param str the text to clean
	 * @return the cleaned string
	 */
	public static String clean(String str) {
		// The tab replacement must come first: the control-character pass would otherwise delete tabs
		return str.replace(TAB, SPACE)
				.replaceAll(ctrl_reg, EMPTY)
				.replaceAll(surrogate_reg, SURROGATE)
				.replaceAll(surrogate_punct_reg, SURROGATE_PUNCT);
	}
102
	
103
	/**
	 * Reads an XML file using the encoding reported by an XML stream parser,
	 * passes the whole text through CleanFile.clean and writes the result to outputFile.
	 *
	 * NOTE(review): clean() removes every \p{Cntrl} character, and that class
	 * covers line terminators, so the text written out has no line breaks - confirm
	 * this is acceptable for the XML files processed here.
	 *
	 * @param file the XML file to read
	 * @param outputFile the file receiving the cleaned text
	 */
	public static void removeSurrogateFromXmlFile(File file, File outputFile) {
		String fileEncoding = null;
		// Give the parser raw bytes: a FileReader would already have decoded the file
		// with the platform charset before the parser could inspect its encoding.
		InputStream input = new FileInputStream(file);
		try {
			XMLStreamReader xmlStreamReader = XMLInputFactory.newInstance().createXMLStreamReader(input);
			try {
				fileEncoding = xmlStreamReader.getEncoding();
				if (fileEncoding == null) {
					// fall back on the encoding declared in the XML declaration, if any
					fileEncoding = xmlStreamReader.getCharacterEncodingScheme();
				}
			} finally {
				// closing the stream reader does not close the underlying input source
				xmlStreamReader.close();
			}
		} finally {
			input.close();
		}
		if (fileEncoding == null) {
			// XML without an encoding declaration or byte order mark is UTF-8 by default
			fileEncoding = "UTF-8";
		}

		String text = IOUtils.getText(file, fileEncoding);
		text = CleanFile.clean(text);
		IOUtils.write(outputFile, text);
	}
85 110

  
86 111
	/**

Also available in: Unified diff