Revision 625 tmp/org.txm.groovy.core/src/groovy/org/txm/importer/CleanFile.groovy

CleanFile.groovy (revision 625)
75 75
		outfile.renameTo(infile); // rename the copy
76 76
	}
77 77

  
78
	def static ctrl_reg = /\p{Cntrl}/
78
	def static ctrl_reg = /[\p{Cntrl}&&[^\n\t]]/ // all ctrl chars except \n and \t
79
	
79 80
	def static surrogate = "[^\\p{P}&&[^\\u0001-\\uFFFF]]"
80 81
	def static surrogate_reg = /$surrogate/
81 82
	def static surrogate_punct = "[\\p{P}&&[^\\u0001-\\uFFFF]]"
......
85 86
	def static EMPTY = ""
86 87
	def static SPACE = " "
87 88
	def static TAB = "\t"
89
	
90
	/**
91
	 * Removes control characters (other than \n and \t, per ctrl_reg) and replaces surrogate characters in a String.
92
	 *
93
	 * @param str the string to clean
94
	 * @return the cleaned string
95
	 */
96
	public static String cleanAllButTabs(String str) {
97
		str = str.replaceAll(ctrl_reg, EMPTY); // drop every ctrl_reg match (ctrl chars except \n and \t)
98
		str = str.replaceAll(surrogate_reg, SURROGATE); // replace every surrogate_reg match with SURROGATE
99
		str = str.replaceAll(surrogate_punct_reg, SURROGATE_PUNCT); // replace every surrogate_punct_reg match with SURROGATE_PUNCT
100
		return str;
101
	}
88 102
	
89 103
	/**
90 104
	 * Remove ctrl and surrogates chars from a String.

Also available in: Unified diff