Révision 2967
tmp/org.txm.utils/src/org/txm/utils/AsciiUtils.java (revision 2967) | ||
---|---|---|
127 | 127 |
if (s.length() == 0) { |
128 | 128 |
return s; |
129 | 129 |
} |
130 |
|
|
130 |
|
|
131 | 131 |
// ensure the "w_" prefix presence |
132 | 132 |
if (s.startsWith("w")) { |
133 | 133 |
if (!s.startsWith("w_")) { |
... | ... | |
136 | 136 |
} else { |
137 | 137 |
s = "w_" + s; |
138 | 138 |
} |
139 |
//System.out.println("first="+s); |
|
139 |
// System.out.println("first="+s);
|
|
140 | 140 |
|
141 |
String rez = convertNonAscii(s);//.toLowerCase(); |
|
142 |
//System.out.println("nonasscii="+rez); |
|
141 |
String rez = convertNonAscii(s);// .toLowerCase();
|
|
142 |
// System.out.println("nonasscii="+rez);
|
|
143 | 143 |
rez = rez.replaceAll("\\p{Space}++", "_"); |
144 |
//System.out.println("spaces="+rez); |
|
144 |
// System.out.println("spaces="+rez);
|
|
145 | 145 |
rez = rez.replaceAll("[¤€§µ£°().,;:/?!@§%\\\\\"’ʹ'*+\\-}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$ // "[^\\P{P}_]" |
146 |
//System.out.println("ponc="+rez); |
|
146 |
// System.out.println("ponc="+rez);
|
|
147 | 147 |
|
148 | 148 |
return rez; |
149 | 149 |
} |
... | ... | |
172 | 172 |
return s; |
173 | 173 |
} |
174 | 174 |
String rez = s.trim(); |
175 |
s = s.replaceAll("\\p{Space}++", "_");
|
|
175 |
rez = rez.replaceAll("\\p{Space}++", "_");
|
|
176 | 176 |
rez = rez.replaceAll("_", "-"); |
177 | 177 |
rez = convertNonAscii(rez).toLowerCase(); |
178 |
|
|
178 |
|
|
179 | 179 |
rez = rez.replaceAll("[¤€§µ£°().,;:/?!@§%\\\\\"’ʹ'*+\\}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$ |
180 | 180 |
// remove first chars if number |
181 | 181 |
char c = rez.charAt(0); |
... | ... | |
199 | 199 |
public static void main(String args[]) { |
200 | 200 |
String s = "01The result : - - _ тврьдо È,É,Ê,Ë,Û,Ù,Ï,Î,À,Â,Ô,è,é,ê,ë,û,ù,ï,î,à,â,ô,ç 0 1 2 3 4 5 6 7 8 9 10"; //$NON-NLS-1$ |
201 | 201 |
System.out.println(AsciiUtils.convertNonAscii(s)); |
202 |
//System.out.println(AsciiUtils.buildId(s)); |
|
202 |
// System.out.println(AsciiUtils.buildId(s));
|
|
203 | 203 |
String s2 = "w_ТВРЬДОтврьдо_123&é\"'(-è_çà)=/*-+~#{[|`\\^@]}¤;:!§/.?µ%£°"; |
204 |
System.out.println("nonascii="+AsciiUtils.convertNonAscii(s2));
|
|
205 |
System.out.println("word_id="+AsciiUtils.buildWordId(s2));
|
|
206 |
System.out.println("attribute_id="+AsciiUtils.buildAttributeId(s2));
|
|
204 |
System.out.println("nonascii=" + AsciiUtils.convertNonAscii(s2));
|
|
205 |
System.out.println("word_id=" + AsciiUtils.buildWordId(s2));
|
|
206 |
System.out.println("attribute_id=" + AsciiUtils.buildAttributeId(s2));
|
|
207 | 207 |
// output : |
208 | 208 |
// The result : E,E,E,E,U,U,I,I,A,A,O,e,e,e,e,u,u,i,i,a,a,o,c |
209 | 209 |
|
Formats disponibles : Unified diff