| 34 |
34 |
*
|
| 35 |
35 |
* FR:
|
| 36 |
36 |
*
|
| 37 |
|
* AsciiUtils.convertNonAscii(str) supprime les accents (note : cette méthode
|
| 38 |
|
* fonctionne avec une liste fermée de caractères accentués)
|
|
37 |
* AsciiUtils.convertNonAscii(str) supprime les accents
|
| 39 |
38 |
*
|
| 40 |
39 |
* AsciiUtils.removePunct(str) supprime les ponctuations
|
| 41 |
40 |
*
|
| ... | ... | |
| 76 |
75 |
private AsciiUtils() {
|
| 77 |
76 |
}
|
| 78 |
77 |
|
| 79 |
|
public static Transliterator asciiFormmater = Transliterator.getInstance("Any-Latin; NFD; [^\\p{Alnum}] Remove");
|
|
78 |
public static Transliterator asciiFormmater = Transliterator.getInstance("Any-Latin; NFD; [^\\p{Alnum}\\p{p}] Remove");
|
| 80 |
79 |
|
| 81 |
80 |
// remove accented characters from a string and replace them with their ASCII equivalents
|
| 82 |
81 |
/**
|
| 83 |
|
* Convert non ascii.
|
|
82 |
* Converts non-ASCII characters. Warning: punctuation is not removed.
|
| 84 |
83 |
*
|
| 85 |
84 |
* @param s
|
| 86 |
85 |
* the string to convert
|
| ... | ... | |
| 126 |
125 |
*/
|
| 127 |
126 |
public static String buildWordId(String s) {
|
| 128 |
127 |
|
| 129 |
|
if (s.length() == 0)
|
|
128 |
if (s.length() == 0) {
|
| 130 |
129 |
return s;
|
| 131 |
|
|
|
130 |
}
|
|
131 |
|
| 132 |
132 |
// ensure the "w_" prefix is present
|
| 133 |
133 |
if (s.startsWith("w")) {
|
| 134 |
134 |
if (!s.startsWith("w_")) {
|
| ... | ... | |
| 137 |
137 |
} else {
|
| 138 |
138 |
s = "w_" + s;
|
| 139 |
139 |
}
|
|
140 |
//System.out.println("first="+s);
|
| 140 |
141 |
|
| 141 |
142 |
String rez = convertNonAscii(s).toLowerCase();
|
|
143 |
//System.out.println("nonasscii="+rez);
|
| 142 |
144 |
rez = rez.replaceAll("\\p{Space}++", "_");
|
| 143 |
|
rez = rez.replaceAll("[¤€§µ£°().,;:/?§%\"’'*+\\-}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$
|
|
145 |
//System.out.println("spaces="+rez);
|
|
146 |
rez = rez.replaceAll("[¤€§µ£°().,;:/?!@§%\\\\\"’ʹ'*+\\-}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$ // "[^\\P{P}_]"
|
|
147 |
//System.out.println("ponc="+rez);
|
| 144 |
148 |
|
| 145 |
149 |
return rez;
|
| 146 |
150 |
}
|
| ... | ... | |
| 172 |
176 |
String rez = convertNonAscii(s).toLowerCase();
|
| 173 |
177 |
rez = rez.replaceAll("\\p{Space}++", "_");
|
| 174 |
178 |
rez = rez.replaceAll("_", "-");
|
| 175 |
|
rez = rez.replaceAll("[¤€§µ£°().,;:/?§%\"’'*+\\}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$
|
|
179 |
rez = rez.replaceAll("[¤€§µ£°().,;:/?!@§%\\\\\"’ʹ'*+\\}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$
|
| 176 |
180 |
// remove leading characters while they are digits
|
| 177 |
181 |
char c = rez.charAt(0);
|
| 178 |
182 |
while (c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || c == '5' || c == '6' || c == '7' || c == '8' || c == '9') {
|
| ... | ... | |
| 193 |
197 |
* the arguments
|
| 194 |
198 |
*/
|
| 195 |
199 |
public static void main(String args[]) {
|
| 196 |
|
String s = "01The result : È,É,Ê,Ë,Û,Ù,Ï,Î,À,Â,Ô,è,é,ê,ë,û,ù,ï,î,à,â,ô,ç 0 1 2 3 4 5 6 7 8 9 10"; //$NON-NLS-1$
|
|
200 |
String s = "01The result : _ тврьдо È,É,Ê,Ë,Û,Ù,Ï,Î,À,Â,Ô,è,é,ê,ë,û,ù,ï,î,à,â,ô,ç 0 1 2 3 4 5 6 7 8 9 10"; //$NON-NLS-1$
|
| 197 |
201 |
System.out.println(AsciiUtils.convertNonAscii(s));
|
| 198 |
|
System.out.println(AsciiUtils.buildId(s));
|
|
202 |
//System.out.println(AsciiUtils.buildId(s));
|
|
203 |
String s2 = "w_тврьдо_123&é\"'(-è_çà)=/*-+~#{[|`\\^@]}¤;:!§/.?µ%£°";
|
|
204 |
System.out.println("nonascii="+AsciiUtils.convertNonAscii(s2));
|
|
205 |
System.out.println("word_id="+AsciiUtils.buildWordId(s2));
|
|
206 |
System.out.println("attribute_id="+AsciiUtils.buildAttributeId(s2));
|
| 199 |
207 |
// output :
|
| 200 |
208 |
// The result : E,E,E,E,U,U,I,I,A,A,O,e,e,e,e,u,u,i,i,a,a,o,c
|
| 201 |
209 |
|