34 |
34 |
*
|
35 |
35 |
* FR:
|
36 |
36 |
*
|
37 |
|
* AsciiUtils.convertNonAscii(str) supprime les accents (note : cette méthode
|
38 |
|
* fonctionne avec une liste fermée de caractères accentués)
|
|
37 |
* AsciiUtils.convertNonAscii(str) supprime les accents
|
39 |
38 |
*
|
40 |
39 |
* AsciiUtils.removePunct(str) supprime les ponctuations
|
41 |
40 |
*
|
... | ... | |
76 |
75 |
private AsciiUtils() {
|
77 |
76 |
}
|
78 |
77 |
|
79 |
|
public static Transliterator asciiFormmater = Transliterator.getInstance("Any-Latin; NFD; [^\\p{Alnum}] Remove");
|
|
78 |
public static Transliterator asciiFormmater = Transliterator.getInstance("Any-Latin; NFD; [^\\p{Alnum}\\p{p}] Remove");
|
80 |
79 |
|
81 |
80 |
// remove accented characters from a string and replace them with their ASCII equivalents
|
82 |
81 |
/**
|
83 |
|
* Convert non ascii.
|
|
82 |
* Converts non-ASCII characters. Warning: punctuation is not removed.
|
84 |
83 |
*
|
85 |
84 |
* @param s
|
86 |
85 |
* the string to convert
|
... | ... | |
126 |
125 |
*/
|
127 |
126 |
public static String buildWordId(String s) {
|
128 |
127 |
|
129 |
|
if (s.length() == 0)
|
|
128 |
if (s.length() == 0) {
|
130 |
129 |
return s;
|
131 |
|
|
|
130 |
}
|
|
131 |
|
132 |
132 |
// ensure the "w_" prefix presence
|
133 |
133 |
if (s.startsWith("w")) {
|
134 |
134 |
if (!s.startsWith("w_")) {
|
... | ... | |
137 |
137 |
} else {
|
138 |
138 |
s = "w_" + s;
|
139 |
139 |
}
|
|
140 |
//System.out.println("first="+s);
|
140 |
141 |
|
141 |
142 |
String rez = convertNonAscii(s).toLowerCase();
|
|
143 |
//System.out.println("nonasscii="+rez);
|
142 |
144 |
rez = rez.replaceAll("\\p{Space}++", "_");
|
143 |
|
rez = rez.replaceAll("[¤€§µ£°().,;:/?§%\"’'*+\\-}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$
|
|
145 |
//System.out.println("spaces="+rez);
|
|
146 |
rez = rez.replaceAll("[¤€§µ£°().,;:/?!@§%\\\\\"’ʹ'*+\\-}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$ // "[^\\P{P}_]"
|
|
147 |
//System.out.println("ponc="+rez);
|
144 |
148 |
|
145 |
149 |
return rez;
|
146 |
150 |
}
|
... | ... | |
172 |
176 |
String rez = convertNonAscii(s).toLowerCase();
|
173 |
177 |
rez = rez.replaceAll("\\p{Space}++", "_");
|
174 |
178 |
rez = rez.replaceAll("_", "-");
|
175 |
|
rez = rez.replaceAll("[¤€§µ£°().,;:/?§%\"’'*+\\}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$
|
|
179 |
rez = rez.replaceAll("[¤€§µ£°().,;:/?!@§%\\\\\"’ʹ'*+\\}\\]\\[{#~&]", ""); //$NON-NLS-1$ //$NON-NLS-2$
|
176 |
180 |
// remove leading characters while they are digits
|
177 |
181 |
char c = rez.charAt(0);
|
178 |
182 |
while (c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || c == '5' || c == '6' || c == '7' || c == '8' || c == '9') {
|
... | ... | |
193 |
197 |
* the arguments
|
194 |
198 |
*/
|
195 |
199 |
public static void main(String args[]) {
|
196 |
|
String s = "01The result : È,É,Ê,Ë,Û,Ù,Ï,Î,À,Â,Ô,è,é,ê,ë,û,ù,ï,î,à,â,ô,ç 0 1 2 3 4 5 6 7 8 9 10"; //$NON-NLS-1$
|
|
200 |
String s = "01The result : _ тврьдо È,É,Ê,Ë,Û,Ù,Ï,Î,À,Â,Ô,è,é,ê,ë,û,ù,ï,î,à,â,ô,ç 0 1 2 3 4 5 6 7 8 9 10"; //$NON-NLS-1$
|
197 |
201 |
System.out.println(AsciiUtils.convertNonAscii(s));
|
198 |
|
System.out.println(AsciiUtils.buildId(s));
|
|
202 |
//System.out.println(AsciiUtils.buildId(s));
|
|
203 |
String s2 = "w_тврьдо_123&é\"'(-è_çà)=/*-+~#{[|`\\^@]}¤;:!§/.?µ%£°";
|
|
204 |
System.out.println("nonascii="+AsciiUtils.convertNonAscii(s2));
|
|
205 |
System.out.println("word_id="+AsciiUtils.buildWordId(s2));
|
|
206 |
System.out.println("attribute_id="+AsciiUtils.buildAttributeId(s2));
|
199 |
207 |
// output :
|
200 |
208 |
// The result : E,E,E,E,U,U,I,I,A,A,O,e,e,e,e,u,u,i,i,a,a,o,c
|
201 |
209 |
|