Revision 3057

tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3057)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

  
22
package org.txm.tokenizer;
23

  
24
import java.io.File;
25
import java.util.LinkedHashMap;
26
import java.util.regex.Matcher;
27
import java.util.regex.Pattern;
28

  
29
import org.txm.tokenizer.TokenizerClasses;
30

  
31
public class SimpleStringTokenizer implements StringTokenizer {
32
	
33
	/** The punct_strong. */
34
	String punct_strong = TokenizerClasses.punct_strong;
35
	
36
	/** The punct_all. */
37
	String punct_all = TokenizerClasses.punct_all;
38
	
39
	/** The word_chars. */
40
	String word_chars = TokenizerClasses.word_chars;
41
	
42
	String fclitics = null; // default behavior: do not manage clitics
43
	
44
	String pclitics = null; // default behavior: do not manage clitics
45
	
46
	Pattern reg_punct_other = Pattern.compile("\\p{P}");
47
	
48
	/** The DEBUG. */
49
	public boolean DEBUG = false;
50
	
51
	String lang;
52
	
53
	def regElision = null;
54
	
55
	def reg3pts = null;
56
	
57
	def regPunct;
58
	
59
	def regFClitics = null;
60
	
61
	def regPClitics = null;
62
	
63
	String whitespaces;
64
	
65
	Pattern regWhitespaces;
66
	
67
	Pattern regSplitWhiteSpaces;
68
	
69
	/**
70
	 * Instantiates a new simple string tokenizer.
71
	 *
72
	 * @param lang the language code used to select the clitic rules (e.g. "fr", "en", "it", "gl")
73
	 */
74
	public SimpleStringTokenizer(String lang) {
75
		this.lang = lang;
76
		if (lang != null)
77
			if (lang.startsWith("en")) {
78
				fclitics = TokenizerClasses.FClitic_en;
79
			} else if (lang.startsWith("fr")) {
80
				fclitics = TokenizerClasses.FClitic_fr;
81
				pclitics = TokenizerClasses.PClitic_fr;
82
			} else if (lang.startsWith("gl")) {
83
				fclitics = TokenizerClasses.FClitic_gl;
84
			} else if (lang.startsWith("it")) {
85
				pclitics = TokenizerClasses.PClitic_it;
86
			}
87
		
88
		punct_strong = TokenizerClasses.punct_strong;
89
		punct_all = TokenizerClasses.punct_all;
90
		word_chars = TokenizerClasses.word_chars;
91
		
92
		String strRegElision = TokenizerClasses.regElision;
93
		if (strRegElision != null && strRegElision.length() > 0) {
94
			regElision = ~/^([\p{L}-]++${strRegElision}[\p{L}-]++)(.*)$/ // the test must start with the start of string  ^
95
		}
96
		reg3pts = ~/^(.*?)(\.\.\.)(.*)$/
97
		
98
		String strRegPunct = TokenizerClasses.regPunct;
99
		if (strRegPunct != null && strRegPunct.length() > 0) {
100
			regPunct = ~/^(.*?)($strRegPunct)(.*)$/
101
		}
102
		if (fclitics != null && fclitics.length() > 0) {
103
			regFClitics = ~/(.+)($fclitics)$/ // the test must end with the end of string $
104
		}
105
		if (pclitics != null && pclitics.length() > 0)
106
			regPClitics = /^($pclitics)(.*)/ // the test must start with the start of string  ^
107
		
108
		if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0) {
109
			regWhitespaces = ~TokenizerClasses.whitespaces;
110
		}
111
		
112
		if (TokenizerClasses.whitespaces != null && TokenizerClasses.whitespaces.length() > 0) {
113
			regSplitWhiteSpaces = Pattern.compile(TokenizerClasses.whitespaces);
114
		}
115
	}
116
	
117
	public final static String WHITESPACE = " ";
118
	
119
	public final static String EMPTY = "";
120
	
121
	/**
122
	 * Processes a text: splits it on the whitespaces regexp, then tokenizes each piece.
123
	 */
124
	public ArrayList<String> processText(String text) {
125
		ArrayList<String> result = new ArrayList<String>();
126
		if (regSplitWhiteSpaces != null) {
127
			for (String s : regSplitWhiteSpaces.split(text)) {		// separate with unicode white spaces
128
				// if (DEBUG){println "process $s"}
129
				result.addAll(iterate(s));
130
			}
131
		}
132
		else {
133
			result.addAll(iterate(text));
134
		}
135
		return result;
136
	}
137
	
138
	/**
139
	 * Iterates over a String; should be called when a word is found in a String.
140
	 *
141
	 * @param s the string to tokenize
142
	 * @return the list of tokens found in s
143
	 */
144
	protected ArrayList<String> iterate(String s) {
145
		ArrayList<String> result = new ArrayList<String>();
146
		while (s != null && s.length() > 0) {
147
			//	if (DEBUG){println "  > $s"}
148
			s = standardChecks(result, s);
149
		}
150
		return result;
151
	}
152
	
153
	/**
154
	 * Runs the standard tokenization checks on a string, adding the recognized tokens to result.
155
	 *
156
	 * @param s the string to check
157
	 * @return the remaining part of the string still to be processed
158
	 */
159
	//	@CompileStatic(SKIP)
160
	public String standardChecks(ArrayList<String> result, String s) {
161
		Matcher m;
162
		
163
		for (TTest test : TokenizerClasses.tests) {
164
			if ((m = s =~ test.regex)) {
165
				if (DEBUG) {println "test : "+test.regex}
166
				if (test.before > 0) {
167
					result.addAll(iterate(m[0][test.before]))
168
				}
169
				if (test.hit > 0) {
170
					result.add(m[0][test.hit]);
171
				}
172
				if (test.after > 0) {
173
					return m[0][test.after];
174
				}
175
			}
176
		}
177
		
178
		if (fclitics != null && (m = s =~ regFClitics) ) {
179
			//	if (DEBUG) println "CLITIC found: $s ->"+ m
180
			result.addAll(iterate(m.group(1)))
181
			
182
			result.add(m.group(2));
183
			
184
			return "";
185
		} else if (pclitics != null && (m = s =~ regPClitics) ) {
186
			if (DEBUG) println "PCLITIC found: $s ->"+ m
187
			
188
			result.add(m.group(1));
189
			
190
			result.addAll(iterate(m.group(2)))
191
			
192
			return "";
193
		} else if (regElision != null && (m = s =~ regElision) ) {
194
			//	if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
195
			//iterate(m.group(1))
196
			
197
			//			int sep = s.indexOf("'");
198
			//			if (sep < 0)
199
			//				sep = s.indexOf("’");
200
			//			if (sep < 0)
201
			//				sep = s.indexOf("‘");
202
			
203
			result.add(m.group(1));
204
			
205
			result.addAll(iterate(m.group(2)))
206
			
207
			return "";
208
		} else if (reg3pts != null && (m = s =~ reg3pts) )	{
209
			//	if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
210
			result.addAll(iterate(m.group(1)))
211
			
212
			result.add("...");
213
			
214
			return m.group(3);
215
		} else if (regPunct != null && (m = s =~ regPunct) ) {
216
			if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
217
			result.addAll(iterate(m.group(1)))
218
			
219
			result.add(m.group(2));
220
			
221
			return m.group(3);
222
		} else {
223
			//		if(DEBUG){println "Other found: "+s}
224
			result.add(s);
225
			
226
			return "";
227
		}
228
	}
229
	
230
	/**
231
	 * The main method.
232
	 *
233
	 * @param args the arguments
234
	 */
235
	public static void main(String[] args) {
236
		def tests = [
237
			["fr", "c'est un test."],
238
			["fr", "C'est un autre test."],
239
			[
240
				"fr",
241
				"C'est une version 0.8.1 un 01:12:12 test vers http://un.site.web.fr, fin."
242
			],
243
			["en", "This is a test."],
244
			["en", "It's a test."]
245
		]
246
		
247
		for (def d : tests) {
248
			String lang = d[0]
249
			String text = d[1]
250
			SimpleStringTokenizer tokenizer = new SimpleStringTokenizer(lang)
251
			println "Process: $text"
252
			println "Result : "+tokenizer.processText(text).collect{"<"+it+">"}
253
		}
254
	}
255
}
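
Note: the Groovy main() above already exercises the class; as a hedged usage sketch only, the same call from plain Java (assuming the classes of this revision are on the classpath; TokenizerDemo and the sample sentence are illustrative, not part of the revision):

import java.util.List;

import org.txm.tokenizer.SimpleStringTokenizer;
import org.txm.tokenizer.StringTokenizer;

public class TokenizerDemo {

	public static void main(String[] args) {
		// "fr" enables the French proclitic and enclitic rules in the constructor above.
		StringTokenizer tokenizer = new SimpleStringTokenizer("fr");

		// processText() splits the input on the whitespaces regexp, then applies the
		// TTest rules, clitic, elision, "..." and punctuation patterns to each piece.
		List<String> tokens = tokenizer.processText("C'est un autre test.");
		for (String token : tokens) {
			System.out.print("<" + token + "> ");
		}
		System.out.println();
	}
}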
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/StringTokenizer.java (revision 3057)
1
package org.txm.tokenizer;
2

  
3
import java.util.List;
4

  
5
public interface StringTokenizer {
6
	
7
	List<String> processText(String text);
8
}
0 9
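
The interface above is the only contract the tokenizers share: a single processText() method. As a hedged illustration of that contract only (WhitespaceStringTokenizer is not part of this revision), an alternative implementation could be as small as:

package org.txm.tokenizer;

import java.util.ArrayList;
import java.util.List;

// Illustrative implementation of the StringTokenizer contract: it only splits on
// whitespace and does none of the clitic/punctuation handling of SimpleStringTokenizer.
public class WhitespaceStringTokenizer implements StringTokenizer {

	@Override
	public List<String> processText(String text) {
		List<String> tokens = new ArrayList<String>();
		for (String token : text.trim().split("\\s+")) {
			if (!token.isEmpty()) {
				tokens.add(token);
			}
		}
		return tokens;
	}
}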

  
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TTest.java (revision 3057)
1 1
package org.txm.tokenizer;
2 2

  
3 3
public class TTest {
4
	
4 5
	String regex;
6
	
5 7
	String type = "w";
6
	int before=1, hit=2, after=3;
8
	
9
	int before = 1, hit = 2, after = 3;
10
	
7 11
	public TTest(String regex, String type, int before, int hit, int after) {
8 12
		this.regex = regex;
9 13
		this.type = type;
......
15 19
	public String getRegex() {
16 20
		return regex;
17 21
	}
18

  
22
	
19 23
	public TTest(String regex, int before, int hit, int after) {
20 24
		this.regex = regex;
21 25
		this.before = before;
22 26
		this.hit = hit;
23 27
		this.after = after;
24 28
	}
25

  
29
	
26 30
	public TTest(String regex, String type) {
27 31
		this.regex = regex;
28
		this.type= type;
32
		this.type = type;
29 33
	}
30

  
34
	
31 35
	public TTest(String regex) {
32 36
		this.regex = regex;
33 37
	}
34

  
35
	/* (non-Javadoc)
38
	
39
	/*
40
	 * (non-Javadoc)
36 41
	 * @see java.lang.Object#toString()
37 42
	 */
38 43
	@Override
39 44
	public String toString() {
40
		return "[regex=" + regex + ", type=" + type + ", before="+ before + ", hit=" + hit + ", after=" + after + "]";
45
		return "[regex=" + regex + ", type=" + type + ", before=" + before + ", hit=" + hit + ", after=" + after + "]";
41 46
	}
42 47
}
43 48
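
A TTest bundles a regexp with the group indices consumed by SimpleStringTokenizer.standardChecks(): the "before" group is re-tokenized, the "hit" group is kept as a single token, and the "after" group is returned for further processing. A hedged sketch of that mapping, reusing the "time" rule from the commented-out list in TokenizerClasses below (TTestDemo itself is illustrative only):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.txm.tokenizer.TTest;

public class TTestDemo {

	public static void main(String[] args) {
		// Same regexp and group indices as the commented-out "time" rule.
		TTest test = new TTest("^(.*)([0-9][0-9]:[0-9][0-9]:[0-9][0-9])(.*)$", "time", 1, 2, 3);

		Matcher m = Pattern.compile(test.getRegex()).matcher("un 01:12:12 test");
		if (m.matches()) {
			System.out.println("before = '" + m.group(1) + "'"); // "un " -> re-tokenized
			System.out.println("hit    = '" + m.group(2) + "'"); // "01:12:12" -> kept as one token
			System.out.println("after  = '" + m.group(3) + "'"); // " test" -> processed next
		}
	}
}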

  
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 3057)
27 27
//
28 28
package org.txm.tokenizer;
29 29

  
30
import java.io.File;
31
import java.io.FileWriter;
32 30
import java.io.IOException;
33 31
import java.lang.reflect.Field;
34 32
import java.lang.reflect.Modifier;
35
import java.util.ArrayList;
36
import java.util.List;
37
import java.util.prefs.Preferences;
38 33
import java.util.regex.Pattern;
39 34

  
35
import javax.xml.parsers.ParserConfigurationException;
36

  
40 37
import org.eclipse.core.resources.ProjectScope;
41 38
import org.eclipse.core.runtime.preferences.IEclipsePreferences;
42 39
import org.osgi.service.prefs.BackingStoreException;
43
import org.txm.utils.xml.DomUtils;
44 40
import org.w3c.dom.Document;
45 41
import org.w3c.dom.Element;
46 42
import org.w3c.dom.Node;
47 43
import org.w3c.dom.NodeList;
48 44
import org.xml.sax.SAXException;
49 45

  
50
import javax.xml.parsers.ParserConfigurationException;
51
import javax.xml.stream.*;
52
import java.net.URL;
53

  
54 46
/**
55 47
 * Holder for tokenizer rules regexp
56 48
 */
57 49
public class TokenizerClasses {
58

  
50
	
59 51
	//// ROOTS ? ////
60 52
	public static boolean debug = false;
53
	
61 54
	/** The tag_all. */
62 55
	public static String tag_all = null;
63

  
56
	
64 57
	/** The Weblex enclitics. */
65 58
	public static String enclitics = null;
59
	
66 60
	public static String encliticsFR = null;
67

  
61
	
68 62
	/** The TT enclitics. */
69 63
	public static String FClitic_en = null;
64
	
70 65
	public static String PClitic_fr = null;
66
	
71 67
	public static String FClitic_fr = null;
68
	
72 69
	public static String PClitic_it = null;
70
	
73 71
	public static String FClitic_gl = null;
74

  
72
	
75 73
	/** The div_tags. */
76 74
	public static String div_tags = null;
77

  
75
	
78 76
	/** The q_tags. */
79 77
	public static String q_tags = null;
80

  
78
	
81 79
	/** The extraword_tags. */
82 80
	public static String extraword1_tags = null;
83

  
81
	
84 82
	/** The corr_tags_no_seg. */
85 83
	public static String corr_tags_no_seg = null;
86

  
84
	
87 85
	/** The word_tags. */
88 86
	public static String word_tags = null;
87
	
89 88
	/** the element to create when a word is created */
90 89
	public static String word_element_to_create = null;
91

  
90
	
92 91
	/** The intraword_tags. */
93 92
	public static String intraword_tags = null;
94

  
93
	
95 94
	/** The punct_quotes. */
96 95
	public static String punct_quotes = null;
97

  
96
	
98 97
	/** The punct_strong1. */
99 98
	public static String punct_strong1 = null;
100

  
99
	
101 100
	/** The punct_strong2. */
102 101
	public static String punct_strong2 = null;
103

  
102
	
104 103
	/** The punct_paren_open1. */
105 104
	public static String punct_paren_open1 = null;
106

  
105
	
107 106
	/** The punct_paren_open2. */
108 107
	public static String punct_paren_open2 = null;
109

  
108
	
110 109
	/** The punct_paren_close1. */
111 110
	public static String punct_paren_close1 = null;
112

  
111
	
113 112
	/** The punct_paren_close2. */
114 113
	public static String punct_paren_close2 = null;
115

  
114
	
116 115
	/** The punct_weak. */
117 116
	public static String punct_weak = null;
118

  
117
	
119 118
	public static String entity = null;
120

  
119
	
121 120
	public static String seg_tags = null;
122

  
121
	
123 122
	//// COMBINAISONS ////
124

  
123
	
125 124
	/** The corr_tags. */
126 125
	public static String corr_tags = null;
126
	
127 127
	/** The extraword_tags. */
128 128
	public static String extraword_tags = null;
129

  
129
	
130 130
	/** The punct_strong. */
131 131
	public static String punct_strong = null;
132

  
132
	
133 133
	/** The punct_paren_open. */
134 134
	public static String punct_paren_open = null;
135

  
135
	
136 136
	/** The punct_paren_close. */
137 137
	public static String punct_paren_close = null;
138

  
138
	
139 139
	/** The punct_paren. */
140 140
	public static String punct_paren = null;
141

  
141
	
142 142
	/** The punct_all. */
143 143
	public static String punct_all = null;
144

  
144
	
145 145
	/** The word_chars. */
146 146
	public static String word_chars = null;
147

  
147
	
148 148
	public static TTest[] tests = {
149
			//		new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
150
			//		new TTest(/\A(.*)(-)()\Z/, "pon"),
151
			//		//new TTest(/\A(.*)(Bande dessinée|eau de vie|machine à écrire|Moyen Âge|petit pois|poule d'eau|Avoir l'air|se rendre compte|faire semblant|prendre froid|s'en aller|Comme il faut|bon marché|bon enfant|en retard|en colère|à la mode|de bonne humeur|hors de propos|de travers|en vacances|de standing|à l'abandon|sans défense|pieds nus|Tout à fait|d'ores et déjà|de temps en temps|tout de suite|de gré ou de force|ne pas|ne jamais|ne plus|sans doute|on ne peut plus|Quelque chose|ce dernier|n'importe quoi|N'importe quel|beaucoup de|plus d'un|peu de|un tas de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
152
			//		new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),
153
			//		
149
			// new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
150
			// new TTest(/\A(.*)(-)()\Z/, "pon"),
151
			// new TTest(/\A(.*)(Bande dessinée|eau de vie|machine à écrire|Moyen Âge|petit pois|poule d'eau|Avoir l'air|se rendre compte|faire semblant|prendre froid|s'en aller|Comme il faut|bon
152
			// marché|bon enfant|en retard|en colère|à la mode|de bonne humeur|hors de propos|de travers|en vacances|de standing|à l'abandon|sans défense|pieds nus|Tout à fait|d'ores et déjà|de temps
153
			// en temps|tout de suite|de gré ou de force|ne pas|ne jamais|ne plus|sans doute|on ne peut plus|Quelque chose|ce dernier|n'importe quoi|N'importe quel|beaucoup de|plus d'un|peu de|un tas
154
			// de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma
155
			// barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
156
			// new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),
157
			
154 158
			// BFM ONLY
155
			//		new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"), 
156

  
159
			// new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"),
160
			
157 161
			// TXM REFMAN ONLY
158
			//		new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
159
			//		new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
160
			//		new TTest(/\A(.*)([0-9][0-9]:[0-9][0-9]:[0-9][0-9])(.*)\Z/, "time", 1,2,3),
161
			//		new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+[a-z]+))(.*)\Z/, "version", 1,2,4),
162
			//		new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+)(\.[0-9]+)+)(\.[0-9]+[a-z]+)?(.*)\Z/, "version", 1,2,6),
163
			//		new TTest(/\A()([a-z]++:\/\/)(.*)\Z/, "protocol", 1,2,3),
164
			//		new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
165
			//		new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),
166

  
167
			//		new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
168
			//		new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
169
			//		new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
162
			// new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
163
			// new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
164
			// new TTest("^(.*)([0-9][0-9]:[0-9][0-9]:[0-9][0-9])(.*)$", "time", 1, 2, 3),
165
			// new TTest("^([^0-9]*)([0-9]+(\\.[0-9]+)?(\\.[0-9]+[a-z]*))(.*)$", "version", 1, 2, 5),
166
			// new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+)(\.[0-9]+)+)(\.[0-9]+[a-z]+)?(.*)\Z/, "version", 1,2,6),
167
			// new TTest("^(.*)(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)(.*)$", "url", 1, 2, 5),
168
			// new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
169
			// new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),
170
			
171
			// new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
172
			// new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
173
			// new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
170 174
	};
171

  
175
			
172 176
	public static String whitespaces = null;
177
	
173 178
	public static String regElision = null;
179
	
174 180
	public static String regPunct = null;
175 181
	
176 182
	static {
177 183
		reset(); // Initialize static fields
178 184
	}
179

  
180
	//punct_strong1 = ".!?";
181
	//punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
182
	//punct_strong = "[$punct_strong1]|$punct_strong2";
183
	//punct_paren_open1 = "«";
184
	//punct_paren_open2 = "<<|``|\\(|\\[|\\{";
185
	//punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
186
	//punct_paren_close1 = "»";
187
	//punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
188
	//punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
189
	//punct_paren = "$punct_paren_open|$punct_paren_close";
190
	//punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
191
	//punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
192
	//word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
193

  
185
	
186
	// punct_strong1 = ".!?";
187
	// punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
188
	// punct_strong = "[$punct_strong1]|$punct_strong2";
189
	// punct_paren_open1 = "«";
190
	// punct_paren_open2 = "<<|``|\\(|\\[|\\{";
191
	// punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
192
	// punct_paren_close1 = "»";
193
	// punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
194
	// punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
195
	// punct_paren = "$punct_paren_open|$punct_paren_close";
196
	// punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
197
	// punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
198
	// word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
199
	
194 200
	public static void reset() {
195 201
		tag_all = "<[A-Za-z][^>]+>";
196 202
		
......
221 227
		FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
222 228
		PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
223 229
		FClitic_gl = "-la|-las|-lo|-los|-nos";
224

  
230
		
225 231
		whitespaces = "[\\p{Z}\\p{C}]+";
226 232
		regElision = "['‘’]";
227 233
		regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
228

  
234
		
229 235
		recombine();
230 236
	}
231

  
237
	
232 238
	public static void recombine() {
233
		corr_tags = ""+corr_tags_no_seg+"|"+seg_tags+"";
234
		extraword_tags = ""+div_tags+"|"+q_tags+"|"+extraword1_tags+"";
235
		punct_strong = "["+punct_strong1+"]|"+punct_strong2+"";
236
		punct_paren_open = ""+punct_paren_open1+"|"+punct_paren_open2+"";
237
		punct_paren_close = ""+punct_paren_close1+"|"+punct_paren_close2+"";
238
		punct_paren = ""+punct_paren_open+"|"+punct_paren_close+"";
239
		punct_all = ""+punct_strong+"|"+punct_paren+"|["+punct_weak+"]";
240
		word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]+|"+entity+"";
239
		corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + "";
240
		extraword_tags = "" + div_tags + "|" + q_tags + "|" + extraword1_tags + "";
241
		punct_strong = "[" + punct_strong1 + "]|" + punct_strong2 + "";
242
		punct_paren_open = "" + punct_paren_open1 + "|" + punct_paren_open2 + "";
243
		punct_paren_close = "" + punct_paren_close1 + "|" + punct_paren_close2 + "";
244
		punct_paren = "" + punct_paren_open + "|" + punct_paren_close + "";
245
		punct_all = "" + punct_strong + "|" + punct_paren + "|[" + punct_weak + "]";
246
		word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + "";
241 247
	}
242

  
248
	
243 249
	/**
244 250
	 * To dom.
245 251
	 *
......
253 259
		tokenizer.setAttribute("onlyThoseTests", "false");
254 260
		parent.appendChild(tokenizer);
255 261
		
256
		//String tag_all = "<[^>]+>";
262
		// String tag_all = "<[^>]+>";
257 263
		Element p = doc.createElement("param");
258
		p.setAttribute("key","tag_all");
264
		p.setAttribute("key", "tag_all");
259 265
		p.setTextContent(tag_all);
260 266
		tokenizer.appendChild(p);
261
		//String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
267
		// String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
262 268
		p = doc.createElement("param");
263
		p.setAttribute("key","enclitics");
269
		p.setAttribute("key", "enclitics");
264 270
		p.setTextContent(enclitics);
265 271
		tokenizer.appendChild(p);
266
		//String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
272
		// String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
267 273
		p = doc.createElement("param");
268
		p.setAttribute("key","encliticsFR");
274
		p.setAttribute("key", "encliticsFR");
269 275
		p.setTextContent(encliticsFR);
270 276
		tokenizer.appendChild(p);
271
		//String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
277
		// String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
272 278
		p = doc.createElement("param");
273
		p.setAttribute("key","div_tags");
279
		p.setAttribute("key", "div_tags");
274 280
		p.setTextContent(div_tags);
275 281
		tokenizer.appendChild(p);
276
		//String q_tags = "q|quote|item|stage";
282
		// String q_tags = "q|quote|item|stage";
277 283
		p = doc.createElement("param");
278
		p.setAttribute("key","q_tags");
284
		p.setAttribute("key", "q_tags");
279 285
		p.setTextContent(q_tags);
280 286
		tokenizer.appendChild(p);
281
		//String extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
287
		// String extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
282 288
		p = doc.createElement("param");
283
		p.setAttribute("key","extraword_tags");
289
		p.setAttribute("key", "extraword_tags");
284 290
		p.setTextContent(extraword_tags);
285 291
		tokenizer.appendChild(p);
286
		//String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
292
		// String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
287 293
		p = doc.createElement("param");
288
		p.setAttribute("key","enclitics");
294
		p.setAttribute("key", "enclitics");
289 295
		p.setTextContent(enclitics);
290 296
		tokenizer.appendChild(p);
291
		//String corr_tags = "$corr_tags_no_seg|seg";
297
		// String corr_tags = "$corr_tags_no_seg|seg";
292 298
		p = doc.createElement("param");
293
		p.setAttribute("key","corr_tags");
299
		p.setAttribute("key", "corr_tags");
294 300
		p.setTextContent(corr_tags);
295 301
		tokenizer.appendChild(p);
296
		//String word_tags = "w|abbr|num";
302
		// String word_tags = "w|abbr|num";
297 303
		p = doc.createElement("param");
298
		p.setAttribute("key","word_tags");
304
		p.setAttribute("key", "word_tags");
299 305
		p.setTextContent(word_tags);
300 306
		tokenizer.appendChild(p);
301
		//String intraword_tags = "c|ex";
307
		// String intraword_tags = "c|ex";
302 308
		p = doc.createElement("param");
303
		p.setAttribute("key","intraword_tags");
309
		p.setAttribute("key", "intraword_tags");
304 310
		p.setTextContent(intraword_tags);
305 311
		tokenizer.appendChild(p);
306

  
307
		//String punct_strong1 = ".!?";
312
		
313
		// String punct_strong1 = ".!?";
308 314
		p = doc.createElement("param");
309
		p.setAttribute("key","punct_strong1");
315
		p.setAttribute("key", "punct_strong1");
310 316
		p.setTextContent(punct_strong1);
311 317
		tokenizer.appendChild(p);
312
		//String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
318
		// String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
313 319
		p = doc.createElement("param");
314
		p.setAttribute("key","punct_strong2");
320
		p.setAttribute("key", "punct_strong2");
315 321
		p.setTextContent(punct_strong2);
316 322
		tokenizer.appendChild(p);
317
		//String punct_strong = "[$punct_strong1]|$punct_strong2";
323
		// String punct_strong = "[$punct_strong1]|$punct_strong2";
318 324
		p = doc.createElement("param");
319
		p.setAttribute("key","punct_strong");
325
		p.setAttribute("key", "punct_strong");
320 326
		p.setTextContent(punct_strong);
321 327
		tokenizer.appendChild(p);
322
		//String punct_paren_open1 = "«";
328
		// String punct_paren_open1 = "«";
323 329
		p = doc.createElement("param");
324
		p.setAttribute("key","punct_paren_open1");
330
		p.setAttribute("key", "punct_paren_open1");
325 331
		p.setTextContent(punct_paren_open1);
326 332
		tokenizer.appendChild(p);
327
		//String punct_paren_open2 = "<<|``|\\(|\\[|\\{";
333
		// String punct_paren_open2 = "<<|``|\\(|\\[|\\{";
328 334
		p = doc.createElement("param");
329
		p.setAttribute("key","punct_paren_open2");
335
		p.setAttribute("key", "punct_paren_open2");
330 336
		p.setTextContent(punct_paren_open2);
331 337
		tokenizer.appendChild(p);
332
		//String punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
338
		// String punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
333 339
		p = doc.createElement("param");
334
		p.setAttribute("key","punct_paren_open");
340
		p.setAttribute("key", "punct_paren_open");
335 341
		p.setTextContent(punct_paren_open);
336 342
		tokenizer.appendChild(p);
337
		//String punct_paren_close1 = "»";
343
		// String punct_paren_close1 = "»";
338 344
		p = doc.createElement("param");
339
		p.setAttribute("key","punct_paren_close1");
345
		p.setAttribute("key", "punct_paren_close1");
340 346
		p.setTextContent(punct_paren_close1);
341 347
		tokenizer.appendChild(p);
342
		//String punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
348
		// String punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
343 349
		p = doc.createElement("param");
344
		p.setAttribute("key","punct_paren_close2");
350
		p.setAttribute("key", "punct_paren_close2");
345 351
		p.setTextContent(punct_paren_close2);
346 352
		tokenizer.appendChild(p);
347
		//String punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
353
		// String punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
348 354
		p = doc.createElement("param");
349
		p.setAttribute("key","punct_paren_close");
355
		p.setAttribute("key", "punct_paren_close");
350 356
		p.setTextContent(punct_paren_close);
351 357
		tokenizer.appendChild(p);
352
		//String punct_paren = "$punct_paren_open|$punct_paren_close";
358
		// String punct_paren = "$punct_paren_open|$punct_paren_close";
353 359
		p = doc.createElement("param");
354
		p.setAttribute("key","punct_paren");
360
		p.setAttribute("key", "punct_paren");
355 361
		p.setTextContent(punct_paren);
356 362
		tokenizer.appendChild(p);
357
		//String punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
363
		// String punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
358 364
		p = doc.createElement("param");
359
		p.setAttribute("key","punct_weak");
365
		p.setAttribute("key", "punct_weak");
360 366
		p.setTextContent(punct_weak);
361 367
		tokenizer.appendChild(p);
362
		//String punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
368
		// String punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
363 369
		p = doc.createElement("param");
364
		p.setAttribute("key","punct_all");
370
		p.setAttribute("key", "punct_all");
365 371
		p.setTextContent(punct_all);
366 372
		tokenizer.appendChild(p);
367
		//String word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
373
		// String word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
368 374
		p = doc.createElement("param");
369
		p.setAttribute("key","word_chars");
375
		p.setAttribute("key", "word_chars");
370 376
		p.setTextContent(word_chars);
371 377
		tokenizer.appendChild(p);
372
		//whitespaces = "[\\p{Z}\\p{C}]+";
378
		// whitespaces = "[\\p{Z}\\p{C}]+";
373 379
		p = doc.createElement("param");
374
		p.setAttribute("key","whitespaces");
380
		p.setAttribute("key", "whitespaces");
375 381
		p.setTextContent(whitespaces);
376 382
		tokenizer.appendChild(p);
377
		//regElision = "['‘’]"
383
		// regElision = "['‘’]"
378 384
		p = doc.createElement("param");
379
		p.setAttribute("key","regElision");
385
		p.setAttribute("key", "regElision");
380 386
		p.setTextContent(regElision);
381 387
		tokenizer.appendChild(p);
382
		//regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"
388
		// regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"
383 389
		p = doc.createElement("param");
384
		p.setAttribute("key","regPunct");
390
		p.setAttribute("key", "regPunct");
385 391
		p.setTextContent(regPunct);
386 392
		tokenizer.appendChild(p);
387 393
	}
388

  
394
	
389 395
	/**
390 396
	 * Dump.
391 397
	 */
392 398
	public static void dump() {
393 399
		System.out.println("BRUT");
394
		System.out.println("tag_all = "+tag_all+"");
395
		System.out.println("enclitics = "+enclitics+"");
396
		System.out.println("encliticsFR = "+encliticsFR+"");
397
		System.out.println("div_tags = "+div_tags+"");
398
		System.out.println("q_tags = "+q_tags+"");
399
		System.out.println("extraword1_tags = "+extraword1_tags+"");
400
		System.out.println("corr_tags_no_seg = "+corr_tags_no_seg+"");
401
		System.out.println("word_tags = "+word_tags+"");
402
		System.out.println("intraword_tags = "+intraword_tags+"");
403
		System.out.println("punct_quotes = "+punct_quotes+"");
404
		System.out.println("punct_strong1 = "+punct_strong1+"");
405
		System.out.println("punct_strong2 = "+punct_strong2+"");
406
		System.out.println("punct_paren_open1 = "+punct_paren_open1+"");
407
		System.out.println("punct_paren_open2 = "+punct_paren_open2+"");
408
		System.out.println("punct_paren_close1 = "+punct_paren_close1+"");
409
		System.out.println("punct_paren_close2 = "+punct_paren_close2+"");
410
		System.out.println("punct_weak = "+punct_weak+"");
411
		System.out.println("entity = "+entity+"");
412
		System.out.println("seg_tags = "+seg_tags+"");
400
		System.out.println("tag_all = " + tag_all + "");
401
		System.out.println("enclitics = " + enclitics + "");
402
		System.out.println("encliticsFR = " + encliticsFR + "");
403
		System.out.println("div_tags = " + div_tags + "");
404
		System.out.println("q_tags = " + q_tags + "");
405
		System.out.println("extraword1_tags = " + extraword1_tags + "");
406
		System.out.println("corr_tags_no_seg = " + corr_tags_no_seg + "");
407
		System.out.println("word_tags = " + word_tags + "");
408
		System.out.println("intraword_tags = " + intraword_tags + "");
409
		System.out.println("punct_quotes = " + punct_quotes + "");
410
		System.out.println("punct_strong1 = " + punct_strong1 + "");
411
		System.out.println("punct_strong2 = " + punct_strong2 + "");
412
		System.out.println("punct_paren_open1 = " + punct_paren_open1 + "");
413
		System.out.println("punct_paren_open2 = " + punct_paren_open2 + "");
414
		System.out.println("punct_paren_close1 = " + punct_paren_close1 + "");
415
		System.out.println("punct_paren_close2 = " + punct_paren_close2 + "");
416
		System.out.println("punct_weak = " + punct_weak + "");
417
		System.out.println("entity = " + entity + "");
418
		System.out.println("seg_tags = " + seg_tags + "");
413 419
		System.out.println("COMBINED");
414
		System.out.println("corr_tags = "+corr_tags+"");
415
		System.out.println("extraword_tags = "+extraword_tags+"");
416
		System.out.println("punct_strong = "+punct_strong+"");
417
		System.out.println("punct_paren_open = "+punct_paren_open+"");
418
		System.out.println("punct_paren_close = "+punct_paren_close+"");
419
		System.out.println("punct_paren = "+punct_paren+"");
420
		System.out.println("punct_all = "+punct_all+"");
421
		System.out.println("word_chars = "+word_chars+"");
422
		System.out.println("whitespaces = "+whitespaces+"");
423
		System.out.println("regElision = "+regElision+"");
424
		System.out.println("regPunct = "+regPunct+"");
420
		System.out.println("corr_tags = " + corr_tags + "");
421
		System.out.println("extraword_tags = " + extraword_tags + "");
422
		System.out.println("punct_strong = " + punct_strong + "");
423
		System.out.println("punct_paren_open = " + punct_paren_open + "");
424
		System.out.println("punct_paren_close = " + punct_paren_close + "");
425
		System.out.println("punct_paren = " + punct_paren + "");
426
		System.out.println("punct_all = " + punct_all + "");
427
		System.out.println("word_chars = " + word_chars + "");
428
		System.out.println("whitespaces = " + whitespaces + "");
429
		System.out.println("regElision = " + regElision + "");
430
		System.out.println("regPunct = " + regPunct + "");
425 431
		System.out.println("TESTS");
426

  
432
		
427 433
		for (TTest test : tests) {
428
			System.out.println(" "+test+"");
434
			System.out.println(" " + test + "");
429 435
		}
430 436
	}
431

  
437
	
432 438
	public static boolean isValid() {
433
		try { Pattern.compile(tag_all); }
434
		catch (Exception e) { System.out.println("tag_all="+tag_all+": "+e);return false; }
435

  
436
		try { Pattern.compile(enclitics); }
437
		catch (Exception e) { System.out.println("enclitics="+enclitics+": "+e);return false; }
438

  
439
		try { Pattern.compile(encliticsFR); }
440
		catch (Exception e) { System.out.println("encliticsFR="+encliticsFR+": "+e);return false; }
441

  
442
		try { Pattern.compile(div_tags); }
443
		catch (Exception e) { System.out.println("div_tags="+div_tags+": "+e);return false; }
444

  
445
		try { Pattern.compile(q_tags); }
446
		catch (Exception e) { System.out.println("q_tags="+q_tags+": "+e);return false; }
447

  
448
		try { Pattern.compile(extraword1_tags); }
449
		catch (Exception e) { System.out.println("extraword1_tags="+extraword1_tags+": "+e);return false; }
450

  
451
		try { Pattern.compile(corr_tags_no_seg); }
452
		catch (Exception e) { System.out.println("corr_tags_no_seg="+corr_tags_no_seg+": "+e);return false; }
453

  
454
		try { Pattern.compile(word_tags); }
455
		catch (Exception e) { System.out.println("word_tags="+word_tags+": "+e);return false; }
456

  
457
		try { Pattern.compile(intraword_tags); }
458
		catch (Exception e) { System.out.println("intraword_tags="+intraword_tags+": "+e);return false; }
459

  
460
		try { Pattern.compile(punct_quotes); }
461
		catch (Exception e) { System.out.println("punct_quotes="+punct_quotes+": "+e);return false; }
462

  
463
		try { Pattern.compile(punct_strong1); }
464
		catch (Exception e) { System.out.println("punct_strong1="+punct_strong1+": "+e);return false; }
465

  
466
		try { Pattern.compile(punct_strong2); }
467
		catch (Exception e) { System.out.println("punct_strong2="+punct_strong2+": "+e);return false; }
468

  
469
		try { Pattern.compile(punct_paren_open1); }
470
		catch (Exception e) { System.out.println("punct_paren_open1="+punct_paren_open1+": "+e);return false; }
471

  
472
		try { Pattern.compile(punct_paren_open2); }
473
		catch (Exception e) { System.out.println("punct_paren_open2="+punct_paren_open2+": "+e);return false; }
474

  
475
		try { Pattern.compile(punct_paren_close1); }
476
		catch (Exception e) { System.out.println("punct_paren_close1="+punct_paren_close1+": "+e);return false; }
477

  
478
		try { Pattern.compile(punct_paren_close2); }
479
		catch (Exception e) { System.out.println("punct_paren_close2="+punct_paren_close2+": "+e);return false; }
480

  
481
		try { Pattern.compile(punct_weak); }
482
		catch (Exception e) { System.out.println("punct_weak="+punct_weak+": "+e);return false; }
483

  
484
		try { Pattern.compile(entity); }
485
		catch (Exception e) { System.out.println("entity="+entity+": "+e);return false; }
486

  
487
		try { Pattern.compile(seg_tags); }
488
		catch (Exception e) { System.out.println("seg_tags="+seg_tags+": "+e);return false; }
489

  
490
		try { Pattern.compile(corr_tags); }
491
		catch (Exception e) { System.out.println("corr_tags: "+e);return false; }
492

  
493
		try { 	Pattern.compile(extraword_tags); }
494
		catch (Exception e) { System.out.println("extraword_tags: "+e);return false; }
495

  
439
		try {
440
			Pattern.compile(tag_all);
441
		}
442
		catch (Exception e) {
443
			System.out.println("tag_all=" + tag_all + ": " + e);
444
			return false;
445
		}
446
		
447
		try {
448
			Pattern.compile(enclitics);
449
		}
450
		catch (Exception e) {
451
			System.out.println("enclitics=" + enclitics + ": " + e);
452
			return false;
453
		}
454
		
455
		try {
456
			Pattern.compile(encliticsFR);
457
		}
458
		catch (Exception e) {
459
			System.out.println("encliticsFR=" + encliticsFR + ": " + e);
460
			return false;
461
		}
462
		
463
		try {
464
			Pattern.compile(div_tags);
465
		}
466
		catch (Exception e) {
467
			System.out.println("div_tags=" + div_tags + ": " + e);
468
			return false;
469
		}
470
		
471
		try {
472
			Pattern.compile(q_tags);
473
		}
474
		catch (Exception e) {
475
			System.out.println("q_tags=" + q_tags + ": " + e);
476
			return false;
477
		}
478
		
479
		try {
480
			Pattern.compile(extraword1_tags);
481
		}
482
		catch (Exception e) {
483
			System.out.println("extraword1_tags=" + extraword1_tags + ": " + e);
484
			return false;
485
		}
486
		
487
		try {
488
			Pattern.compile(corr_tags_no_seg);
489
		}
490
		catch (Exception e) {
491
			System.out.println("corr_tags_no_seg=" + corr_tags_no_seg + ": " + e);
492
			return false;
493
		}
494
		
495
		try {
496
			Pattern.compile(word_tags);
497
		}
498
		catch (Exception e) {
499
			System.out.println("word_tags=" + word_tags + ": " + e);
500
			return false;
501
		}
502
		
503
		try {
504
			Pattern.compile(intraword_tags);
505
		}
506
		catch (Exception e) {
507
			System.out.println("intraword_tags=" + intraword_tags + ": " + e);
508
			return false;
509
		}
510
		
511
		try {
512
			Pattern.compile(punct_quotes);
513
		}
514
		catch (Exception e) {
515
			System.out.println("punct_quotes=" + punct_quotes + ": " + e);
516
			return false;
517
		}
518
		
519
		try {
520
			Pattern.compile(punct_strong1);
521
		}
522
		catch (Exception e) {
523
			System.out.println("punct_strong1=" + punct_strong1 + ": " + e);
524
			return false;
525
		}
526
		
527
		try {
528
			Pattern.compile(punct_strong2);
529
		}
530
		catch (Exception e) {
531
			System.out.println("punct_strong2=" + punct_strong2 + ": " + e);
532
			return false;
533
		}
534
		
535
		try {
536
			Pattern.compile(punct_paren_open1);
537
		}
538
		catch (Exception e) {
539
			System.out.println("punct_paren_open1=" + punct_paren_open1 + ": " + e);
540
			return false;
541
		}
542
		
543
		try {
544
			Pattern.compile(punct_paren_open2);
545
		}
546
		catch (Exception e) {
547
			System.out.println("punct_paren_open2=" + punct_paren_open2 + ": " + e);
548
			return false;
549
		}
550
		
551
		try {
552
			Pattern.compile(punct_paren_close1);
553
		}
554
		catch (Exception e) {
555
			System.out.println("punct_paren_close1=" + punct_paren_close1 + ": " + e);
556
			return false;
557
		}
558
		
559
		try {
560
			Pattern.compile(punct_paren_close2);
561
		}
562
		catch (Exception e) {
563
			System.out.println("punct_paren_close2=" + punct_paren_close2 + ": " + e);
564
			return false;
565
		}
566
		
567
		try {
568
			Pattern.compile(punct_weak);
569
		}
570
		catch (Exception e) {
571
			System.out.println("punct_weak=" + punct_weak + ": " + e);
572
			return false;
573
		}
574
		
575
		try {
576
			Pattern.compile(entity);
577
		}
578
		catch (Exception e) {
579
			System.out.println("entity=" + entity + ": " + e);
580
			return false;
581
		}
582
		
583
		try {
584
			Pattern.compile(seg_tags);
585
		}
586
		catch (Exception e) {
587
			System.out.println("seg_tags=" + seg_tags + ": " + e);
588
			return false;
589
		}
590
		
591
		try {
592
			Pattern.compile(corr_tags);
593
		}
594
		catch (Exception e) {
595
			System.out.println("corr_tags: " + e);
596
			return false;
597
		}
598
		
599
		try {
600
			Pattern.compile(extraword_tags);
601
		}
602
		catch (Exception e) {
603
			System.out.println("extraword_tags: " + e);
604
			return false;
605
		}
606
		
496 607
		if (punct_strong != null)
497
			try { 	Pattern.compile(punct_strong); }
498
		catch (Exception e) { System.out.println("punct_strong: "+e);return false; }
499

  
500
		try { Pattern.compile(punct_paren_open); }
501
		catch (Exception e) { System.out.println("punct_paren_open: "+e);return false; }
502

  
503
		try { Pattern.compile(punct_paren_close); }
504
		catch (Exception e) { System.out.println("punct_paren_close: "+e);return false; }
505

  
506
		try { Pattern.compile(punct_paren); }
507
		catch (Exception e) { System.out.println("punct_paren: "+e);return false; }
508

  
509
		try { Pattern.compile(punct_all); }
510
		catch (Exception e) { System.out.println("punct_all: "+e);return false; }
511

  
512
		try { Pattern.compile(word_chars); }
513
		catch (Exception e) { System.out.println("word_chars: "+e);return false; }
514

  
608
			try {
609
			Pattern.compile(punct_strong);
610
			}
611
			catch (Exception e) {
612
			System.out.println("punct_strong: " + e);
613
			return false;
614
			}
615
			
616
		try {
617
			Pattern.compile(punct_paren_open);
618
		}
619
		catch (Exception e) {
620
			System.out.println("punct_paren_open: " + e);
621
			return false;
622
		}
623
		
624
		try {
625
			Pattern.compile(punct_paren_close);
626
		}
627
		catch (Exception e) {
628
			System.out.println("punct_paren_close: " + e);
629
			return false;
630
		}
631
		
632
		try {
633
			Pattern.compile(punct_paren);
634
		}
635
		catch (Exception e) {
636
			System.out.println("punct_paren: " + e);
637
			return false;
638
		}
639
		
640
		try {
641
			Pattern.compile(punct_all);
642
		}
643
		catch (Exception e) {
644
			System.out.println("punct_all: " + e);
645
			return false;
646
		}
647
		
648
		try {
649
			Pattern.compile(word_chars);
650
		}
651
		catch (Exception e) {
652
			System.out.println("word_chars: " + e);
653
			return false;
654
		}
655
		
515 656
		if (regPunct != null)
516
			try { Pattern.compile(regPunct); }
517
		catch (Exception e) { System.out.println("regPunct="+regPunct+": "+e);return false; }
518

  
657
			try {
658
			Pattern.compile(regPunct);
659
			}
660
			catch (Exception e) {
661
			System.out.println("regPunct=" + regPunct + ": " + e);
662
			return false;
663
			}
664
			
519 665
		if (regElision != null)
520
			try { Pattern.compile(regElision); }
521
		catch (Exception e) { System.out.println("regElision="+regElision+": "+e);return false; }
522

  
666
			try {
667
			Pattern.compile(regElision);
668
			}
669
			catch (Exception e) {
670
			System.out.println("regElision=" + regElision + ": " + e);
671
			return false;
672
			}
673
			
523 674
		if (whitespaces != null)
524
			try { Pattern.compile(whitespaces); }
525
		catch (Exception e) { System.out.println("whitespaces="+whitespaces+": "+e);return false; }
526

  
527
		for(TTest test : tests) {
528
			try { Pattern.compile(test.getRegex()); }
529
			catch (Exception e) { System.out.println(""+test.getRegex()+": "+e);return false; }
675
			try {
676
			Pattern.compile(whitespaces);
677
			}
678
			catch (Exception e) {
679
			System.out.println("whitespaces=" + whitespaces + ": " + e);
680
			return false;
681
			}
682
			
683
		for (TTest test : tests) {
684
			try {
685
				Pattern.compile(test.getRegex());
686
			}
687
			catch (Exception e) {
688
				System.out.println("" + test.getRegex() + ": " + e);
689
				return false;
690
			}
530 691
		}
531 692
		return true;
532 693
	}
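
Taken together, reset() restores the default field values, recombine() derives the combined expressions and isValid() compiles every pattern. A minimal smoke-test sketch (assuming the static API of this revision; TokenizerClassesDemo is illustrative only):

import org.txm.tokenizer.TokenizerClasses;

public class TokenizerClassesDemo {

	public static void main(String[] args) {
		TokenizerClasses.reset();  // restore the defaults and call recombine()
		TokenizerClasses.dump();   // print the raw and combined regexps plus the TTest list
		System.out.println("valid = " + TokenizerClasses.isValid()); // compile every pattern
	}
}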
533

  
694
	
534 695
	/**
535 696
	 * Old way, used in TXM 0.7.9 and in 0.7.9 -> 0.8.0 corpus restoration.
697
	 * 
536 698
	 * @param tokenizerElement
537 699
	 * @return
538 700
	 */
539 701
	@Deprecated
540 702
	public static boolean loadFromNode(Element tokenizerElement) {
541
		//load params
703
		// load params
542 704
		String tmp_strong_punct = null;
543 705
		NodeList params = tokenizerElement.getElementsByTagName("param");
544
		for (int i = 0 ; i < params.getLength() ; i++) {
706
		for (int i = 0; i < params.getLength(); i++) {
545 707
			Element param = (Element) params.item(i);
546 708
			String key = param.getAttribute("name");
547 709
			String value = param.getAttribute("value");
548 710
			if (value == null || value.length() == 0) value = param.getTextContent();
549 711
			if (value.length() == 0) value = null;
550

  
551
			if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
552

  
712
			
713
			if (debug) System.out.println(" Tokenizer parametrized with " + key + "=" + value + "");
714
			
553 715
			if (key.equals("tag_all"))
554 716
				tag_all = value;
555 717
			else if (key.equals("enclitics"))
......
582 744
				punct_paren_close1 = value;
583 745
			else if (key.equals("punct_paren_close2"))
584 746
				punct_paren_close2 = value;
585
			else if (key .equals("punct_weak"))
747
			else if (key.equals("punct_weak"))
586 748
				punct_weak = value;
587 749
			else if (key.equals("entity"))
588 750
				entity = value;
......
597 759
			else if (key.equals("punct_strong")) // this is temporary
598 760
				tmp_strong_punct = value; // this is temporary
599 761
			else
600
				System.out.println("MISSING TOKENIZER KEY: "+key);
762
				System.out.println("MISSING TOKENIZER KEY: " + key);
601 763
		}
602
		//recombine
764
		// recombine
603 765
		recombine();
604 766
		if (tmp_strong_punct != null) punct_strong = tmp_strong_punct;  // this is temporary
605

  
767
		
606 768
		String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests");
607 769
		if ("true".equals(shouldResetTests)) {
608 770
			System.out.println("Warning: tokenizer only using import parameters tests");
609
			//tests = new ArrayList<>();
771
			// tests = new ArrayList<>();
610 772
		}
611

  
773
		
612 774
		NodeList testsList = tokenizerElement.getElementsByTagName("test");
613 775
		if (testsList.getLength() > 0) {
614
			//System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
615
			for (int i = 0 ; i < testsList.getLength() ; i++) {
776
			// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
777
			for (int i = 0; i < testsList.getLength(); i++) {
616 778
				Element test = (Element) testsList.item(i);
617
				//				tests.add(i, new TTest(test.getTextContent(), test.getAttribute("type"),
618
				//						Integer.parseInt(test.getAttribute("before")),
619
				//						Integer.parseInt(test.getAttribute("hit")),
620
				//						Integer.parseInt(test.getAttribute("after")))
621
				//						);
779
				// tests.add(i, new TTest(test.getTextContent(), test.getAttribute("type"),
780
				// Integer.parseInt(test.getAttribute("before")),
781
				// Integer.parseInt(test.getAttribute("hit")),
782
				// Integer.parseInt(test.getAttribute("after")))
783
				// );
622 784
			}
623
			//System.out.println("Tests: "+tests);
785
			// System.out.println("Tests: "+tests);
624 786
		}
625

  
787
		
626 788
		if (isValid())
627 789
			return true;
628 790
		else {
......
631 793
			return false;
632 794
		}
633 795
	}
634

  
796
	
635 797
	/**
636 798
	 * The main method.
637 799
	 *
638 800
	 * @param args the arguments
639
	 * @throws IOException 
640
	 * @throws SAXException 
641
	 * @throws ParserConfigurationException 
801
	 * @throws IOException
802
	 * @throws SAXException
803
	 * @throws ParserConfigurationException
642 804
	 */
643 805
	public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
644

  
645
//		Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
646
//		Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
647
//		Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
648
//		Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
649
//		System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
806
		
807
		// Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
808
		// Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
809
		// Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
810
		// Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
811
		// System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
650 812
	}
651

  
813
	
652 814
	public static boolean loadFromProject(ProjectScope projectScope) {
653
		//load params
815
		// load params
654 816
		IEclipsePreferences params = projectScope.getNode("Tokenizer");
655 817
		
656 818
		try {
......
663 825
					String value = params.get(name, null);
664 826
					if (value != null) {
665 827
						try {
666
							if (debug) System.out.println(" Tokenizer parametrized with "+name+"="+value);
828
							if (debug) System.out.println(" Tokenizer parametrized with " + name + "=" + value);
667 829
							field.set(field, value);
668 830
						}
669 831
						catch (IllegalArgumentException e) {
......
677 839
					}
678 840
				}
679 841
			}
680
//			// old way, now using Reflection
842
			// // old way, now using Reflection
681 843
			
682
//			String[] keys = params.keys();
683
//			for (String key : keys) {
684
//				//					Element param = (Element) params.item(i);
685
//				//					String key = param.getAttribute("name");
686
//				String value = params.get(key, "");
687
//				//					if (value == null || value.length() == 0) value = param.getTextContent();
688
//				if (value.length() == 0) value = null;
689
//
690
//				if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
691
//
692
//				if (key.equals("tag_all"))
693
//					tag_all = value;
694
//				else if (key.equals("enclitics"))
695
//					enclitics = value;
696
//				else if (key.equals("encliticsFR"))
697
//					encliticsFR = value;
698
//				else if (key.equals("div_tags"))
699
//					div_tags = value;
700
//				else if (key.equals("q_tags"))
701
//					q_tags = value;
702
//				else if (key.equals("extraword1_tags"))
703
//					extraword1_tags = value;
704
//				else if (key.equals("corr_tags_no_seg"))
705
//					corr_tags_no_seg = value;
706
//				else if (key.equals("word_tags")) {
707
//					word_tags = value;
708
//					word_element_to_create = value; // FIXME for now
709
//				} else if (key.equals("intraword_tags"))
710
//					intraword_tags = value;
711
//				else if (key.equals("punct_quotes"))
712
//					punct_quotes = value;
713
//				else if (key.equals("punct_strong1"))
714
//					punct_strong1 = value;
715
//				else if (key.equals("punct_strong2"))
716
//					punct_strong2 = value;
717
//				else if (key.equals("punct_paren_open1"))
718
//					punct_paren_open1 = value;
719
//				else if (key.equals("punct_paren_open2"))
720
//					punct_paren_open2 = value;
721
//				else if (key.equals("punct_paren_close1"))
722
//					punct_paren_close1 = value;
723
//				else if (key.equals("punct_paren_close2"))
724
//					punct_paren_close2 = value;
725
//				else if (key.equals("punct_weak"))
726
//					punct_weak = value;
727
//				else if (key.equals("entity"))
728
//					entity = value;
729
//				else if (key.equals("seg_tags"))
730
//					seg_tags = value;
731
//				else if (key.equals("regPunct"))
732
//					regPunct = value;
733
//				else if (key.equals("regElision"))
734
//					regElision = value;
735
//				else if (key.equals("whitespaces"))
736
//					whitespaces = value;
737
//				else if (key.equals("punct_strong")) // this is temporary
738
//					tmp_strong_punct = value; // this is temporary
739
////				else
740
////					System.out.println("MISSING TOKENIZER KEY: "+key);
741
//			}
742

  
743
			//recombine
844
			// String[] keys = params.keys();
845
			// for (String key : keys) {
846
			// // Element param = (Element) params.item(i);
847
			// // String key = param.getAttribute("name");
848
			// String value = params.get(key, "");
849
			// // if (value == null || value.length() == 0) value = param.getTextContent();
850
			// if (value.length() == 0) value = null;
851
			//
852
			// if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
853
			//
854
			// if (key.equals("tag_all"))
855
			// tag_all = value;
856
			// else if (key.equals("enclitics"))
857
			// enclitics = value;
858
			// else if (key.equals("encliticsFR"))
859
			// encliticsFR = value;
860
			// else if (key.equals("div_tags"))
861
			// div_tags = value;
862
			// else if (key.equals("q_tags"))
863
			// q_tags = value;
864
			// else if (key.equals("extraword1_tags"))
865
			// extraword1_tags = value;
866
			// else if (key.equals("corr_tags_no_seg"))
867
			// corr_tags_no_seg = value;
868
			// else if (key.equals("word_tags")) {
869
			// word_tags = value;
870
			// word_element_to_create = value; // FIXME for now
871
			// } else if (key.equals("intraword_tags"))
872
			// intraword_tags = value;
873
			// else if (key.equals("punct_quotes"))
874
			// punct_quotes = value;
875
			// else if (key.equals("punct_strong1"))
876
			// punct_strong1 = value;
877
			// else if (key.equals("punct_strong2"))
878
			// punct_strong2 = value;
879
			// else if (key.equals("punct_paren_open1"))
880
			// punct_paren_open1 = value;
881
			// else if (key.equals("punct_paren_open2"))
882
			// punct_paren_open2 = value;
883
			// else if (key.equals("punct_paren_close1"))
884
			// punct_paren_close1 = value;
885
			// else if (key.equals("punct_paren_close2"))
886
			// punct_paren_close2 = value;
887
			// else if (key.equals("punct_weak"))
888
			// punct_weak = value;
889
			// else if (key.equals("entity"))
890
			// entity = value;
891
			// else if (key.equals("seg_tags"))
892
			// seg_tags = value;
893
			// else if (key.equals("regPunct"))
894
			// regPunct = value;
895
			// else if (key.equals("regElision"))
896
			// regElision = value;
897
			// else if (key.equals("whitespaces"))
... This diff was truncated because it exceeds the maximum size that can be displayed.
