Revision 887

tmp/org.txm.tokenizer.core/.classpath (revision 887)
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
	<classpathentry kind="src" path="src"/>
	<classpathentry exported="true" kind="con" path="GROOVY_SUPPORT"/>
	<classpathentry exported="true" kind="con" path="GROOVY_DSL_SUPPORT"/>
	<classpathentry kind="output" path="bin"/>
</classpath>

tmp/org.txm.tokenizer.core/META-INF/MANIFEST.MF (revision 887)
Manifest-Version: 1.0
Bundle-ManifestVersion: 2
Bundle-Name: org.txm.tokenizer.core
Bundle-SymbolicName: org.txm.tokenizer.core
Bundle-Version: 1.0.0.qualifier
Automatic-Module-Name: org.txm.tokenizer.core
Bundle-RequiredExecutionEnvironment: JavaSE-1.7
Export-Package: org.txm.tokenizer
Require-Bundle: org.txm.libs.groovy-all;bundle-version="2.3.7",
 org.txm.utils;bundle-version="1.0.0"
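
Since the manifest above exports the org.txm.tokenizer package, another plug-in can use these classes with a single Require-Bundle entry in its own MANIFEST.MF. A minimal sketch (the consuming bundle name org.txm.example.core is hypothetical):

    Manifest-Version: 1.0
    Bundle-ManifestVersion: 2
    Bundle-SymbolicName: org.txm.example.core
    Bundle-Version: 1.0.0.qualifier
    Bundle-RequiredExecutionEnvironment: JavaSE-1.7
    Require-Bundle: org.txm.tokenizer.core;bundle-version="1.0.0"
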
tmp/org.txm.tokenizer.core/.project (revision 887)
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>org.txm.tokenizer.core</name>
	<comment></comment>
	<projects>
	</projects>
	<buildSpec>
		<buildCommand>
			<name>org.eclipse.jdt.core.javabuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
		<buildCommand>
			<name>org.eclipse.pde.ManifestBuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
		<buildCommand>
			<name>org.eclipse.pde.SchemaBuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.eclipse.jdt.groovy.core.groovyNature</nature>
		<nature>org.eclipse.pde.PluginNature</nature>
		<nature>org.eclipse.jdt.core.javanature</nature>
	</natures>
</projectDescription>

tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.groovy (revision 887)
// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2017-04-06 09:11:32 +0200 (jeu. 06 avril 2017) $
// $LastChangedRevision: 3425 $
// $LastChangedBy: mdecorde $
//
package org.txm.tokenizer

import java.io.FileWriter;
import java.util.regex.Pattern;

import org.txm.utils.xml.DomUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import javax.xml.stream.*;
import java.net.URL;

// TODO: Auto-generated Javadoc
/**
 * The Class TokenizerClasses.
 */
class TokenizerClasses {

	//// ROOTS ? ////
	public static boolean debug = false;
	/** The tag_all. */
	public static String tag_all = "<[A-Za-z][^>]+>";

	/** The Weblex enclitics. */
	public static String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
	public static String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR

	/** The TT enclitics. */
	public static String FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
	public static String PClitic_fr = '[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]';
	public static String FClitic_fr = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là';
	public static String PClitic_it = '[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]';
	public static String FClitic_gl = '-la|-las|-lo|-los|-nos';

	/** The div_tags. */
	public static String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";

	/** The q_tags. */
	public static String q_tags = "q|quote|said|item|stage|cit|label|heraldry";

	/** The extraword_tags. */
	public static String extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";

	/** The corr_tags_no_seg. */
	public static String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";

	/** The word_tags. */
	public static String word_tags = "w";

	/** The intraword_tags. */
	public static String intraword_tags = "c|ex|caesura";

	/** The punct_quotes. */
	public static String punct_quotes = "'‘’’"

	/** The punct_strong1. */
	public static String punct_strong1 = ".!?";

	/** The punct_strong2. */
	public static String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";

	/** The punct_paren_open1. */
	public static String punct_paren_open1 = "«";

	/** The punct_paren_open2. */
	public static String punct_paren_open2 = "``|\\(|\\[|\\{";

	/** The punct_paren_close1. */
	public static String punct_paren_close1 = "»";

	/** The punct_paren_close2. */
	public static String punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";

	/** The punct_weak. */
	public static String punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";

	public static String entity = "&[^;]+;";

	public static String seg_tags = "seg"

	//// COMBINAISONS ////

	/** The corr_tags. */
	public static String corr_tags = "$corr_tags_no_seg|$seg_tags";
	/** The extraword_tags. */
	public static String extraword_tags = "$div_tags|$q_tags|$extraword1_tags";

	/** The punct_strong. */
	public static String punct_strong = "[$punct_strong1]+|$punct_strong2";

	/** The punct_paren_open. */
	public static String punct_paren_open = "$punct_paren_open1|$punct_paren_open2";

	/** The punct_paren_close. */
	public static String punct_paren_close = "$punct_paren_close1|$punct_paren_close2";

	/** The punct_paren. */
	public static String punct_paren = "$punct_paren_open|$punct_paren_close";

	/** The punct_all. */
	public static String punct_all = "$punct_strong|$punct_paren|[$punct_weak]";

	/** The word_chars. */
	public static String word_chars = "[^ $punct_quotes$punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|$entity";

	public static def tests = [
		new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
		new TTest(/\A(.*)(-)()\Z/, "pon"),
		//new TTest(/\A(.*)(Bande dessinée|eau de vie|machine à écrire|Moyen Âge|petit pois|poule d'eau|Avoir l'air|se rendre compte|faire semblant|prendre froid|s'en aller|Comme il faut|bon marché|bon enfant|en retard|en colère|à la mode|de bonne humeur|hors de propos|de travers|en vacances|de standing|à l'abandon|sans défense|pieds nus|Tout à fait|d'ores et déjà|de temps en temps|tout de suite|de gré ou de force|ne pas|ne jamais|ne plus|sans doute|on ne peut plus|Quelque chose|ce dernier|n'importe quoi|N'importe quel|beaucoup de|plus d'un|peu de|un tas de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
		new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),

		// BFM ONLY
//		new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"),

		// TXM REFMAN ONLY
//		new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
//		new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
//		new TTest(/\A(.*)([0-9][0-9]:[0-9][0-9]:[0-9][0-9])(.*)\Z/, "time", 1,2,3),
//		new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+[a-z]+))(.*)\Z/, "version", 1,2,4),
//		new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+)(\.[0-9]+)+)(\.[0-9]+[a-z]+)?(.*)\Z/, "version", 1,2,6),
//		new TTest(/\A()([a-z]++:\/\/)(.*)\Z/, "protocol", 1,2,3),
//		new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
//		new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),

		new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
		new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
		new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")

	];
	//tag_all = "<[^>]+>";
	//enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
	//encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
	//div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
	//q_tags = "q|quote|item|stage";
	//extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
	//corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
	//corr_tags = "$corr_tags_no_seg|seg";
	//word_tags = "w|abbr|num";
	//intraword_tags = "c|ex";

	public static String whitespaces = "[\\p{Z}\\p{C}]+";
	public static String regElision = "['‘’]" // obsolete
	public static String regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"

	//punct_strong1 = ".!?";
	//punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
	//punct_strong = "[$punct_strong1]|$punct_strong2";
	//punct_paren_open1 = "«";
	//punct_paren_open2 = "<<|``|\\(|\\[|\\{";
	//punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
	//punct_paren_close1 = "»";
	//punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
	//punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
	//punct_paren = "$punct_paren_open|$punct_paren_close";
	//punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
	//punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
	//word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";

	public static reset() {
		tag_all = "<[^>]+>";
		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
		q_tags = "q|quote|item|stage|cit";
		extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
		corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
		word_tags = "w";
		intraword_tags = "c|ex|caesura";
		punct_quotes = "'‘’’"
		punct_strong1 = ".!?";
		punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
		punct_paren_open1 = "«";
		punct_paren_open2 = "``|\\(|\\[|\\{";
		punct_paren_close1 = "»";
		punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\-≤≥<>\\—ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
		entity = "&[^;]+;";
		seg_tags = "seg"

		whitespaces = "[\\p{Z}\\p{C}]+";
		regElision = "['‘’]"
		regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"

		recombine();
	}

	public static recombine() {
		corr_tags = "$corr_tags_no_seg|$seg_tags";
		extraword_tags = "$div_tags|$q_tags|$extraword1_tags";
		punct_strong = "[$punct_strong1]|$punct_strong2";
		punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
		punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
		punct_paren = "$punct_paren_open|$punct_paren_close";
		punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
		word_chars = "[^ $punct_quotes$punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|$entity";
	}

	/**
	 * To dom.
	 *
	 * @param doc the doc
	 * @param parent the parent
	 * @return the java.lang. object
	 */
	public static toDom(Document doc, Node parent) {
		Element tokenizer = doc.createElement("tokenizer");
		tokenizer.setAttribute("onlyThoseTests", "false")
		parent.appendChild(tokenizer)

		//String tag_all = "<[^>]+>";
		Element p = doc.createElement("param");
		p.setAttribute("key","tag_all");
		p.setTextContent(tag_all);
		tokenizer.appendChild(p)
		//String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
		p = doc.createElement("param");
		p.setAttribute("key","enclitics");
		p.setTextContent(enclitics);
		tokenizer.appendChild(p)
		//String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
		p = doc.createElement("param");
		p.setAttribute("key","encliticsFR");
		p.setTextContent(encliticsFR);
		tokenizer.appendChild(p)
		//String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
		p = doc.createElement("param");
		p.setAttribute("key","div_tags");
		p.setTextContent(div_tags);
		tokenizer.appendChild(p)
		//String q_tags = "q|quote|item|stage";
		p = doc.createElement("param");
		p.setAttribute("key","q_tags");
		p.setTextContent(q_tags);
		tokenizer.appendChild(p)
		//String extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
		p = doc.createElement("param");
		p.setAttribute("key","extraword_tags");
		p.setTextContent(extraword_tags);
		tokenizer.appendChild(p)
		//String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
		p = doc.createElement("param");
		p.setAttribute("key","corr_tags_no_seg");
		p.setTextContent(corr_tags_no_seg);
		tokenizer.appendChild(p)
		//String corr_tags = "$corr_tags_no_seg|seg";
		p = doc.createElement("param");
		p.setAttribute("key","corr_tags");
		p.setTextContent(corr_tags);
		tokenizer.appendChild(p)
		//String word_tags = "w|abbr|num";
		p = doc.createElement("param");
		p.setAttribute("key","word_tags");
		p.setTextContent(word_tags);
		tokenizer.appendChild(p)
		//String intraword_tags = "c|ex";
		p = doc.createElement("param");
		p.setAttribute("key","intraword_tags");
		p.setTextContent(intraword_tags);
		tokenizer.appendChild(p)

		//String punct_strong1 = ".!?";
		p = doc.createElement("param");
		p.setAttribute("key","punct_strong1");
		p.setTextContent(punct_strong1);
		tokenizer.appendChild(p)
		//String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
		p = doc.createElement("param");
		p.setAttribute("key","punct_strong2");
		p.setTextContent(punct_strong2);
		tokenizer.appendChild(p)
		//String punct_strong = "[$punct_strong1]|$punct_strong2";
		p = doc.createElement("param");
		p.setAttribute("key","punct_strong");
		p.setTextContent(punct_strong);
		tokenizer.appendChild(p)
		//String punct_paren_open1 = "«";
		p = doc.createElement("param");
		p.setAttribute("key","punct_paren_open1");
		p.setTextContent(punct_paren_open1);
		tokenizer.appendChild(p)
		//String punct_paren_open2 = "<<|``|\\(|\\[|\\{";
		p = doc.createElement("param");
		p.setAttribute("key","punct_paren_open2");
		p.setTextContent(punct_paren_open2);
		tokenizer.appendChild(p)
		//String punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
		p = doc.createElement("param");
		p.setAttribute("key","punct_paren_open");
		p.setTextContent(punct_paren_open);
		tokenizer.appendChild(p)
		//String punct_paren_close1 = "»";
		p = doc.createElement("param");
		p.setAttribute("key","punct_paren_close1");
		p.setTextContent(punct_paren_close1);
		tokenizer.appendChild(p)
		//String punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
		p = doc.createElement("param");
		p.setAttribute("key","punct_paren_close2");
		p.setTextContent(punct_paren_close2);
		tokenizer.appendChild(p)
		//String punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
		p = doc.createElement("param");
		p.setAttribute("key","punct_paren_close");
		p.setTextContent(punct_paren_close);
		tokenizer.appendChild(p)
		//String punct_paren = "$punct_paren_open|$punct_paren_close";
		p = doc.createElement("param");
		p.setAttribute("key","punct_paren");
		p.setTextContent(punct_paren);
		tokenizer.appendChild(p)
		//String punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
		p = doc.createElement("param");
		p.setAttribute("key","punct_weak");
		p.setTextContent(punct_weak);
		tokenizer.appendChild(p)
		//String punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
		p = doc.createElement("param");
		p.setAttribute("key","punct_all");
		p.setTextContent(punct_all);
		tokenizer.appendChild(p)
		//String word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
		p = doc.createElement("param");
		p.setAttribute("key","word_chars");
		p.setTextContent(word_chars);
		tokenizer.appendChild(p)
		//whitespaces = "[\\p{Z}\\p{C}]+";
		p = doc.createElement("param");
		p.setAttribute("key","whitespaces");
		p.setTextContent(whitespaces);
		tokenizer.appendChild(p)
		//regElision = "['‘’]"
		p = doc.createElement("param");
		p.setAttribute("key","regElision");
		p.setTextContent(regElision);
		tokenizer.appendChild(p)
		//regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"
		p = doc.createElement("param");
		p.setAttribute("key","regPunct");
		p.setTextContent(regPunct);
		tokenizer.appendChild(p)
	}

	/**
	 * Dump.
	 */
	public static void dump() {
		println "BRUT"
		println "tag_all = $tag_all"
		println "enclitics = $enclitics"
		println "encliticsFR = $encliticsFR"
		println "div_tags = $div_tags"
		println "q_tags = $q_tags"
		println "extraword1_tags = $extraword1_tags"
		println "corr_tags_no_seg = $corr_tags_no_seg"
		println "word_tags = $word_tags"
		println "intraword_tags = $intraword_tags"
		println "punct_quotes = $punct_quotes"
		println "punct_strong1 = $punct_strong1"
		println "punct_strong2 = $punct_strong2"
		println "punct_paren_open1 = $punct_paren_open1"
		println "punct_paren_open2 = $punct_paren_open2"
		println "punct_paren_close1 = $punct_paren_close1"
		println "punct_paren_close2 = $punct_paren_close2"
		println "punct_weak = $punct_weak"
		println "entity = $entity"
		println "seg_tags = $seg_tags"
		println "COMBINED"
		println "corr_tags = $corr_tags"
		println "extraword_tags = $extraword_tags"
		println "punct_strong = $punct_strong"
		println "punct_paren_open = $punct_paren_open"
		println "punct_paren_close = $punct_paren_close"
		println "punct_paren = $punct_paren"
		println "punct_all = $punct_all"
		println "word_chars = $word_chars"
		println "whitespaces = $whitespaces"
		println "regElision = $regElision"
		println "regPunct = $regPunct"
		println "TESTS"
		for(def test : tests)
			println " $test"
	}

	public static boolean isValid() {
		try { Pattern.compile(tag_all) }
		catch (Exception e) { println "tag_all=${tag_all}: "+e;return false; }

		try { Pattern.compile(enclitics) }
		catch (Exception e) { println "enclitics=${enclitics}: "+e;return false; }

		try { Pattern.compile(encliticsFR) }
		catch (Exception e) { println "encliticsFR=${encliticsFR}: "+e;return false; }

		try { Pattern.compile(div_tags) }
		catch (Exception e) { println "div_tags=${div_tags}: "+e;return false; }

		try { Pattern.compile(q_tags) }
		catch (Exception e) { println "q_tags=${q_tags}: "+e;return false; }

		try { Pattern.compile(extraword1_tags) }
		catch (Exception e) { println "extraword1_tags=${extraword1_tags}: "+e;return false; }

		try { Pattern.compile(corr_tags_no_seg) }
		catch (Exception e) { println "corr_tags_no_seg=${corr_tags_no_seg}: "+e;return false; }

		try { Pattern.compile(word_tags) }
		catch (Exception e) { println "word_tags=${word_tags}: "+e;return false; }

		try { Pattern.compile(intraword_tags) }
		catch (Exception e) { println "intraword_tags=${intraword_tags}: "+e;return false; }

		try { Pattern.compile(punct_quotes) }
		catch (Exception e) { println "punct_quotes=${punct_quotes}: "+e;return false; }

		try { Pattern.compile(punct_strong1) }
		catch (Exception e) { println "punct_strong1=${punct_strong1}: "+e;return false; }

		try { Pattern.compile(punct_strong2) }
		catch (Exception e) { println "punct_strong2=${punct_strong2}: "+e;return false; }

		try { Pattern.compile(punct_paren_open1) }
		catch (Exception e) { println "punct_paren_open1=${punct_paren_open1}: "+e;return false; }

		try { Pattern.compile(punct_paren_open2) }
		catch (Exception e) { println "punct_paren_open2=${punct_paren_open2}: "+e;return false; }

		try { Pattern.compile(punct_paren_close1) }
		catch (Exception e) { println "punct_paren_close1=${punct_paren_close1}: "+e;return false; }

		try { Pattern.compile(punct_paren_close2) }
		catch (Exception e) { println "punct_paren_close2=${punct_paren_close2}: "+e;return false; }

		try { Pattern.compile(punct_weak) }
		catch (Exception e) { println "punct_weak=${punct_weak}: "+e;return false; }

		try { Pattern.compile(entity) }
		catch (Exception e) { println "entity=${entity}: "+e;return false; }

		try { Pattern.compile(seg_tags) }
		catch (Exception e) { println "seg_tags=${seg_tags}: "+e;return false; }

		try { Pattern.compile(corr_tags) }
		catch (Exception e) { println "corr_tags: "+e;return false; }

		try { Pattern.compile(extraword_tags) }
		catch (Exception e) { println "extraword_tags: "+e;return false; }

		if (punct_strong != null)
		try { Pattern.compile(punct_strong) }
		catch (Exception e) { println "punct_strong: "+e;return false; }

		try { Pattern.compile(punct_paren_open) }
		catch (Exception e) { println "punct_paren_open: "+e;return false; }

		try { Pattern.compile(punct_paren_close) }
		catch (Exception e) { println "punct_paren_close: "+e;return false; }

		try { Pattern.compile(punct_paren) }
		catch (Exception e) { println "punct_paren: "+e;return false; }

		try { Pattern.compile(punct_all) }
		catch (Exception e) { println "punct_all: "+e;return false; }

		try { Pattern.compile(word_chars) }
		catch (Exception e) { println "word_chars: "+e;return false; }

		if (regPunct != null)
		try { Pattern.compile(regPunct) }
		catch (Exception e) { println "regPunct=${regPunct}: "+e;return false; }

		if (regElision != null)
		try { Pattern.compile(regElision) }
		catch (Exception e) { println "regElision=${regElision}: "+e;return false; }

		if (whitespaces != null)
		try { Pattern.compile(whitespaces) }
		catch (Exception e) { println "whitespaces=${whitespaces}: "+e;return false; }

		for(TTest test : tests) {
			try { Pattern.compile(test.regex) }
			catch (Exception e) { println test.regex+": "+e;return false; }
		}
		return true;
	}

	public static boolean loadFromNode(Element tokenizerElement) {
		//load params
		String tmp_strong_punct;
		NodeList params = tokenizerElement.getElementsByTagName("param");
		for (int i = 0 ; i < params.getLength() ; i++) {
			Element param = params.item(i);
			String key = param.getAttribute("name")
			String value = param.getAttribute("value")
			if (value == null || value.length() == 0) value = param.getTextContent()
			if (value.length() == 0) value = null;

			if (debug) println (" Tokenizer parametrized with $key=$value.")
			if (key == "tag_all")
				tag_all = value
			else if (key == "enclitics")
				enclitics = value
			else if (key == "encliticsFR")
				encliticsFR = value
			else if (key == "div_tags")
				div_tags = value
			else if (key == "q_tags")
				q_tags = value
			else if (key == "extraword1_tags")
				extraword1_tags = value
			else if (key == "corr_tags_no_seg")
				corr_tags_no_seg = value
			else if (key == "word_tags")
				word_tags = value
			else if (key == "intraword_tags")
				intraword_tags = value
			else if (key == "punct_quotes")
				punct_quotes = value
			else if (key == "punct_strong1")
				punct_strong1 = value
			else if (key == "punct_strong2")
				punct_strong2 = value
			else if (key == "punct_paren_open1")
				punct_paren_open1 = value
			else if (key == "punct_paren_open2")
				punct_paren_open2 = value
			else if (key == "punct_paren_close1")
				punct_paren_close1 = value
			else if (key == "punct_paren_close2")
				punct_paren_close2 = value
			else if (key == "punct_weak")
				punct_weak = value
			else if (key == "entity")
				entity = value
			else if (key == "seg_tags")
				seg_tags = value
			else if (key == "regPunct")
				regPunct = value
			else if (key == "regElision")
				regElision = value
			else if (key == "whitespaces")
				whitespaces = value
			else if (key == "punct_strong") // this is temporary
				tmp_strong_punct = value // this is temporary
			else
				println "MISSING TOKENIZER KEY: "+key
		}
		//recombine
		recombine();
		if (tmp_strong_punct != null) punct_strong = tmp_strong_punct;  // this is temporary

		String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests")
		if ("true".equals(shouldResetTests)) {
			println "Warning: tokenizer only using import parameters tests"
			tests = []
		}

		NodeList testsList = tokenizerElement.getElementsByTagName("test");
		if (testsList.getLength() > 0) {
			//println "Add "+testsList.getLength()+" tests to the tokenizer"
			for (int i = 0 ; i < testsList.getLength() ; i++) {
				Element test = testsList.item(i);
				tests.add(i, new TTest(test.getTextContent(), test.getAttribute("type"),
						Integer.parseInt(test.getAttribute("before")),
						Integer.parseInt(test.getAttribute("hit")),
						Integer.parseInt(test.getAttribute("after")))
						)
			}
			//println "Tests: "+tests
		}

		if (isValid())
			return true
		else {
			reset();
			recombine();
			return false;
		}
	}

	/**
	 * The main method.
	 *
	 * @param args the arguments
	 */
	public static void main(String[] args) {

		Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
		Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
		Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0)
		Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0)
		println "OK? "+ TokenizerClasses.loadFromNode(tokenizer)
	}
}
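
For orientation only (this fragment is not part of the plug-in sources): judging from loadFromNode() above, which reads the name and value attributes (or the text content) of each param child, the onlyThoseTests attribute, and test children with type/before/hit/after attributes, a <tokenizer> element inside an import.xml could look roughly like the sketch below. The parameter values shown are simply the class defaults, and main() looks the element up under corpora/corpus. Note that toDom() writes a key attribute rather than name.

    <tokenizer onlyThoseTests="false">
      <param name="word_tags" value="w"/>
      <param name="intraword_tags" value="c|ex|caesura"/>
      <param name="seg_tags" value="seg"/>
      <param name="regPunct">[\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}\p{S}]</param>
      <test type="pon" before="1" hit="2" after="3">\A(.*)(\.\.\.)(.*)\Z</test>
    </tokenizer>
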
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/package.html (revision 887)
<html>
<body>
<p>All necessary classes to tokenize XML-TEI</p>
</body>
</html>

tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TTest.groovy (revision 887)
package org.txm.tokenizer

public class TTest {
	def regex;
	def type = "w";
	def before=1, hit=2, after=3;
	public TTest(def regex, String type, int before, int hit, int after) {
		this.regex = regex
		this.type = type
		this.before = before
		this.hit = hit
		this.after = after
	}

	public TTest(def regex, int before, int hit, int after) {
		this.regex = regex
		this.before = before
		this.hit = hit
		this.after = after
	}

	public TTest(def regex, String type) {
		this.regex = regex
		this.type = type
	}

	public TTest(def regex) {
		this.regex = regex
	}

	/* (non-Javadoc)
	 * @see java.lang.Object#toString()
	 */
	@Override
	public String toString() {
		return "[regex=" + regex + ", type=" + type + ", before="+ before + ", hit=" + hit + ", after=" + after + "]";
	}
}
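
As a rough, hypothetical illustration of how the before/hit/after group indices of a TTest are meant to be read (the tokenizer code that actually consumes them is not part of this listing), a test regex splits a token candidate into three captured groups:

    // hypothetical rule modelled on the FClitic_fr patterns in TokenizerClasses
    def test = new TTest(/\A(.*)(-t-ils?)(.*)\Z/, "w", 1, 2, 3)
    def m = ("demande-t-il" =~ test.regex)   // Groovy =~ yields a java.util.regex.Matcher
    if (m.matches()) {
        println m.group(test.before)   // "demande" : text preceding the hit
        println m.group(test.hit)      // "-t-il"   : the matched piece, associated with test.type
        println m.group(test.after)    // ""        : remaining text
    }
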
tmp/org.txm.tokenizer.core/build.properties (revision 887)
source.. = src/
output.. = bin/
bin.includes = META-INF/,\
               .

tmp/org.txm.tokenizer.core/.settings/org.eclipse.jdt.groovy.core.prefs (revision 887)
eclipse.preferences.version=1
groovy.compiler.level=-1
groovy.script.filters=**/*.dsld,y,**/*.gradle,n

tmp/org.txm.tokenizer.core/.settings/org.eclipse.jdt.core.prefs (revision 887)
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
org.eclipse.jdt.core.compiler.compliance=1.7
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.source=1.7
