Revision 2390

tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/TokenizerClasses.java (revision 2390)
30 30
import java.io.File;
31 31
import java.io.FileWriter;
32 32
import java.io.IOException;
33
import java.lang.reflect.Field;
34
import java.lang.reflect.Modifier;
33 35
import java.util.ArrayList;
34 36
import java.util.List;
35 37
import java.util.prefs.Preferences;
......
49 51
import javax.xml.stream.*;
50 52
import java.net.URL;
51 53

  
52
// TODO: Auto-generated Javadoc
53 54
/**
54
 * The Class TokenizerClasses.
55
 * Holder for the regular expressions used by the tokenizer rules.
55 56
 */
56 57
public class TokenizerClasses {
57 58

  
58 59
	//// ROOTS ? ////
59 60
	public static boolean debug = false;
60 61
	/** The tag_all. */
61
	public static String tag_all = "<[A-Za-z][^>]+>";
62
	public static String tag_all = null;
62 63

  
63 64
	/** The Weblex enclitics. */
64
	public static String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
65
	public static String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
65
	public static String enclitics = null;
66
	public static String encliticsFR = null;
66 67

  
67 68
	/** The TT enclitics. */
68
	public static String FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
69
	public static String PClitic_fr = "[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
70
	public static String FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
71
	public static String PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
72
	public static String FClitic_gl = "-la|-las|-lo|-los|-nos";
69
	public static String FClitic_en = null;
70
	public static String PClitic_fr = null;
71
	public static String FClitic_fr = null;
72
	public static String PClitic_it = null;
73
	public static String FClitic_gl = null;
73 74

  
74 75
	/** The div_tags. */
75
	public static String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
76
	public static String div_tags = null;
76 77

  
77 78
	/** The q_tags. */
78
	public static String q_tags = "q|quote|said|item|stage|cit|label|heraldry";
79
	public static String q_tags = null;
79 80

  
80 81
	/** The extraword_tags. */
81
	public static String extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
82
	public static String extraword1_tags = null;
82 83

  
83 84
	/** The corr_tags_no_seg. */
84
	public static String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
85
	public static String corr_tags_no_seg = null;
85 86

  
86 87
	/** The word_tags. */
87
	public static String word_tags = "w";
88
	public static String word_tags = null;
88 89
	/** The element to create when a word is created. */
89
	public static String word_element_to_create = "w";
90
	public static String word_element_to_create = null;
90 91

  
91 92
	/** The intraword_tags. */
92
	public static String intraword_tags = "c|ex|caesura";
93
	public static String intraword_tags = null;
93 94

  
94 95
	/** The punct_quotes. */
95
	public static String punct_quotes = "'‘’’";
96
	public static String punct_quotes = null;
96 97

  
97 98
	/** The punct_strong1. */
98
	public static String punct_strong1 = ".!?";
99
	public static String punct_strong1 = null;
99 100

  
100 101
	/** The punct_strong2. */
101
	public static String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
102
	public static String punct_strong2 = null;
102 103

  
103 104
	/** The punct_paren_open1. */
104
	public static String punct_paren_open1 = "«";
105
	public static String punct_paren_open1 = null;
105 106

  
106 107
	/** The punct_paren_open2. */
107
	public static String punct_paren_open2 = "``|\\(|\\[|\\{";
108
	public static String punct_paren_open2 = null;
108 109

  
109 110
	/** The punct_paren_close1. */
110
	public static String punct_paren_close1 = "»";
111
	public static String punct_paren_close1 = null;
111 112

  
112 113
	/** The punct_paren_close2. */
113
	public static String punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
114
	public static String punct_paren_close2 = null;
114 115

  
115 116
	/** The punct_weak. */
116
	public static String punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
117
	public static String punct_weak = null;
117 118

  
118
	public static String entity = "&[^;]+;";
119
	public static String entity = null;
119 120

  
120
	public static String seg_tags = "seg";
121
	public static String seg_tags = null;
121 122

  
122 123
	//// COMBINATIONS ////
123 124

  
124 125
	/** The corr_tags. */
125
	public static String corr_tags = ""+corr_tags_no_seg+"|"+seg_tags+"";
126
	public static String corr_tags = null;
126 127
	/** The extraword_tags. */
127
	public static String extraword_tags = ""+div_tags+"|"+q_tags+"|"+extraword1_tags+"";
128
	public static String extraword_tags = null;
128 129

  
129 130
	/** The punct_strong. */
130
	public static String punct_strong = "["+punct_strong1+"]+|"+punct_strong2+"";
131
	public static String punct_strong = null;
131 132

  
132 133
	/** The punct_paren_open. */
133
	public static String punct_paren_open = ""+punct_paren_open1+"|"+punct_paren_open2+"";
134
	public static String punct_paren_open = null;
134 135

  
135 136
	/** The punct_paren_close. */
136
	public static String punct_paren_close = ""+punct_paren_close1+"|"+punct_paren_close2+"";
137
	public static String punct_paren_close = null;
137 138

  
138 139
	/** The punct_paren. */
139
	public static String punct_paren = ""+punct_paren_open+"|"+punct_paren_close+"";
140
	public static String punct_paren = null;
140 141

  
141 142
	/** The punct_all. */
142
	public static String punct_all = ""+punct_strong+"|"+punct_paren+"|["+punct_weak+"]+";
143
	public static String punct_all = null;
143 144

  
144 145
	/** The word_chars. */
145
	public static String word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]+|"+entity+"";
146
	public static String word_chars = null;
146 147

  
147 148
	public static TTest[] tests = {
148 149
			//		new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
......
168 169
			//		new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
169 170
	};
170 171

  
171
	//tag_all = "<[^>]+>";
172
	//enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
173
	//encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
174
	//div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
175
	//q_tags = "q|quote|item|stage";
176
	//extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
177
	//corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
178
	//corr_tags = "$corr_tags_no_seg|seg";
179
	//word_tags = "w|abbr|num";
180
	//intraword_tags = "c|ex";
172
	public static String whitespaces = null;
173
	public static String regElision = null;
174
	public static String regPunct = null;
175
	
176
	static {
177
		reset(); // Initialize static fields
178
	}
181 179

  
182
	public static String whitespaces = "[\\p{Z}\\p{C}]+";
183
	public static String regElision = "['‘’]"; // obsolete
184
	public static String regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
185

  
186 180
	//punct_strong1 = ".!?";
187 181
	//punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
188 182
	//punct_strong = "[$punct_strong1]|$punct_strong2";
......
198 192
	//word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
199 193

  
200 194
	public static void reset() {
201
		tag_all = "<[^>]+>";
202
		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
203
		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
204
		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
205
		q_tags = "q|quote|item|stage|cit";
195
		tag_all = "<[A-Za-z][^>]+>";
196
		
197
		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
198
		q_tags = "q|quote|said|item|stage|cit|label|heraldry";
206 199
		extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
207 200
		corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
208 201
		word_tags = "w";
202
		word_element_to_create = "w";
209 203
		intraword_tags = "c|ex|caesura";
210 204
		punct_quotes = "'‘’’";
211 205
		punct_strong1 = ".!?";
......
214 208
		punct_paren_open2 = "``|\\(|\\[|\\{";
215 209
		punct_paren_close1 = "»";
216 210
		punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
217
		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\-≤≥<>\\—ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
211
		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
218 212
		entity = "&[^;]+;";
219 213
		seg_tags = "seg";
214
		
215
		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
216
		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
217
		
218
		/** The TT enclitics. */
219
		FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
220
		PClitic_fr = "[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
221
		FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
222
		PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
223
		FClitic_gl = "-la|-las|-lo|-los|-nos";
220 224

  
221 225
		whitespaces = "[\\p{Z}\\p{C}]+";
222 226
		regElision = "['‘’]";
......
233 237
		punct_paren_close = ""+punct_paren_close1+"|"+punct_paren_close2+"";
234 238
		punct_paren = ""+punct_paren_open+"|"+punct_paren_close+"";
235 239
		punct_all = ""+punct_strong+"|"+punct_paren+"|["+punct_weak+"]";
236
		word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]|"+entity+"";
240
		word_chars = "[^ "+punct_quotes+""+punct_strong1+""+punct_paren_open1+""+punct_paren_close1+""+punct_weak+"]+|"+entity+"";
237 241
	}
238 242

  
239 243
	/**
......
243 247
	 * @param parent the parent
244 248
	 * @return the java.lang. object
245 249
	 */
250
	@Deprecated
246 251
	public static void toDom(Document doc, Node parent) {
247 252
		Element tokenizer = doc.createElement("tokenizer");
248 253
		tokenizer.setAttribute("onlyThoseTests", "false");
249 254
		parent.appendChild(tokenizer);
250

  
255
		
251 256
		//String tag_all = "<[^>]+>";
252 257
		Element p = doc.createElement("param");
253 258
		p.setAttribute("key","tag_all");
......
526 531
		return true;
527 532
	}
528 533

  
534
	/**
535
	 * Old way of loading the tokenizer parameters: used by TXM 0.7.9 and when restoring a 0.7.9 corpus in TXM 0.8.0.
536
	 * @param tokenizerElement
537
	 * @return
538
	 */
539
	@Deprecated
529 540
	public static boolean loadFromNode(Element tokenizerElement) {
530 541
		//load params
531 542
		String tmp_strong_punct = null;
......
631 642
	 */
632 643
	public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
633 644

  
634
		Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
635
		Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
636
		Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
637
		Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
638
		System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
645
//		Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
646
//		Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
647
//		Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
648
//		Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
649
//		System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
639 650
	}
640 651

  
641 652
	public static boolean loadFromProject(ProjectScope projectScope) {
642 653
		//load params
643
		String tmp_strong_punct = null;
644 654
		IEclipsePreferences params = projectScope.getNode("Tokenizer");
655
		
645 656
		try {
646
			String[] keys = params.keys();
647
			for (String key : keys) {
648
				//					Element param = (Element) params.item(i);
649
				//					String key = param.getAttribute("name");
650
				String value = params.get(key, "");
651
				//					if (value == null || value.length() == 0) value = param.getTextContent();
652
				if (value.length() == 0) value = null;
653

  
654
				if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
655

  
656
				if (key.equals("tag_all"))
657
					tag_all = value;
658
				else if (key.equals("enclitics"))
659
					enclitics = value;
660
				else if (key.equals("encliticsFR"))
661
					encliticsFR = value;
662
				else if (key.equals("div_tags"))
663
					div_tags = value;
664
				else if (key.equals("q_tags"))
665
					q_tags = value;
666
				else if (key.equals("extraword1_tags"))
667
					extraword1_tags = value;
668
				else if (key.equals("corr_tags_no_seg"))
669
					corr_tags_no_seg = value;
670
				else if (key.equals("word_tags")) {
671
					word_tags = value;
672
					word_element_to_create = value; // FIXME for now
673
				} else if (key.equals("intraword_tags"))
674
					intraword_tags = value;
675
				else if (key.equals("punct_quotes"))
676
					punct_quotes = value;
677
				else if (key.equals("punct_strong1"))
678
					punct_strong1 = value;
679
				else if (key.equals("punct_strong2"))
680
					punct_strong2 = value;
681
				else if (key.equals("punct_paren_open1"))
682
					punct_paren_open1 = value;
683
				else if (key.equals("punct_paren_open2"))
684
					punct_paren_open2 = value;
685
				else if (key.equals("punct_paren_close1"))
686
					punct_paren_close1 = value;
687
				else if (key.equals("punct_paren_close2"))
688
					punct_paren_close2 = value;
689
				else if (key.equals("punct_weak"))
690
					punct_weak = value;
691
				else if (key.equals("entity"))
692
					entity = value;
693
				else if (key.equals("seg_tags"))
694
					seg_tags = value;
695
				else if (key.equals("regPunct"))
696
					regPunct = value;
697
				else if (key.equals("regElision"))
698
					regElision = value;
699
				else if (key.equals("whitespaces"))
700
					whitespaces = value;
701
				else if (key.equals("punct_strong")) // this is temporary
702
					tmp_strong_punct = value; // this is temporary
703
//				else
704
//					System.out.println("MISSING TOKENIZER KEY: "+key);
657
			
658
			Field[] fields = TokenizerClasses.class.getFields();
659
			for (Field field : fields) {
660
				int m = field.getModifiers();
661
				if (Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) {
662
					String name = field.getName();
663
					String value = params.get(name, null);
664
					if (value != null) {
665
						try {
666
							if (debug) System.out.println(" Tokenizer parametrized with "+name+"="+value);
667
							field.set(field, value);
668
						}
669
						catch (IllegalArgumentException e) {
670
							// TODO Auto-generated catch block
671
							e.printStackTrace();
672
						}
673
						catch (IllegalAccessException e) {
674
							// TODO Auto-generated catch block
675
							e.printStackTrace();
676
						}
677
					}
678
				}
705 679
			}
680
//			// old way (explicit key-by-key assignment), now replaced by the reflection loop above
681
			
682
//			String[] keys = params.keys();
683
//			for (String key : keys) {
684
//				//					Element param = (Element) params.item(i);
685
//				//					String key = param.getAttribute("name");
686
//				String value = params.get(key, "");
687
//				//					if (value == null || value.length() == 0) value = param.getTextContent();
688
//				if (value.length() == 0) value = null;
689
//
690
//				if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
691
//
692
//				if (key.equals("tag_all"))
693
//					tag_all = value;
694
//				else if (key.equals("enclitics"))
695
//					enclitics = value;
696
//				else if (key.equals("encliticsFR"))
697
//					encliticsFR = value;
698
//				else if (key.equals("div_tags"))
699
//					div_tags = value;
700
//				else if (key.equals("q_tags"))
701
//					q_tags = value;
702
//				else if (key.equals("extraword1_tags"))
703
//					extraword1_tags = value;
704
//				else if (key.equals("corr_tags_no_seg"))
705
//					corr_tags_no_seg = value;
706
//				else if (key.equals("word_tags")) {
707
//					word_tags = value;
708
//					word_element_to_create = value; // FIXME for now
709
//				} else if (key.equals("intraword_tags"))
710
//					intraword_tags = value;
711
//				else if (key.equals("punct_quotes"))
712
//					punct_quotes = value;
713
//				else if (key.equals("punct_strong1"))
714
//					punct_strong1 = value;
715
//				else if (key.equals("punct_strong2"))
716
//					punct_strong2 = value;
717
//				else if (key.equals("punct_paren_open1"))
718
//					punct_paren_open1 = value;
719
//				else if (key.equals("punct_paren_open2"))
720
//					punct_paren_open2 = value;
721
//				else if (key.equals("punct_paren_close1"))
722
//					punct_paren_close1 = value;
723
//				else if (key.equals("punct_paren_close2"))
724
//					punct_paren_close2 = value;
725
//				else if (key.equals("punct_weak"))
726
//					punct_weak = value;
727
//				else if (key.equals("entity"))
728
//					entity = value;
729
//				else if (key.equals("seg_tags"))
730
//					seg_tags = value;
731
//				else if (key.equals("regPunct"))
732
//					regPunct = value;
733
//				else if (key.equals("regElision"))
734
//					regElision = value;
735
//				else if (key.equals("whitespaces"))
736
//					whitespaces = value;
737
//				else if (key.equals("punct_strong")) // this is temporary
738
//					tmp_strong_punct = value; // this is temporary
739
////				else
740
////					System.out.println("MISSING TOKENIZER KEY: "+key);
741
//			}
706 742

  
707 743
			//recombine
708 744
			recombine();
709
			if (tmp_strong_punct != null) punct_strong = tmp_strong_punct;  // this is temporary
710 745

  
711 746
			if (params.getBoolean("onlyThoseTests", false)) {
712 747
				System.out.println("Warning: tokenizer only using import parameters tests");
713 748
				//tests = new ArrayList<>();
714 749
			}
715 750

  
716
			org.osgi.service.prefs.Preferences testsList = params.node("test");
751
			org.osgi.service.prefs.Preferences testsList = params.node("tests");
717 752
			String[] tests;
718 753
			tests = testsList.childrenNames();
719 754
			//System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
......
733 768
			e.printStackTrace();
734 769
		}
735 770

  
736
		if (isValid())
771
		if (isValid()) {
737 772
			return true;
738
		else {
773
		} else {
739 774
			reset();
740 775
			recombine();
741 776
			return false;

Also available in: Unified diff