Revision 479 tmp/org.txm.groovy.core/src/groovy/filters/TagSentences/TagSentences.groovy

TagSentences.groovy (revision 479)
131 131
		// System.out.println("end sentences");
132 132
	}
133 133
	
134
	def reg_empty_test = /\S/
135
	
134 136
	/**
135 137
	 * Test before after.
136 138
	 *
137 139
	 * @return true, if successful
138 140
	 */
139 141
	boolean testBeforeAfter() {
140
		if (_before ==~ /\S/ || _after ==~ /\S/) {
142
		if (_before ==~ reg_empty_test || _after ==~ reg_empty_test) {
141 143
			print "TagSentences : ERROR: $line";
142 144
			return false;
143 145
		}
......
172 174
	/** The MAXPRINT. */
173 175
	int MAXPRINT = 0
174 176
	
177
	// Regex sources hoisted out of filter() so each pattern is written once.
	// NOTE(review): several patterns interpolate $strongPunct, $corr_tags_no_seg,
	// $div_tags and $q_tags; these are presumably evaluated when the field
	// initializer runs, so those properties must already hold their final values
	// at that point -- TODO confirm against the class constructor.
	// Line made only of an XML comment (optionally surrounded by whitespace).
	def reg_comment = /\A\s*<!--.*-->\s*\Z/
178
	// Line containing an opening <s> tag; groups: text before, attributes, text after.
	def reg_out_of_sentence = /^(.*)<s( [^>]*)?>(.*)$/
179
	// Any opening <s> tag; filter() replaces each match with <s n="$scounter">.
	def reg_sentence_with_no_n_attribute = /<s( [^>]*)?>/
180
	// Line containing a closing </s> tag; groups: text before, text after.
	def reg_end_of_sentence = /^(.*)<\/s>(.*)$/
181
	// Line containing a <w type="pon" ...>...</w> (punctuation) element.
	def reg_punct = /^(.*)<w type="pon"[^>]*>.*<\/w>(.*)$/
182
	// Line containing a <w> element whose content matches $strongPunct.
	def reg_strong_punct = /^(.*)<w [^>]*>$strongPunct<\/w>(.*)/
183
	// Line containing any <w ...>...</w> element.
	def reg_word = /^(.*)<w .*<\/w>(.*)/
184
	// Line consisting solely of an opening (not self-closed) $corr_tags_no_seg tag.
	def reg_corr_tags_no_seg = /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/
185
	// Attribute-less opening $corr_tags_no_seg tag; used with String.matches (whole line).
	def reg_corr_tags_no_seg_alone = "<($corr_tags_no_seg)>"
186
	// Line containing a closing $corr_tags_no_seg tag; groups: before, tag name, after.
	def reg_corr_tags_no_seg2 = /^(.*)<\/($corr_tags_no_seg)>(.*)$/
187
	// Line containing a closing division or quote tag ($div_tags / $q_tags).
	def reg_block_tags = /^(.*)<\/($div_tags|$q_tags)>(.*)$/
188
	// Attribute-less opening division or quote tag; used with String.matches (whole line).
	def reg_block_tag_alone = "<($div_tags|$q_tags)>"
189
	
175 190
	/* (non-Javadoc)
176 191
	 * @see org.txm.importer.filters.Filter#filter()
177 192
	 */
......
182 197
		// in the var line is the current line
183 198
		//line = line.trim()
184 199
		
185
		if( scounter > MINPRINT && scounter < MAXPRINT)
200
		if (scounter > MINPRINT && scounter < MAXPRINT)
186 201
			println linetype+"LINE : "+line;
187 202
		
188
		if (line ==~ /\A\s*<!--.*-->\s*\Z/) {		
203
		if (line ==~ reg_comment) {		
189 204
			output.print(line);
190 205
		}
191
		else if (linetype ==~ /out/) { 
206
		else if (linetype == "out") { 
192 207
			//on est en dehors d'une phrase
193
			if ( (m = line =~ /^(.*)<s( [^>]*)?>(.*)$/) ) { 
208
			if ( (m = line =~ reg_out_of_sentence) ) { 
194 209
				//on trouve une balise de phrase => on met à jour le numéro, on est dans une phrase
195 210
				affect13(m);
196 211
				if (pending == "yes") {
......
199 214
				}
200 215
				
201 216
				scounter++;
202
				line = (line =~ /<s( [^>]*)?>/).replaceAll("<s n=\""+scounter+"\">");
217
				line = (line =~ reg_sentence_with_no_n_attribute).replaceAll("<s n=\"$scounter\">");
203 218
				output.print(line+"\n");	
204 219
				linetype = "in";
205 220
			}
206
			else if( (m = line =~ /^(.*)<\/s>(.*)$/) ) { ////on trouve une balise de phrase fermante
221
			else if( (m = line =~ reg_end_of_sentence) ) { ////on trouve une balise de phrase fermante
207 222
				//println "found </s> "+line
208 223
				affect12(m);
209 224
				if (pending == "yes") {
......
218 233
					System.err.println("Found </s>, but pending = no and linetype= out : "+line+" ; scount $scounter");
219 234
				}
220 235
			}
221
			else if ( (	m = line =~ /^(.*)<w type="pon"[^>]*>.*<\/w>(.*)$/)  && pending ==~ /yes/) { 
236
			else if ( (	m = line =~ reg_punct)  && pending== "yes") { 
222 237
				// on trouve une ponctuation
223 238
				// et la balise de la phrase précédente n'est pas fermée => on ne fait rien
224 239
							//println "found w type pon"
......
226 241
				output.print(line);
227 242
			}
228 243
			
229
			else if( (m = line =~ /^(.*)<w .*<\/w>(.*)/) ) //on rencontre un mot...
244
			else if( (m = line =~ reg_word) ) //on rencontre un mot...
230 245
			{ 	
231 246
				//println "found <w> open a sentence"
232 247
				affect12(m);
......
236 251
					pending = "no";
237 252
				}
238 253
				scounter++;
239
				output.print("<s n=\""+scounter+"\">\n"+line);
254
				output.print("<s n=\"$scounter\">\n"+line);
240 255
				linetype = "in";
241 256
			}
242
			else if( (m = line =~ /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/) 
243
			|| line.matches("<($corr_tags_no_seg)>"))
257
			else if( (m = line =~ reg_corr_tags_no_seg) 
258
			|| line.matches(reg_corr_tags_no_seg_alone))
244 259
			{ 
245 260
				if(pending == "yes")
246 261
				{
......
260 275
					open_corr_tags.push(tag_name);
261 276

  
262 277
					scounter++;
263
					output.print("<s n=\""+scounter+"\">\n"); //on ouvre une <s>
278
					output.print("<s n=\"$scounter\">\n"); //on ouvre une <s>
264 279
					linetype = "in";
265 280
				}
266 281
				if (scounter > MINPRINT && scounter < MAXPRINT) {
267
					println "stacks "+open_corr_tags+" ; "+open_div_tags
282
					println "stacks $open_corr_tags ; $open_div_tags"
268 283
				}
269 284
				output.print(line);
270
			} else if( ( m = line =~ /^(.*)<\/($corr_tags_no_seg)>(.*)$/) ) 
285
			} else if( ( m = line =~ reg_corr_tags_no_seg2) ) 
271 286
			{ 
272 287
				//on trouve la balise fermante correspondante à la dernière correction ouverte	
273 288
						affect13(m);
......
279 294
					open_div_tags.pop();
280 295
					if(open_div_tags.size() == 0)
281 296
					{
282
						if(pending == "yes")
283
						{
297
						if (pending == "yes") {
284 298
							output.print("</s>\n"+line);// comme une div tag
285 299
							pending = "no";
286
						}
287
						else
288
						{
300
						} else {
289 301
							output.print(line);// comme une div tag
290 302
						}
291
					}
292
					else
293
					{
303
					} else {
294 304
						output.print(line);// comme une div tag
295 305
					}
296
				}
297
				else
298
				{	
306
				} else {	
299 307
					open_corr_tags.pop();
300 308
					
301
					if(pending == "yes")
302
					{
303
						if(open_corr_tags.size() == 0)
304
						{
309
					if (pending == "yes") {
310
						if (open_corr_tags.size() == 0) {
305 311
							output.print("</s>\n");
306 312
							output.print(line);
307 313
							linetype = "out";
308 314
							pending = "no";
309
						}
310
						else
311
						{
315
						} else {
312 316
							output.print(line);
313 317
						}
314
					}
315
					else
316
					{
318
					} else {
317 319
						output.print(line);
318 320
					}
319 321
				}
320
				if( scounter > MINPRINT && scounter < MAXPRINT)
321
				{
322
				
323
				if( scounter > MINPRINT && scounter < MAXPRINT) {
322 324
					println "stacks "+open_corr_tags+" ; "+open_div_tags
323 325
				}
324
			}
325
			else if( (	m = line =~ /^(.*)<\/($div_tags|$q_tags)>(.*)$/) ||
326
			line.matches("<($div_tags|$q_tags)>")) { ////on trouve une balise de citation ou division fermante
326
			} else if( (	m = line =~ reg_block_tags) ||
327
						line.matches(reg_block_tag_alone)) { ////on trouve une balise de citation ou division fermante
327 328
				//println "found closing div|quote tag "+line
328 329
				//affect13(m);
329 330
				
330
				if (pending == "yes") 
331
				{	//println "a sentence was closed"
331
				if (pending == "yes") {	//println "a sentence was closed"
332 332
					output.print("</s>\n$line");
333 333
					pending = "no";
334 334
				} else {
......
342 342
				
343 343
				//pending = "no";
344 344
				//push @s_errors, scounter;
345
				if(pending == "yes")
346
				{
345
				if (pending == "yes") {
347 346
					output.print("</s>\n");
348 347
					pending = "no";
349 348
				}
350 349
				
351 350
				output.print("$line");
352
			}		
353
			else {	//println "ELSE de 'out'"
351
			}	 else {	//println "ELSE de 'out'"
354 352
				output.print(line);
355 353
			}
356 354
		}
357
		else if (linetype ==~ /in/) //on est à l'intérieur d'une phrase
355
		else if (linetype == "in") //on est à l'intérieur d'une phrase
358 356
				{ 	//println "in sentence"
359 357
			
360
				if( ( m = line =~ /^(.*)<w [^>]*>$strongPunct<\/w>(.*)/) ) { ////on trouve une ponctuation forte ==> on est à l'extérieur d'une phrase
358
				if ( ( m = line =~ reg_strong_punct) ) { ////on trouve une ponctuation forte ==> on est à l'extérieur d'une phrase
361 359
						//println "found word .!? "+line
362 360
				affect12(m);
363 361
				
364
				if(open_corr_tags.size() == 0) // il n'y a pas de correction en cours
362
				if (open_corr_tags.size() == 0) // il n'y a pas de correction en cours
365 363
				{
366 364
					linetype = "out";
367 365
					pending ="yes"
......
372 370
					pending = "yes"
373 371
					output.print(line);
374 372
				}
375
			}else if( (m = line =~ /^(.*)<w .*<\/w>(.*)/) || (m = line =~ /^(.*)<w( [^>]*)?>.*<\/w>(.*)/) ) //on rencontre un mot...
373
			} else if( (m = line =~ reg_word) || (m = line =~ /^(.*)<w( [^>]*)?>.*<\/w>(.*)/) ) //on rencontre un mot...
376 374
			{ 	
377 375
				//println "found <w> open a sentence"
378 376
				affect12(m);
379 377
				output.print(line);
380 378
				
381 379
			}
382
			else if( (m = line =~ /^\s*(<($corr_tags_no_seg)( [^>\/]*)?>)\s*$/) 
383
			|| line.matches("<($corr_tags_no_seg)>"))
380
			else if( (m = line =~ reg_corr_tags_no_seg) 
381
			|| line.matches(reg_corr_tags_no_seg_alone))
384 382
			{ 
385 383
				if(scounter > MINPRINT && scounter < MAXPRINT)
386 384
					System.out.println("open corr "+line);
......
399 397
				{ 
400 398
					open_corr_tags.push(tag_name);
401 399
				}
402
				if(scounter > MINPRINT && scounter < MAXPRINT)
403
				{
400
				if (scounter > MINPRINT && scounter < MAXPRINT) {
404 401
					println "stacks "+open_corr_tags+" ; "+open_div_tags
405 402
				}
406 403
				output.print(line);
407 404
			}
408
			else if( ( m = line =~ /^(.*)<\/($corr_tags_no_seg)>(.*)$/) ) 
405
			else if( ( m = line =~ reg_corr_tags_no_seg2) ) 
409 406
			{ 
410 407
				//on trouve la balise fermante correspondante à la dernière correction ouverte	
411
				if(scounter > MINPRINT && scounter < MAXPRINT)
408
				if (scounter > MINPRINT && scounter < MAXPRINT)
412 409
					System.out.println("closing corr "+line);
413 410
				affect13(m);
414 411
				
......
431 428
						pending = "no";
432 429
					}
433 430
				}
434
				if(scounter > MINPRINT && scounter < MAXPRINT)
435
				{
431
				if (scounter > MINPRINT && scounter < MAXPRINT) {
436 432
					println "stacks "+open_corr_tags+" ; "+open_div_tags
437 433
				}
438 434
			}
439
			else if( (m = line =~ /^(.*)<\/s>(.*)$/) ) { ////on trouve une balise de phrase fermante
435
			else if( (m = line =~ reg_end_of_sentence) ) { // a closing </s> tag is found while inside a sentence; must use the </s> pattern, not the opening-<s> one
440 436
				//println "found </s> "+line
441 437
				affect12(m);
442 438
				
......
444 440
				//pending = "no";
445 441
				output.print(line);
446 442
			}
447
			else if( (m = line =~ /^(.*)<\/($div_tags|$q_tags)>(.*)$/) ||
443
			else if( (m = line =~ reg_block_tags) ||
448 444
			line.matches("</($div_tags|$q_tags)>")) { ////on trouve une balise de division ou de citation fermante ==> on ferme une </s>
449 445
				//println "found div or quote closing tag "+line
450 446
				//affect13(m);
......
467 463
				
468 464
			}		
469 465
			else {
470
				if(scounter > MINPRINT && scounter < MAXPRINT)
466
				if (scounter > MINPRINT && scounter < MAXPRINT)
471 467
					println "ELSE "+line
468
				
472 469
				output.print(line);
473 470
			}
474 471
		}

Also available in: Unified diff