Revision 3268

tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/ChunkTokenizerXml.groovy (revision 3268)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

  
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.tokenizer;
45

  
46
import static groovy.transform.TypeCheckingMode.SKIP
47
import groovy.transform.CompileStatic
48

  
49
import java.util.Map.Entry
50
import java.util.regex.Matcher
51
import java.util.regex.Pattern
52

  
53
import javax.xml.stream.*
54

  
55
import org.txm.importer.PersonalNamespaceContext
56
import org.txm.tokenizer.StringTokenizer
57
import org.txm.tokenizer.SimpleStringTokenizer
58
import org.txm.tokenizer.TokenizerClasses
59
import org.txm.xml.DOMIdentityHook
60
import org.txm.xml.IdentityHook
61
import org.txm.xml.XMLParser
62
import org.txm.xml.XMLProcessor
63
import org.txm.xml.XPathHookActivator
64
import org.w3c.dom.Node
65

  
66
@CompileStatic
67
public class ChunkTokenizerXml extends XMLProcessor {
68
	
69
	XPathHookActivator activator;
70
	DOMIdentityHook hook;
71
	
72
	StringTokenizer stringTokenizer;
73
	boolean retokenize = false
74
	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
75
	
76
	/** The word_tags. */
77
	String word_tags;
78
	String word_element_to_create
79
	Pattern reg_word_tags;
80
	/** The intraword_tags. */
81
	String intraword_tags
82
	/** The word_chars. */
83
	String word_chars
84
	
85
	/** The note_content: tags whose content is copied but not tokenized. */
86
	String note_content = null;
87
	String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
88
	String outside_text_tags = null // tag and content removed
89
	String startTag = null
90
	Pattern reg_note_content;
91
	Pattern reg_outside_text_tags_keep_content;
92
	Pattern reg_outside_text_tags;
93
	Pattern reg_startTag;
94
	
95
	/** The DEBUG. */
96
	public boolean DEBUG = false;
97
	
98
	/** The outfile is the result file. */
99
	File outfile;
100
	
101
	/** The infile. */
102
	File infile;
103
	
104
	String lang;
105
	
106
	/** The buffer. */
107
	StringBuffer buffer;
108
	
109
	/** The writer. */
110
	XMLStreamWriter writer;
111
	BufferedOutputStream output;
112
	
113
	/** The parser. */
114
	XMLStreamReader parser
115
	
116
	/** The localname. */
117
	String localname;
118
	
119
	/** The prefix. */
120
	String prefix;
121
	String filename;
122
	
123
	Pattern regLN;
124
	Pattern regCTRL;
125
	
126
	public ChunkTokenizerXml(File infile) {
127
		this(infile, "");
128
	}
129
	
130
	public ChunkTokenizerXml(File infile, String lang) {
131
		this(infile, new TokenizerClasses(lang));
132
	}
133
	
134
	/**
135
	 * Instantiates a new ChunkTokenizerXml.
136
	 *
137
	 * @param infile the infile
138
	 * @param tc the TokenizerClasses configuration (word tags, word element to create, etc.)
139
	 */
140
	public ChunkTokenizerXml(File infile, TokenizerClasses tc) {
141
		super(infile)
142
		this.lang = tc.lang;
143
		this.stringTokenizer = new SimpleStringTokenizer(lang);
144
		
145
		word_tags = tc.word_tags;
146
		word_element_to_create = tc.word_element_to_create;
147
		reg_word_tags = Pattern.compile(word_tags);
148
		
149
		intraword_tags = tc.intraword_tags;
150
		word_chars = tc.word_chars;
151
		
152
		this.outfile = outfile;
153
		this.infile = infile;
154
		this.filename = infile.getName();
155
		int index = filename.lastIndexOf(".");
156
		if (index > 0) filename = filename.substring(0, index);
157
		
158
		regLN = Pattern.compile("\n"); // match bare newlines (was "/\n/", which required surrounding slashes)
159
		regCTRL = Pattern.compile("\\p{C}"); // match control characters (was "/\\p{C}/", which required surrounding slashes)
160
		
161
		activator = new XPathHookActivator<>(hook, "//div|//p|//ab"); // match div, p and ab elements at any depth
162
		
163
		hook = new DOMIdentityHook("in_text_hook", activator, this) {
164
			
165
			String id;
166
			
167
			boolean inAna = false;
168
			
169
			boolean inForm = false;
170
			
171
			boolean inW = false;
172
			
173
			ArrayList<String[]> anaValues = new ArrayList<>();
174
			
175
			ArrayList<String[]> formValues = new ArrayList<>();
176
			
177
			StringBuilder value = new StringBuilder();
178
			
179
			String resp = "";
180
			
181
			String type = "";
182
			
183
			/**
184
			 * extends this method to process the DOM before it is written
185
			 */
186
			public void processDom() {
187
				
188
				ArrayList<Node> textNodes = getTextNodes(dom);
189
				if (textNodes.size() == 0) return; // easy
190
				
191
				StringBuilder buffer = new StringBuilder(); // build a string to tokenize
192
				for (Node textNode : textNodes) {
193
					buffer.append(" "+textNode.getTextContent());
194
				}
195
				
196
				int nNode = 0;
197
				Node currentTextNode = textNodes.get(0);
198
				String currentText = currentTextNode.getTextContent();
199
				int curentTextIndex = 0;
200
				StringBuilder currentNewText = new StringBuilder()
201
				ArrayList<String> currentWords = new ArrayList<String>()
202
				List<List<String>> sentences = stringTokenizer.processText(buffer.toString());
203
				//println "text="+buffer.toString()
204
				println "sentences=$sentences"
205
				for (List<String> sent : sentences) {
206
					if (nNode >= textNodes.size()) { // all nodes are updated
207
						break;
208
					}
209
					
210
					for (String word : sent) {
211
						if (nNode >= textNodes.size()) { // all nodes are updated
212
							break;
213
						}
214
						
215
						int idx = currentText.indexOf(word, curentTextIndex);
216
						if (idx >= 0) {
217
							curentTextIndex = idx + word.length();
218
						} else {
219
							println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords"
220
							currentTextNode.setTextContent("");
221
							for (String w : currentWords) {
222
								Node newChild = dom.getOwnerDocument().createElementNS(null, "w");
223
								newChild.setAttribute("id", "W_ID")
224
								newChild.setTextContent(w);
225
								
226
								currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)
227
							}
228
							currentTextNode.getParentNode().removeChild(currentTextNode)
229
							
230
							currentNewText = new StringBuilder()
231
							currentWords.clear();
232
							curentTextIndex = 0;
233
							nNode++;
234
							if (nNode < textNodes.size()) {
235
								currentTextNode = textNodes.get(nNode);
236
								currentText = currentTextNode.getTextContent();
237
							}
238
							
239
						}
240
						
241
						currentWords.add(word)
242
					}
243
				}
244
			}
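			
			// Editor's sketch (not part of the original source, hedged): processDom() above walks the
			// tokenizer output and the DOM text nodes in parallel. Words are collected in currentWords
			// while they can still be located in the current text node; as soon as a word cannot be
			// found (the "HOUSTON" branch), the collected words are materialized as <w> elements (with
			// the placeholder id "W_ID"), the exhausted text node is removed, and matching restarts on
			// the next text node. For a single text node "Il pleut.", processText() would typically
			// return [["Il", "pleut", "."]] and the node would end up replaced by three <w> children.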
245
			
246
			public ArrayList<Node> getTextNodes(Node element) {
247
				def children = element.getChildNodes()
248
				ArrayList<Node> texts = new ArrayList<Node>()
249
				for (int i = 0 ; i < children.getLength() ; i++) {
250
					def node = children.item(i);
251
					if (node.getNodeType() == Node.TEXT_NODE) {
252
						texts.add(node)
253
					} else if (node.getNodeType() == Node.ELEMENT_NODE) {
254
						if (node.getLocalName().equals("w")) {
255
							texts.add(node)
256
						} else {
257
							texts.addAll(getTextNodes(node));
258
						}
259
					}
260
				}
261
				return texts;
262
			}
263
		};
264
	}
265
	
266
	/**
267
	 * Replace the default SimpleStringTokenizer with another
268
	 * @param stringTokenizer a StringTokenizer
269
	 */
270
	public void setStringTokenizer(StringTokenizer stringTokenizer) {
271
		if (stringTokenizer == null) return;
272
		this.stringTokenizer = stringTokenizer;
273
	}
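	
	// Editor's usage note (hedged): the only StringTokenizer members this class relies on are
	// processText(String), returning a List<List<String>> of sentences of word forms, and
	// doSentences() (see processWord() below). Assuming that contract, any implementation can be
	// plugged in before processing, e.g.:
	//   ChunkTokenizerXml t = new ChunkTokenizerXml(inFile, "en")
	//   t.setStringTokenizer(new SimpleStringTokenizer("en")) // or a custom StringTokenizer
	//   t.process(outFile)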
274
	
275
	/**
276
	 * Fill infos.
277
	 *
278
	 * @param event the event
279
	 * @return nothing meaningful (implicit Groovy return)
280
	 */
281
	public fillInfos(int event) {
282
		if (event == XMLStreamConstants.START_ELEMENT || event == XMLStreamConstants.END_ELEMENT) {
283
			localname = parser.getLocalName();
284
			prefix = parser.getPrefix();
285
		}
286
	}
287
	
288
	/**
289
	 * Do nothing: just write what is read.
290
	 * 
291
	 * special processing for words: prefix id attributes with "w_" and remove special characters like \n \t etc.
292
	 *
293
	 * @param event the event
294
	 * @param wordid the wordid
295
	 * @return nothing meaningful (implicit Groovy return)
296
	 */
297
	public donothing(int event, Integer wordid) {
298
		
299
		if (event == XMLStreamConstants.START_ELEMENT ) {
300
			
301
			localname = parser.getLocalName();
302
			if (wordid != null) {
303
				localname = word_element_to_create;
304
			}
305
			
306
			if (prefix != null && prefix.length() > 0) {
307
				writer.writeStartElement(prefix+":"+localname);
308
			} else {
309
				//				if(namespace != null)
310
				//					writer.writeStartElement(namespace, localname);
311
				//				else
312
				writer.writeStartElement(localname);
313
			}
314
			//			if(parser.getNamespaceCount() > 0)
315
			//				writer.writeDefaultNamespace(parser.getNamespaceURI(0))
316
			//			for(int i = 1 ; i < parser.getNamespaceCount() ; i++)
317
			//				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
318
			
319
			String namespace_prefix;
320
			for (int i = 0 ; i<   parser.getNamespaceCount() ; i++) {
321
				namespace_prefix = parser.getNamespacePrefix(i);
322
				if ((namespace_prefix != null)&&   (namespace_prefix.length()>   0)) {
323
					writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
324
				} else {
325
					writer.writeDefaultNamespace(parser.getNamespaceURI(i));
326
				}
327
			}
328
			
329
			String attrprefix, attname;
330
			boolean hasId = false;
331
			//boolean hasType = false
332
			boolean hasN = false
333
			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
334
				attname = parser.getAttributeLocalName(i);
335
				attrprefix = parser.getAttributePrefix(i);
336
				if ("id".equals(attname)) hasId = true;
337
				//if ("type".equals(attname)) hasType = true;
338
				if ("n".equals(attname)) hasN = true;
339
				
340
				if (attrprefix != null && attrprefix.length() > 0) {
341
					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
342
				} else {
343
					writer.writeAttribute(attname, parser.getAttributeValue(i))
344
				}
345
			}
346
			
347
			if (wordid != null && !hasId && localname == word_element_to_create) {
348
				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
349
			}
350
			if (!hasN && localname == word_element_to_create) {
351
				writer.writeAttribute("n", ""+wordcount);
352
			}
353
			if (!reg_word_tags.matcher(localname).matches()) {
354
				writer.writeCharacters("\n");
355
			}
356
		}
357
		else if(event == XMLStreamConstants.END_ELEMENT)
358
		{
359
			writer.writeEndElement();
360
			writer.writeCharacters("\n");
361
		}
362
		else if(event == XMLStreamConstants.CHARACTERS)
363
		{
364
			//println parser.getText();
365
			//writer.writeCharacters("𦟛");
366
			
367
			
368
			// checks if the token starts with a high surrogate
369
			//			if(isHighSurrogate(parser.getText().charAt(0)))	{
370
			//				println "warning: invalid UTF-8 XML range, token " + parser.getText() + " has been replaced.";
371
			//				writer.writeCharacters("__invalidXMLChar__")
372
			//				//writer.writeCharacters("𦟛");
373
			//				println "high surrogate: " + Integer.toHexString((int)parser.getText().charAt(0));
374
			//				println "low surrogate: " + Integer.toHexString((int)parser.getText().charAt(1));
375
			//				int charSum = parser.getText().charAt(0) + parser.getText().charAt(1);
376
			//				println "char sum: " + charSum;
377
			//				println "test " + surrogatesPairToScalar(parser.getText().charAt(0), parser.getText().charAt(1));
378
			//				int scalar = surrogatesPairToScalar(parser.getText().charAt(0), parser.getText().charAt(1));
379
			//				//writer.writeCharacters(String.valueOf((char)112692));
380
			//				writer.writeCharacters("&#" + scalar + ";");
381
			//			}
382
			//			else
383
			
384
			
385
			if (insideword) { // ensure there is not \t or \n in the word form value
386
				writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
387
			} else {
388
				writer.writeCharacters(parser.getText());
389
			}
390
		}
391
	}
392
	
393
	
394
	/**
395
	 * Converts the specified surrogates pair to scalar.
396
	 * @param highSurrogate
397
	 * @param lowSurrogate
398
	 * @return
399
	 */
400
	public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)	{
401
		return ((highSurrogate - 0xD800) * 0x400) + (lowSurrogate - 0xDC00) + 0x10000;
402
	}
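	
	// Worked example (editor's addition): U+10000 is encoded as the pair (0xD800, 0xDC00), and
	// ((0xD800 - 0xD800) * 0x400) + (0xDC00 - 0xDC00) + 0x10000 == 0x10000. The standard library
	// offers the same conversion as Character.toCodePoint(highSurrogate, lowSurrogate).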
403
	
404
	/**
405
	 * Checks if the specified character is a high/leading surrogate.
406
	 * @param character
407
	 * @return
408
	 */
409
	public boolean isHighSurrogate(char character)	{
410
		return (character >= 0xD800 && character <= 0xDBFF);
411
	}
412
	
413
	
414
	
415
	/** The wordcount. */
416
	int wordcount = 0;
417
	
418
	/** The ignorecontent. */
419
	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
420
	boolean insideword = false;
421
	/**
422
	 * Process.
423
	 *
424
	 * @return true, if successful
425
	 */
426
	public boolean processDOMElement() {
427
		if (!infile.exists()) {
428
			println "$infile does not exist"
429
			return false;
430
		}
431
		XMLOutputFactory factory = XMLOutputFactory.newInstance();
432
		output = new BufferedOutputStream(new FileOutputStream(outfile))
433
		writer = factory.createXMLStreamWriter(output, "UTF-8")
434
		writer.setNamespaceContext(new PersonalNamespaceContext());
435
		
436
		def inputData = infile.toURI().toURL().openStream();
437
		def inputfactory = XMLInputFactory.newInstance();
438
		//inputfactory.setProperty("http://apache.org/xml/properties/input-buffer-size", new Integer(2048));
439
		//inputfactory.setExpandEntityReferences(false);
440
		XMLInputFactory.newInstance();
441
		parser = inputfactory.createXMLStreamReader(inputData);
442
		//println "PARSER: "+parser.getClass()
443
		writer.writeStartDocument("UTF-8","1.0");
444
		writer.writeCharacters("\n");
445
		
446
		int previousEvent = 0;
447
		boolean startProcess = false;
448
		if (startTag == null) // if no startTag specified we process from the start
449
			startProcess = true;
450
		ignorecontent = !startProcess;
451
		
452
		buffer = new StringBuffer();
453
		//println "process - start start tag: "+startTag+" startProcess: $startProcess"
454
		//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
455
		//		println "reg_outside_text_tags=$reg_outside_text_tags"
456
		//		println "reg_note_content=$reg_note_content"
457
		try {
458
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
459
				if (!startProcess) {
460
					if (event == XMLStreamConstants.START_ELEMENT) {
461
						if (reg_startTag.matcher(parser.getLocalName()).matches()) {
462
							startProcess = true
463
							ignorecontent = false;
464
						}
465
					}
466
					if (!startProcess) {
467
						donothing(event, null);
468
						continue;
469
					}
470
				}
471
				
472
				if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
473
					processWord(); // tokenize now!
474
					buffer.setLength(0);
475
				}
476
				fillInfos(event);//get localname and prefix
477
				if (event == XMLStreamConstants.START_ELEMENT) {
478
					//println "Open: "+localname;
479
					localname = parser.getLocalName()
480
					if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
481
						//println "Found pretagged word";
482
						
483
						if (retokenize) {
484
							retokenizedWordProperties.clear()
485
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
486
								retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
487
							}
488
						} else {
489
							wordcount++;
490
							donothing(event, wordcount);
491
							//ignorecontent = true;
492
							insideword = true;
493
						}
494
					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
495
						// ignore the tag only
496
						donothing(event, null); // write the tag
497
						//println "IGNORING NOTE CONTENT OF "+localname
498
						ignorecontent = true;
499
					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
500
						// 	ignore the content only
501
						donothing(event, null); // write the tag
502
						//println "IGNORING CONTENT OF "+localname
503
						ignorecontent = true;
504
					} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) { // ignore the tag and its content
505
						goToEndOfElement(localname); // parse until the end of the element is passed
506
					} else {
507
						donothing(event, null);
508
					}
509
				} else if(event == XMLStreamConstants.END_ELEMENT) {
510
					//println "Close: "+localname;
511
					localname = parser.getLocalName()
512
					if (reg_word_tags.matcher(localname).matches()) {
513
						if (retokenize) {
514
							retokenizedWordProperties.clear()
515
						} else {
516
							//ignorecontent = false;
517
							insideword = false;
518
							writer.writeEndElement();
519
							writer.writeCharacters("\n");
520
						}
521
					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
522
						ignorecontent = false;
523
						donothing(event, null);
524
					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
525
						ignorecontent = false;
526
						donothing(event, null);
527
					} else {
528
						donothing(event, null);
529
					}
530
				} else if (event == XMLStreamConstants.CHARACTERS) {
531
					if (ignorecontent || insideword) {
532
						//println " dont tokenize chars: "+parser.getText().trim();
533
						donothing(event, null);
534
					} else {
535
						//println " process chars: "+parser.getText().trim();
536
						buffer.append(parser.getText());
537
						if (buffer.length() >= 12800 && buffer.charAt(buffer.length()-1) == " ") {
538
							processWord();
539
							buffer = new StringBuffer();
540
						}
541
					}
542
				} else if (event == XMLStreamConstants.COMMENT) {
543
					writer.writeComment(parser.getText())
544
				} else if (event == XMLStreamConstants.DTD) {
545
					//println "DTD!";
546
				} else {
547
					if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
548
				}
549
				previousEvent = event;
550
			}
551
			
552
			parser.close()
553
			writer.close();
554
			output.close();
555
			inputData.close();
556
		} catch (Exception e) {
557
			System.err.println("Error : "+infile);
558
			e.printStackTrace();
559
			if (writer != null) writer.close();
560
			if (output != null) output.close();
561
			if (parser != null) parser.close();
562
			if (inputData != null) inputData.close();
563
			return false;
564
		}
565
		return true;
566
	}
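	
	// Editor's summary (hedged): processDOMElement() streams the input with StAX and writes a copy
	// in which plain text is tokenized. CHARACTERS events accumulate in 'buffer'; the buffer is
	// flushed through processWord() when the event type changes, or early when it exceeds 12800
	// characters and ends with a space. Pre-tagged words (reg_word_tags) are copied, adding id/n
	// attributes when missing, unless 'retokenize' is set; note-like content (reg_note_content,
	// reg_outside_text_tags_keep_content) is copied untokenized; elements matching
	// reg_outside_text_tags are skipped entirely via goToEndOfElement().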
567
	
568
	public void setRetokenize(boolean retokenize) {
569
		this.retokenize = retokenize
570
	}
571
	
572
	/**
573
	 * Set the element and content to ignore
574
	 * 
575
	 * @param regexp
576
	 */
577
	public void setOutSideTextTags(String regexp) {
578
		this.outside_text_tags = regexp;
579
		this.reg_outside_text_tags = Pattern.compile(outside_text_tags);
580
	}
581
	
582
	/**
583
	 * Set element content to NOT tokenize
584
	 *
585
	 * @param regexp
586
	 */
587
	public void setNote(String regexp) {
588
		this.note_content = regexp;
589
		this.reg_note_content = Pattern.compile(note_content);
590
	}
591
	
592
	/**
593
	 * Set the elements whose tags and content are kept but whose content is not tokenized
594
	 *
595
	 * @param regexp
596
	 */
597
	public void setOutSideTextTagsAndKeepContent(String regexp) {
598
		this.outside_text_tags_keep_content = regexp;
599
		this.reg_outside_text_tags_keep_content = Pattern.compile(outside_text_tags_keep_content);
600
	}
601
	
602
	protected void goToEndOfElement(String name) {
603
		//println "START ignoring tag and content of $name"
604
		def openedTags = []
605
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
606
			if (event == XMLStreamConstants.START_ELEMENT) {
607
				openedTags << parser.getLocalName()
608
				//println "append "+openedTags
609
			} else if (event == XMLStreamConstants.END_ELEMENT) {
610
				if (openedTags.size() == 0 && name == parser.getLocalName()) {
611
					//println "END ignoring tag and content of $name"
612
					return;
613
				}
614
				openedTags.pop()
615
				//println "pop $openedTags"
616
			}
617
		}
618
	}
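	
	// Editor's note (illustrative): openedTags tracks nesting so that same-named descendants do not
	// end the skip too early. For <note>a<note>b</note>c</note>, the inner <note> start is pushed
	// and its end popped; only the outer </note>, reached with an empty list, makes the method return.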
619
	
620
	public final static String WHITESPACE = " ";
621
	public final static String EMPTY = "";
622
	/**
623
	 * Process word.
624
	 */
625
	protected void processWord() {
626
		String text = buffer.toString();//parser.getText().trim().replace("\t", " ");
627
		//if (DEBUG) println "-- chars: "+text+"--";
628
		text = regLN.matcher(text).replaceAll(WHITESPACE);
629
		text = regCTRL.matcher(text).replaceAll(EMPTY);						// remove ctrl characters
630
		
631
		def sentences = stringTokenizer.processText(text);
632
		for (def words : sentences) {
633
			for (def word : words) {
634
				wordcount++;
635
				writer.writeStartElement(word_element_to_create);
636
				writeWordAttributes();// id
637
				writer.writeCharacters(word);
638
				writer.writeEndElement();
639
				writer.writeCharacters("\n");
640
			}
641
			if (stringTokenizer.doSentences())  {
642
				writer.writeProcessingInstruction("txm", "</s>")
643
			}
644
		}
645
	}
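	
	// Illustrative output (editor's addition; assumes word_element_to_create is "w", filename is
	// "doc", and ids depend on the running wordcount): for a buffer holding "Il pleut." and a
	// sentence-aware tokenizer, processWord() would emit
	//   <w id="w_doc_1" n="1">Il</w>
	//   <w id="w_doc_2" n="2">pleut</w>
	//   <w id="w_doc_3" n="3">.</w>
	//   <?txm </s>?>
	// the <?txm?> processing instruction marking the sentence boundary when doSentences() is true.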
646
	
647
	/**
648
	 * Write word attributes.
649
	 *
650
	 * @return nothing meaningful (implicit Groovy return)
651
	 */
652
	protected writeWordAttributes() {
653
		writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
654
		writer.writeAttribute("n",""+wordcount);
655
		for (String attr : retokenizedWordProperties.keySet()) {
656
			if ("id" == attr) {
657
				writer.writeAttribute("old-id", retokenizedWordProperties[attr]);
658
			} else if ("n" == attr) {
659
				writer.writeAttribute("old-n", retokenizedWordProperties[attr]);
660
			} else {
661
				writer.writeAttribute(attr, retokenizedWordProperties[attr]);
662
			}
663
		}
664
	}
665
	
666
	public void setStartTag(String tag)
667
	{
668
		this.startTag = tag;
669
		this.reg_startTag = Pattern.compile(startTag);
670
	}
671
	
672
	/**
673
	 * Tokenize.
674
	 *
675
	 * @param str the str
676
	 * @return the list
677
	 */
678
	public List<String> tokenize(String str)
679
	{
680
		return str.tokenize()	// cut by whitespace
681
	}
682
	
683
	/**
684
	 * The main method.
685
	 *
686
	 * @param args the arguments
687
	 */
688
	public static void main(String[] args)
689
	{
690
		String lang = "fr"
691
		File inFile = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1.xml")
692
		File outFile = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1-tmp.xml")
693
		
694
		println "processing "+inFile
695
		
696
		ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(inFile, lang)
697
		tokenizer.setRetokenize(false)
698
		tokenizer.setNote("note")
699
		//tokenizer.setOutSideTextTags("outsideToEdit")
700
		tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
701
		//tokenizer.setDEBUG false
702
		tokenizer.process(outFile);
703
		
704
		println "Done"
705
		
706
	}
707
}
tmp/org.txm.groovy.core/src/java/org/txm/groovy/core/ChunkTokenizerXml.groovy (revision 3268)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

  
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
//
38
//
39
//
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$
43
//
44
package org.txm.groovy.core;
45

  
46
import static groovy.transform.TypeCheckingMode.SKIP
47
import groovy.transform.CompileStatic
48

  
49
import java.util.Map.Entry
50
import java.util.regex.Matcher
51
import java.util.regex.Pattern
52

  
53
import javax.xml.stream.*
54

  
55
import org.txm.importer.PersonalNamespaceContext
56
import org.txm.tokenizer.StringTokenizer
57
import org.txm.tokenizer.SimpleStringTokenizer
58
import org.txm.tokenizer.TokenizerClasses
59
import org.txm.xml.DOMIdentityHook
60
import org.txm.xml.IdentityHook
61
import org.txm.xml.XMLParser
62
import org.txm.xml.XMLProcessor
63
import org.txm.xml.XPathHookActivator
64
import org.w3c.dom.Node
65

  
66
@CompileStatic
67
public class ChunkTokenizerXml extends XMLProcessor {
68
	
69
	XPathHookActivator activator;
70
	DOMIdentityHook hook;
71
	
72
	StringTokenizer stringTokenizer;
73
	boolean retokenize = false
74
	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
75
	
76
	/** The word_tags. */
77
	String word_tags;
78
	String word_element_to_create
79
	Pattern reg_word_tags;
80
	/** The intraword_tags. */
81
	String intraword_tags
82
	/** The word_chars. */
83
	String word_chars
84
	
85
	/** The note_content: tags whose content is copied but not tokenized. */
86
	String note_content = null;
87
	String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
88
	String outside_text_tags = null // tag and content removed
89
	String startTag = null
90
	Pattern reg_note_content;
91
	Pattern reg_outside_text_tags_keep_content;
92
	Pattern reg_outside_text_tags;
93
	Pattern reg_startTag;
94
	
95
	/** The DEBUG. */
96
	public boolean DEBUG = false;
97
	
98
	/** The outfile is the result file. */
99
	File outfile;
100
	
101
	/** The infile. */
102
	File infile;
103
	
104
	String lang;
105
	
106
	/** The buffer. */
107
	StringBuffer buffer;
108
	
109
	/** The writer. */
110
	XMLStreamWriter writer;
111
	BufferedOutputStream output;
112
	
113
	/** The parser. */
114
	XMLStreamReader parser
115
	
116
	/** The localname. */
117
	String localname;
118
	
119
	/** The prefix. */
120
	String prefix;
121
	String filename;
122
	
123
	Pattern regLN;
124
	Pattern regCTRL;
125
	
126
	public ChunkTokenizerXml(File infile) {
127
		this(infile, "");
128
	}
129
	
130
	public ChunkTokenizerXml(File infile, String lang) {
131
		this(infile, new TokenizerClasses(lang));
132
	}
133
	
134
	/**
135
	 * Instantiates a new ChunkTokenizerXml.
136
	 *
137
	 * @param infile the infile
138
	 * @param tc the TokenizerClasses configuration (word tags, word element to create, etc.)
139
	 */
140
	public ChunkTokenizerXml(File infile, TokenizerClasses tc) {
141
		this.lang = tc.lang;
142
		this.stringTokenizer = new SimpleStringTokenizer(lang);
143
		
144
		word_tags = tc.word_tags;
145
		word_element_to_create = tc.word_element_to_create;
146
		reg_word_tags = Pattern.compile(word_tags);
147
		
148
		intraword_tags = tc.intraword_tags;
149
		word_chars = tc.word_chars;
150
		
151
		this.outfile = outfile;
152
		this.infile = infile;
153
		this.filename = infile.getName();
154
		int index = filename.lastIndexOf(".");
155
		if (index > 0) filename = filename.substring(0, index);
156
		
157
		regLN = Pattern.compile("\n"); // match bare newlines (was "/\n/", which required surrounding slashes)
158
		regCTRL = Pattern.compile("\\p{C}"); // match control characters (was "/\\p{C}/", which required surrounding slashes)
159
		
160
		activator = new XPathHookActivator<>(hook, "//w");
161
		
162
		hook = new DOMIdentityHook("in_text_hook", activator, this) {
163
			
164
			String id;
165
			
166
			boolean inAna = false;
167
			
168
			boolean inForm = false;
169
			
170
			boolean inW = false;
171
			
172
			ArrayList<String[]> anaValues = new ArrayList<>();
173
			
174
			ArrayList<String[]> formValues = new ArrayList<>();
175
			
176
			StringBuilder value = new StringBuilder();
177
			
178
			String resp = "";
179
			
180
			String type = "";
181
			
182
			@Override
183
			public boolean deactivate() {
184
				return true;
185
			}
186
			
187
			@Override
188
			public boolean _activate() {
189
				return true;
190
			}
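			
			// Editor's note (assumption about the DOMIdentityHook contract): returning true from both
			// deactivate() and _activate() appears to keep this hook active for every element matched
			// by the "//w" activator, without maintaining any activation state of its own.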
191
			
192
			/**
193
			 * extends this method to process the DOM before it is written
194
			 */
195
			public void processDom() {
196
				println "tokenizing: "+dom
197
				ArrayList<Node> textNodes = getTextNodes(dom);
198
				if (textNodes.size() == 0) return; // easy
199
				
200
				StringBuilder buffer = new StringBuilder(); // build a string to tokenize
201
				for (Node textNode : textNodes) {
202
					buffer.append(textNode.getTextContent());
203
				}
204
				
205
				int nNode = 0;
206
				Node currentTextNode = textNodes.get(0);
207
				String currentText = currentTextNode.getTextContent();
208
				int curentTextIndex = 0;
209
				StringBuilder currentNewText = new StringBuilder()
210
				ArrayList<String> currentWords = new ArrayList<String>()
211
				List<List<String>> sentences = stringTokenizer.processText(buffer.toString());
212
				for (List<String> sent : sentences) {
213
					if (nNode >= textNodes.size()) { // all nodes are updated
214
						break;
215
					}
216
					
217
					for (String word : sent) {
218
						if (nNode >= textNodes.size()) { // all nodes are updated
219
							break;
220
						}
221
						
222
						int idx = currentText.indexOf(word, curentTextIndex);
223
						if (idx >= 0) {
224
							curentTextIndex = idx + word.length();
225
						} else {
226
							println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords"
227
							currentTextNode.setTextContent("");
228
							for (String w : currentWords) {
229
								Node newChild = dom.getOwnerDocument().createElement("w");
230
								newChild.setAttribute("id", "W_ID")
231
								newChild.setTextContent(w);
232
								dom.insertBefore(newChild, currentTextNode)
233
							}
234
							
235
							currentNewText = new StringBuilder()
236
							currentWords.clear();
237
							curentTextIndex = 0;
238
							nNode++;
239
							if (nNode < textNodes.size()) {
240
								currentTextNode = textNodes.get(nNode);
241
							}
242
						}
243
					}
244
				}
245
				
246
				
247
			}
248
			
249
			public ArrayList<Node> getTextNodes(def element) {
250
				def children = element.getChildNodes() // recurse on the element being visited, not always on the hooked 'dom'
251
				ArrayList<Node> texts = new ArrayList<Node>()
252
				for (int i = 0 ; i < children.getLength() ; i++) {
253
					def node = children.item(i);
254
					if (node.getNodeType() == Node.TEXT_NODE) {
255
						texts.add(node)
256
					} else if (node.getNodeType() == Node.ELEMENT_NODE) {
257
						texts.addAll(getTextNodes(node));
258
					}
259
				}
260
				return texts;
261
			}
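			
			// Editor's note: unlike the filters/Tokeniser variant above, this getTextNodes() recurses
			// into every child element (existing <w> elements included), so only genuine DOM text
			// nodes are collected for retokenization.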
262
		};
263
	}
264
	
265
	/**
266
	 * Replace the default SimpleStringTokenizer with another
267
	 * @param stringTokenizer a StringTokenizer
268
	 */
269
	public void setStringTokenizer(StringTokenizer stringTokenizer) {
270
		if (stringTokenizer == null) return;
271
		this.stringTokenizer = stringTokenizer;
272
	}
273
	
274
	/**
275
	 * Fill infos.
276
	 *
277
	 * @param event the event
278
	 * @return nothing meaningful (implicit Groovy return)
279
	 */
280
	public fillInfos(int event) {
281
		if (event == XMLStreamConstants.START_ELEMENT || event == XMLStreamConstants.END_ELEMENT) {
282
			localname = parser.getLocalName();
283
			prefix = parser.getPrefix();
284
		}
285
	}
286
	
287
	/**
288
	 * Do nothing: just write what is read.
289
	 * 
290
	 * special processing for words: prefix id attributes with "w_" and remove special characters like \n \t etc.
291
	 *
292
	 * @param event the event
293
	 * @param wordid the wordid
294
	 * @return nothing meaningful (implicit Groovy return)
295
	 */
296
	public donothing(int event, Integer wordid) {
297
		if (event == XMLStreamConstants.START_ELEMENT ) {
298
			
299
			localname = parser.getLocalName();
300
			if (wordid != null) {
301
				localname = word_element_to_create;
302
			}
303
			
304
			if (prefix != null && prefix.length() > 0) {
305
				writer.writeStartElement(prefix+":"+localname);
306
			} else {
307
				//				if(namespace != null)
308
				//					writer.writeStartElement(namespace, localname);
309
				//				else
310
				writer.writeStartElement(localname);
311
			}
312
			//			if(parser.getNamespaceCount() > 0)
313
			//				writer.writeDefaultNamespace(parser.getNamespaceURI(0))
314
			//			for(int i = 1 ; i < parser.getNamespaceCount() ; i++)
315
			//				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
316
			
317
			String namespace_prefix;
318
			for (int i = 0 ; i<   parser.getNamespaceCount() ; i++) {
319
				namespace_prefix = parser.getNamespacePrefix(i);
320
				if ((namespace_prefix != null)&&   (namespace_prefix.length()>   0)) {
321
					writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
322
				} else {
323
					writer.writeDefaultNamespace(parser.getNamespaceURI(i));
324
				}
325
			}
326
			
327
			String attrprefix, attname;
328
			boolean hasId = false;
329
			//boolean hasType = false
330
			boolean hasN = false
331
			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
332
				attname = parser.getAttributeLocalName(i);
333
				attrprefix = parser.getAttributePrefix(i);
334
				if ("id".equals(attname)) hasId = true;
335
				//if ("type".equals(attname)) hasType = true;
336
				if ("n".equals(attname)) hasN = true;
337
				
338
				if (attrprefix != null && attrprefix.length() > 0) {
339
					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
340
				} else {
341
					writer.writeAttribute(attname, parser.getAttributeValue(i))
342
				}
343
			}
344
			
345
			if (wordid != null && !hasId && localname == word_element_to_create) {
346
				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
347
			}
348
			if (!hasN && localname == word_element_to_create) {
349
				writer.writeAttribute("n", ""+wordcount);
350
			}
351
			if (!reg_word_tags.matcher(localname).matches()) {
352
				writer.writeCharacters("\n");
353
			}
354
		}
355
		else if(event == XMLStreamConstants.END_ELEMENT)
356
		{
357
			writer.writeEndElement();
358
			writer.writeCharacters("\n");
359
		}
360
		else if(event == XMLStreamConstants.CHARACTERS)
361
		{
362
			//println parser.getText();
363
			//writer.writeCharacters("𦟛");
364
			
365
			
366
			// checks if the token starts with a high surrogate
367
			//			if(isHighSurrogate(parser.getText().charAt(0)))	{
368
			//				println "warning: invalid UTF-8 XML range, token " + parser.getText() + " has been replaced.";
369
			//				writer.writeCharacters("__invalidXMLChar__")
370
			//				//writer.writeCharacters("𦟛");
371
			//				println "high surrogate: " + Integer.toHexString((int)parser.getText().charAt(0));
372
			//				println "low surrogate: " + Integer.toHexString((int)parser.getText().charAt(1));
373
			//				int charSum = parser.getText().charAt(0) + parser.getText().charAt(1);
374
			//				println "char sum: " + charSum;
375
			//				println "test " + surrogatesPairToScalar(parser.getText().charAt(0), parser.getText().charAt(1));
376
			//				int scalar = surrogatesPairToScalar(parser.getText().charAt(0), parser.getText().charAt(1));
377
			//				//writer.writeCharacters(String.valueOf((char)112692));
378
			//				writer.writeCharacters("&#" + scalar + ";");
379
			//			}
380
			//			else
381
			
382
			
383
			if (insideword) { // ensure there is not \t or \n in the word form value
384
				writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
385
			} else {
386
				writer.writeCharacters(parser.getText());
387
			}
388
		}
389
	}
390
	
391
	
392
	/**
393
	 * Converts the specified surrogates pair to scalar.
394
	 * @param highSurrogate
395
	 * @param lowSurrogate
396
	 * @return
397
	 */
398
	public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)	{
399
		return ((highSurrogate - 0xD800) * 0x400) + (lowSurrogate - 0xDC00) + 0x10000;
400
	}
401
	
402
	/**
403
	 * Checks if the specified character is a high/leading surrogate.
404
	 * @param character
405
	 * @return
406
	 */
407
	public boolean isHighSurrogate(char character)	{
408
		return (character >= 0xD800 && character <= 0xDBFF);
409
	}
410
	
411
	
412
	
413
	/** The wordcount. */
414
	int wordcount = 0;
415
	
416
	/** The ignorecontent. */
417
	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
418
	boolean insideword = false;
419
	/**
420
	 * Process.
421
	 *
422
	 * @return true, if successful
423
	 */
424
	public boolean processDOMElement() {
425
		if (!infile.exists()) {
426
			println "$infile does not exist"
427
			return false;
428
		}
429
		XMLOutputFactory factory = XMLOutputFactory.newInstance();
430
		output = new BufferedOutputStream(new FileOutputStream(outfile))
431
		writer = factory.createXMLStreamWriter(output, "UTF-8")
432
		writer.setNamespaceContext(new PersonalNamespaceContext());
433
		
434
		def inputData = infile.toURI().toURL().openStream();
435
		def inputfactory = XMLInputFactory.newInstance();
436
		//inputfactory.setProperty("http://apache.org/xml/properties/input-buffer-size", new Integer(2048));
437
		//inputfactory.setExpandEntityReferences(false);
438
		XMLInputFactory.newInstance();
439
		parser = inputfactory.createXMLStreamReader(inputData);
440
		//println "PARSER: "+parser.getClass()
441
		writer.writeStartDocument("UTF-8","1.0");
442
		writer.writeCharacters("\n");
443
		
444
		int previousEvent = 0;
445
		boolean startProcess = false;
446
		if (startTag == null) // if no startTag specified we process from the start
447
			startProcess = true;
448
		ignorecontent = !startProcess;
449
		
450
		buffer = new StringBuffer();
451
		//println "process - start start tag: "+startTag+" startProcess: $startProcess"
452
		//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
453
		//		println "reg_outside_text_tags=$reg_outside_text_tags"
454
		//		println "reg_note_content=$reg_note_content"
455
		try {
456
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
457
				if (!startProcess) {
458
					if (event == XMLStreamConstants.START_ELEMENT) {
459
						if (reg_startTag.matcher(parser.getLocalName()).matches()) {
460
							startProcess = true
461
							ignorecontent = false;
462
						}
463
					}
464
					if (!startProcess) {
465
						donothing(event, null);
466
						continue;
467
					}
468
				}
469
				
470
				if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
471
					processWord(); // tokenize now!
472
					buffer.setLength(0);
473
				}
474
				fillInfos(event);//get localname and prefix
475
				if (event == XMLStreamConstants.START_ELEMENT) {
476
					//println "Open: "+localname;
477
					localname = parser.getLocalName()
478
					if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
479
						//println "Found pretagged word";
480
						
481
						if (retokenize) {
482
							retokenizedWordProperties.clear()
483
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
484
								retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
485
							}
486
						} else {
487
							wordcount++;
488
							donothing(event, wordcount);
489
							//ignorecontent = true;
490
							insideword = true;
491
						}
492
					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
493
						// ignore the tag only
494
						donothing(event, null); // write the tag
495
						//println "IGNORING NOTE CONTENT OF "+localname
496
						ignorecontent = true;
497
					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
498
						// 	ignore the content only
499
						donothing(event, null); // write the tag
500
						//println "IGNORING CONTENT OF "+localname
501
						ignorecontent = true;
502
					} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) { // ignore the tag and its content
503
						goToEndOfElement(localname); // parse until the end of the element is passed
504
					} else {
505
						donothing(event, null);
506
					}
507
				} else if(event == XMLStreamConstants.END_ELEMENT) {
508
					//println "Close: "+localname;
509
					localname = parser.getLocalName()
510
					if (reg_word_tags.matcher(localname).matches()) {
511
						if (retokenize) {
512
							retokenizedWordProperties.clear()
513
						} else {
514
							//ignorecontent = false;
515
							insideword = false;
516
							writer.writeEndElement();
517
							writer.writeCharacters("\n");
518
						}
519
					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
520
						ignorecontent = false;
521
						donothing(event, null);
522
					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
523
						ignorecontent = false;
524
						donothing(event, null);
525
					} else {
526
						donothing(event, null);
527
					}
528
				} else if (event == XMLStreamConstants.CHARACTERS) {
529
					if (ignorecontent || insideword) {
530
						//println " dont tokenize chars: "+parser.getText().trim();
531
						donothing(event, null);
532
					} else {
533
						//println " process chars: "+parser.getText().trim();
534
						buffer.append(parser.getText());
535
						if (buffer.length() >= 12800 && buffer.charAt(buffer.length()-1) == " ") {
536
							processWord();
537
							buffer = new StringBuffer();
538
						}
539
					}
540
				} else if (event == XMLStreamConstants.COMMENT) {
541
					writer.writeComment(parser.getText())
542
				} else if (event == XMLStreamConstants.DTD) {
543
					//println "DTD!";
544
				} else {
545
					if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
546
				}
547
				previousEvent = event;
548
			}
549
			
550
			parser.close()
551
			writer.close();
552
			output.close();
553
			inputData.close();
554
		} catch (Exception e) {
555
			System.err.println("Error : "+infile);
556
			e.printStackTrace();
557
			if (writer != null) writer.close();
558
			if (output != null) output.close();
559
			if (parser != null) parser.close();
560
			if (inputData != null) inputData.close();
561
			return false;
562
		}
563
		return true;
564
	}
565
	
566
	public void setRetokenize(boolean retokenize) {
567
		this.retokenize = retokenize
568
	}
569
	
570
	/**
571
	 * Set the element and content to ignore
572
	 * 
573
	 * @param regexp
574
	 */
575
	public void setOutSideTextTags(String regexp) {
576
		this.outside_text_tags = regexp;
577
		this.reg_outside_text_tags = Pattern.compile(outside_text_tags);
578
	}
579
	
580
	/**
581
	 * Set element content to NOT tokenize
582
	 *
583
	 * @param regexp
584
	 */
585
	public void setNote(String regexp) {
586
		this.note_content = regexp;
587
		this.reg_note_content = Pattern.compile(note_content);
588
	}
589
	
590
	/**
591
	 * Set the elements whose tags and content are kept but whose content is not tokenized
592
	 *
593
	 * @param regexp
594
	 */
595
	public void setOutSideTextTagsAndKeepContent(String regexp) {
596
		this.outside_text_tags_keep_content = regexp;
597
		this.reg_outside_text_tags_keep_content = Pattern.compile(outside_text_tags_keep_content);
598
	}
599
	
600
	protected void goToEndOfElement(String name) {
601
		//println "START ignoring tag and content of $name"
602
		def openedTags = []
603
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
604
			if (event == XMLStreamConstants.START_ELEMENT) {
605
				openedTags << parser.getLocalName()
606
				//println "append "+openedTags
607
			} else if (event == XMLStreamConstants.END_ELEMENT) {
608
				if (openedTags.size() == 0 && name == parser.getLocalName()) {
609
					//println "END ignoring tag and content of $name"
610
					return;
611
				}
612
				openedTags.pop()
613
				//println "pop $openedTags"
614
			}
615
		}
616
	}
617
	
618
	public final static String WHITESPACE = " ";
619
	public final static String EMPTY = "";
620
	/**
621
	 * Process word.
622
	 */
623
	protected void processWord() {
624
		String text = buffer.toString();//parser.getText().trim().replace("\t", " ");
625
		//if (DEBUG) println "-- chars: "+text+"--";
626
		text = regLN.matcher(text).replaceAll(WHITESPACE);
627
		text = regCTRL.matcher(text).replaceAll(EMPTY);						// remove ctrl characters
628
		
629
		def sentences = stringTokenizer.processText(text);
630
		for (def words : sentences) {
631
			for (def word : words) {
632
				wordcount++;
633
				writer.writeStartElement(word_element_to_create);
634
				writeWordAttributes();// id
635
				writer.writeCharacters(word);
636
				writer.writeEndElement();
637
				writer.writeCharacters("\n");
638
			}
639
			if (stringTokenizer.doSentences())  {
640
				writer.writeProcessingInstruction("txm", "</s>")
641
			}
642
		}
643
	}
644
	
645
	/**
646
	 * Write word attributes.
647
	 *
648
	 * @return nothing meaningful (implicit Groovy return)
649
	 */
650
	protected writeWordAttributes() {
651
		writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
652
		writer.writeAttribute("n",""+wordcount);
653
		for (String attr : retokenizedWordProperties.keySet()) {
654
			if ("id" == attr) {
655
				writer.writeAttribute("old-id", retokenizedWordProperties[attr]);
656
			} else if ("n" == attr) {
657
				writer.writeAttribute("old-n", retokenizedWordProperties[attr]);
658
			} else {
659
				writer.writeAttribute(attr, retokenizedWordProperties[attr]);
660
			}
661
		}
662
	}
663
	
664
	public void setStartTag(String tag)
665
	{
666
		this.startTag = tag;
667
		this.reg_startTag = Pattern.compile(startTag);
668
	}
669
	
670
	/**
671
	 * Tokenize.
672
	 *
673
	 * @param str the str
674
	 * @return the list
675
	 */
676
	public List<String> tokenize(String str)
677
	{
678
		return str.tokenize()	// cut by whitespace
679
	}
680
	
681
	/**
682
	 * The main method.
683
	 *
684
	 * @param args the arguments
685
	 */
686
	public static void main(String[] args)
687
	{
688
		String lang = "fr"
689
		File inFile = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1.xml")
690
		File outFile = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1-tmp.xml")
691
		
692
		println "processing "+inFile
693
		
694
		ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(inFile, lang)
695
		tokenizer.setRetokenize(false)
696
		tokenizer.setNote("note")
697
		//tokenizer.setOutSideTextTags("outsideToEdit")
698
		tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
699
		//tokenizer.setDEBUG false
700
		tokenizer.process(outFile);
701
		
702
		println "Done"
703
		
704
	}
705
}
