Revision 3254

tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/TranscriberTokenizer.groovy (revision 3254)
@@ -2,28 +2,28 @@
 // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
 // Lyon 2, University of Franche-Comté, University of Nice
 // Sophia Antipolis, University of Paris 3.
-// 
+//
 // The TXM platform is free software: you can redistribute it
 // and/or modify it under the terms of the GNU General Public
 // License as published by the Free Software Foundation,
 // either version 2 of the License, or (at your option) any
 // later version.
-// 
+//
 // The TXM platform is distributed in the hope that it will be
 // useful, but WITHOUT ANY WARRANTY; without even the implied
 // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 // PURPOSE. See the GNU General Public License for more
 // details.
-// 
+//
 // You should have received a copy of the GNU General
 // Public License along with the TXM platform. If not, see
 // http://www.gnu.org/licenses.
-// 
-// 
-// 
+//
+//
+//
 // $LastChangedDate:$
 // $LastChangedRevision:$
-// $LastChangedBy:$ 
+// $LastChangedBy:$
 //
 package org.txm.scripts.importer.transcriber
 
@@ -94,21 +94,21 @@
 				audio = "present"
 			notation = s;
 			event = "";
-
-			//TODO does not work (eg ' "word" '). This step should be done after the tokenizer step is done			
-//			if (s.startsWith("\"") && s.endsWith("\"")) {
-//				// not rapp1 or rapp2
-//			} else if (s.startsWith("\"")) {
-//				rapp = true;
-//				event += "#rapp1";
-//			} else if(s.endsWith("\"")) {
-//				rapp = false;
-//				event += "#rapp2";
-//			}
-
+			
+			//TODO does not work (eg ' "word" '). This step should be done after the tokenizer step is done
+			//			if (s.startsWith("\"") && s.endsWith("\"")) {
+			//				// not rapp1 or rapp2
+			//			} else if (s.startsWith("\"")) {
+			//				rapp = true;
+			//				event += "#rapp1";
+			//			} else if(s.endsWith("\"")) {
+			//				rapp = false;
+			//				event += "#rapp2";
+			//			}
+			
 			//test events
 			if (s.startsWith("^^") && s.length() > 2) {
-				event += "#orth";			
+				event += "#orth";
 				s = s.substring(2);
 			}
 			if (s.startsWith("*") && s.length() > 1) {
@@ -160,7 +160,7 @@
 				s = s.replace("(","");
 				s = s.replace(")","");
 			}
-						
+			
 			iterate(s);
 		}
 	}
@@ -173,14 +173,19 @@
 	 */
 	protected iterate(String s)
 	{
-		def words = stringTokenizer.processText(s);
-		for (def word : words) {
-			wordcount++;
-			writer.writeStartElement(word_element_to_create);
-			writeWordAttributes();// id
-			writer.writeCharacters(word);
-			writer.writeEndElement();
-			writer.writeCharacters("\n");
+		def sentences = stringTokenizer.processText(s);
+		for (def words : sentences) {
+			for (def word : words) {
+				wordcount++;
+				writer.writeStartElement(word_element_to_create);
+				writeWordAttributes();// id
+				writer.writeCharacters(word);
+				writer.writeEndElement();
+				writer.writeCharacters("\n");
+			}
+			if (stringTokenizer.doSentences())  {
+				writer.writeProcessingInstruction("txm", "</s>")
+			}
 		}
 	}
 	
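
Note on the last hunk above: iterate() now consumes the sentence-aware output of StringTokenizer.processText(), treated as a list of sentences that are themselves lists of words, and it emits a <?txm </s>?> processing instruction after each sentence when doSentences() is true. The Groovy sketch below only illustrates that contract with a stand-in tokenizer; it is not TXM code and its splitting rules are assumptions made for the demo.

// Stand-in tokenizer mimicking the contract used by iterate():
// processText() -> List<List<String>> (sentences of words), plus doSentences().
class FakeSentenceTokenizer {
	boolean doSentences() { true }
	List<List<String>> processText(String text) {
		// naive demo rules: sentences split on '.', words on whitespace
		text.split(/\./).findAll { it.trim() }.collect { it.trim().split(/\s+/).toList() }
	}
}

def tok = new FakeSentenceTokenizer()
tok.processText("Hello world. Second sentence here.").each { words ->
	words.each { w -> println "<w>$w</w>" }       // one word element per token
	if (tok.doSentences()) println "<?txm </s>?>" // sentence boundary marker
}
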
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 3254)
@@ -222,22 +222,34 @@
 			pagedWriter.writeDTD("<!DOCTYPE html>")
 			pagedWriter.writeCharacters("\n")
 			pagedWriter.writeStartElement("html")
-			pagedWriter.writeCharacters("\n")
+			
+			pagedWriter.writeCharacters("\n\t")
 			pagedWriter.writeEmptyElement("meta", ["http-equiv":"Content-Type", "content":"text/html","charset":"UTF-8"])
+			
+			
 			for (String css : cssList) {
+				pagedWriter.writeCharacters("\t\n")
 				pagedWriter.writeEmptyElement("link", ["rel":"stylesheet", "type":"text/css","href":"$css"])
+				
 			}
+			pagedWriter.writeCharacters("\t\n")
 			pagedWriter.writeStartElement("head")
+			pagedWriter.writeCharacters("\t\t\n")
 			pagedWriter.writeStartElement("title")
 			pagedWriter.writeCharacters(basename.toUpperCase()+" Edition - Page "+pagecount)
 			pagedWriter.writeEndElement(); // </title>
+			pagedWriter.writeCharacters("\t\t\n")
 			pagedWriter.writeStartElement("script", ["src":"js/collapsible.js"]);
 			pagedWriter.writeEndElement(); // </script>
+			pagedWriter.writeCharacters("\n")
 			pagedWriter.writeEndElement() // </head>
-			pagedWriter.writeCharacters("\n")
+			pagedWriter.writeCharacters("\t\n")
 			pagedWriter.writeStartElement("body") //<body>
+			pagedWriter.writeCharacters("\t\t\n")
 			pagedWriter.writeStartElement("div", ["class": pager.getImportModule().getProject().getName()]) //<div> of the corpus
+			pagedWriter.writeCharacters("\t\t\n")
 			pagedWriter.writeStartElement("div", ["class": "txmeditionpage"]) //<div>
+			pagedWriter.writeCharacters("\n")
 			
 //			println "NEW HTML: "+outfile
 //			println "TAGS: "+tags
@@ -446,11 +458,12 @@
 								}
 								break;
 							case "list":
-								String type = getAttributeValue(parser, null,"type")
-								if ("unordered" == type) {
+								if ("unordered" == rend || "bulleted" == rend) {
 									pagedWriter.writeStartElement("ul", ["class":rend])
+								} else if ("ordered" == rend || "numbered" == rend) {
+									pagedWriter.writeStartElement("ol", ["class":rend])
 								} else {
-									pagedWriter.writeStartElement("ol", ["class":rend])
+									pagedWriter.writeStartElement("ul", ["class":rend])
 								}
 								break
 							case "item":
@@ -479,6 +492,14 @@
 								break;
 							case "cell":
 								pagedWriter.writeStartElement("td", ["class":rend])
+								String rows = getAttributeValue(parser, null, "rows")
+								if (rows != null && rows.length() > 0) {
+									pagedWriter.writeAttribute("rowspan", rows)
+								}
+								String cols = getAttributeValue(parser, null, "cols")
+								if (cols != null && cols.length() > 0) {
+									pagedWriter.writeAttribute("colspan", cols)
+								}
 								break;
 							case "ref":
 								pagedWriter.writeStartElement("a")
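
Note on the pager hunks above: TEI <list> elements are now rendered as <ul> or <ol> from their rend value instead of the type attribute, and the generated <td> carries rowspan/colspan copied from the TEI cell's rows and cols attributes. The sketch below reproduces just that mapping with a plain StAX writer; the rend rules are taken from the hunks, everything else (names, setup) is illustrative and not the XTZ pager API.

import javax.xml.stream.XMLOutputFactory

def sw = new StringWriter()
def w = XMLOutputFactory.newInstance().createXMLStreamWriter(sw)

// rend decides the list tag: ordered/numbered -> ol, anything else -> ul
def listTag = { String rend -> (rend == "ordered" || rend == "numbered") ? "ol" : "ul" }

// copy TEI rows/cols onto the HTML cell as rowspan/colspan when present
def writeCell = { String rows, String cols ->
	w.writeStartElement("td")
	if (rows) w.writeAttribute("rowspan", rows)
	if (cols) w.writeAttribute("colspan", cols)
	w.writeEndElement()
}

w.writeStartElement(listTag("numbered")) // -> <ol>
writeCell("2", null)                     // -> <td rowspan="2"></td>
w.writeEndElement()
w.writeEndDocument()
println sw.toString()
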
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3254)
@@ -152,7 +152,7 @@
 	 * Replace the default SimpleStringTokenizer with another
 	 * @param stringTokenizer a StringTokenizer
 	 */
-	public void seStringTokenizer(StringTokenizer stringTokenizer) {
+	public void setStringTokenizer(StringTokenizer stringTokenizer) {
 		if (stringTokenizer == null) return;
 		this.stringTokenizer = stringTokenizer;
 	}
@@ -418,8 +418,8 @@
 					} else {
 						//println " process chars: "+parser.getText().trim();
 						buffer.append(parser.getText());
-						if (buffer.length() >= 128 && buffer.charAt(buffer.length()-1) == " ") {
+						if (buffer.length() >= 12800 && buffer.charAt(buffer.length()-1) == " ") {
 							processWord();
 							buffer = new StringBuffer();
 						}
@@ -511,18 +511,23 @@
 		//if (DEBUG) println "-- chars: "+text+"--";
 		text = regLN.matcher(text).replaceAll(WHITESPACE);
 		text = regCTRL.matcher(text).replaceAll(EMPTY);						// remove ctrl characters
-
-		def words = stringTokenizer.processText(text);
-		for (def word : words) {
-			wordcount++;
-			writer.writeStartElement(word_element_to_create);
-			writeWordAttributes();// id
-			writer.writeCharacters(word);
-			writer.writeEndElement();
-			writer.writeCharacters("\n");
+		
+		def sentences = stringTokenizer.processText(text);
+		for (def words : sentences) {
+			for (def word : words) {
+				wordcount++;
+				writer.writeStartElement(word_element_to_create);
+				writeWordAttributes();// id
+				writer.writeCharacters(word);
+				writer.writeEndElement();
+				writer.writeCharacters("\n");
+			}
+			if (stringTokenizer.doSentences())  {
+				writer.writeProcessingInstruction("txm", "</s>")
+			}
 		}
 	}
-
+	
 	/**
 	 * Write word attributes.
 	 *
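
Note on this file: besides the setStringTokenizer rename and the same sentence-aware loop as in the tokenizer above, the flush threshold grows from 128 to 12800 characters, and the buffer is only handed to the tokenizer when its last character is a space, so a token is never cut between two chunks. The stand-alone sketch below illustrates that buffering policy only; the names are illustrative, not the class's fields.

// Chunked-flush policy: flush only on a space boundary once the buffer is big enough.
final int FLUSH_THRESHOLD = 12800 // value introduced by this revision

StringBuffer buffer = new StringBuffer()
def flushedChunks = []

def maybeFlush = { boolean force ->
	// same boundary test as the revision: the last buffered char must be a space
	boolean endsOnSpace = buffer.length() > 0 && buffer.charAt(buffer.length() - 1) == " "
	if (force || (buffer.length() >= FLUSH_THRESHOLD && endsOnSpace)) {
		flushedChunks << buffer.toString()
		buffer.setLength(0)
	}
}

["some character data ", "more data", " and the end "].each { chunk ->
	buffer.append(chunk)
	maybeFlush(false) // flushes only on a safe word boundary
}
maybeFlush(true)      // final flush when the character run ends
println flushedChunks
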
tmp/org.txm.groovy.core/src/java/org/txm/groovy/core/ChunkTokenizerXml.groovy (revision 3254)
@@ -0,0 +1,705 @@
+// Copyright © 2010-2013 ENS de Lyon.
+// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
+// Lyon 2, University of Franche-Comté, University of Nice
+// Sophia Antipolis, University of Paris 3.
+//
+// The TXM platform is free software: you can redistribute it
+// and/or modify it under the terms of the GNU General Public
+// License as published by the Free Software Foundation,
+// either version 2 of the License, or (at your option) any
+// later version.
+//
+// The TXM platform is distributed in the hope that it will be
+// useful, but WITHOUT ANY WARRANTY; without even the implied
+// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+// PURPOSE. See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General
+// Public License along with the TXM platform. If not, see
+// http://www.gnu.org/licenses.
+
+//
+// This file is part of the TXM platform.
+//
+// The TXM platform is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// The TXM platform is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
+//
+//
+//
+// $LastChangedDate:$
+// $LastChangedRevision:$
+// $LastChangedBy:$
+//
+package org.txm.groovy.core;
+
+import static groovy.transform.TypeCheckingMode.SKIP
+import groovy.transform.CompileStatic
+
+import java.util.Map.Entry
+import java.util.regex.Matcher
+import java.util.regex.Pattern
+
+import javax.xml.stream.*
+
+import org.txm.importer.PersonalNamespaceContext
+import org.txm.tokenizer.StringTokenizer
+import org.txm.tokenizer.SimpleStringTokenizer
+import org.txm.tokenizer.TokenizerClasses
+import org.txm.xml.DOMIdentityHook
+import org.txm.xml.IdentityHook
+import org.txm.xml.XMLParser
+import org.txm.xml.XMLProcessor
+import org.txm.xml.XPathHookActivator
+import org.w3c.dom.Node
+
+@CompileStatic
+public class ChunkTokenizerXml extends XMLProcessor {
+	
+	XPathHookActivator activator;
+	DOMIdentityHook hook;
+	
+	StringTokenizer stringTokenizer;
+	boolean retokenize = false
+	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
+	
+	/** The word_tags. */
+	String word_tags;
+	String word_element_to_create
+	Pattern reg_word_tags;
+	/** The intraword_tags. */
+	String intraword_tags
+	/** The word_chars. */
+	String word_chars
+	
+	/** The outside_text_tags_ignore_content. */
+	String note_content = null;
+	String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
+	String outside_text_tags = null // tag and content removed
+	String startTag = null
+	Pattern reg_note_content;
+	Pattern reg_outside_text_tags_keep_content;
+	Pattern reg_outside_text_tags;
+	Pattern reg_startTag;
+	
+	/** The DEBUG. */
+	public boolean DEBUG = false;
+	
+	/** The outfile is the result file. */
+	File outfile;
+	
+	/** The infile. */
+	File infile;
+	
+	String lang;
+	
+	/** The buffer. */
+	StringBuffer buffer;
+	
+	/** The writer. */
+	XMLStreamWriter writer;
+	BufferedOutputStream output;
+	
+	/** The parser. */
+	XMLStreamReader parser
+	
+	/** The localname. */
+	String localname;
+	
+	/** The prefix. */
+	String prefix;
+	String filename;
+	
+	Pattern regLN;
+	Pattern regCTRL;
+	
+	public ChunkTokenizerXml(File infile) {
+		this(infile, "");
+	}
+	
+	public ChunkTokenizerXml(File infile, String lang) {
+		this(infile, new TokenizerClasses(lang));
+	}
+	
+	/**
+	 * Instantiates a new simple tokenizer xml.
+	 *
+	 * @param infile the infile
+	 * @param outfile the outfile
+	 */
+	public ChunkTokenizerXml(File infile, TokenizerClasses tc) {
+		this.lang = tc.lang;
+		this.stringTokenizer = new SimpleStringTokenizer(lang);
+		
+		word_tags = tc.word_tags;
+		word_element_to_create = tc.word_element_to_create;
+		reg_word_tags = Pattern.compile(word_tags);
+		
+		intraword_tags = tc.intraword_tags;
+		word_chars = tc.word_chars;
+		
+		this.outfile = outfile;
+		this.infile = infile;
+		this.filename = infile.getName();
+		int index = filename.lastIndexOf(".");
+		if (index > 0) filename = filename.substring(0, index);
+		
+		regLN = Pattern.compile("/\n/");
+		regCTRL = Pattern.compile("/\\p{C}/");
+		
+		activator = new XPathHookActivator<>(hook, "//w");
+		
+		hook = new DOMIdentityHook("in_text_hook", activator, this) {
+			
+			String id;
+			
+			boolean inAna = false;
+			
+			boolean inForm = false;
+			
+			boolean inW = false;
+			
+			ArrayList<String[]> anaValues = new ArrayList<>();
+			
+			ArrayList<String[]> formValues = new ArrayList<>();
+			
+			StringBuilder value = new StringBuilder();
+			
+			String resp = "";
+			
+			String type = "";
+			
+			@Override
+			public boolean deactivate() {
+				return true;
+			}
+			
+			@Override
+			public boolean _activate() {
+				return true;
+			}
+			
+			/**
+			 * extends this method to process the DOM before it is written
+			 */
+			public void processDom() {
+				println "tokenizing: "+dom
+				ArrayList<Node> textNodes = getTextNodes(dom);
+				if (textNodes.size() == 0) return; // easy
+				
+				StringBuilder buffer = new StringBuilder(); // build a string to tokenize
+				for (Node textNode : textNodes) {
+					buffer.append(textNode.getTextContent());
+				}
+				
+				int nNode = 0;
+				Node currentTextNode = textNodes.get(0);
+				String currentText = currentTextNode.getTextContent();
+				int curentTextIndex = 0;
+				StringBuilder currentNewText = new StringBuilder()
+				ArrayList<String> currentWords = new ArrayList<String>()
+				List<List<String>> sentences = stringTokenizer.processText(buffer.toString());
+				for (List<String> sent : sentences) {
+					if (nNode >= textNodes.size()) { // all nodes are updated
+						break;
+					}
+					
+					for (String word : sent) {
+						if (nNode >= textNodes.size()) { // all nodes are updated
+							break;
+						}
+						
+						int idx = currentText.indexOf(word, curentTextIndex);
+						if (idx >= 0) {
+							curentTextIndex = idx + word.length();
+						} else {
+							println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords"
+							currentTextNode.setTextContent("");
+							for (String w : currentWords) {
+								Node newChild = dom.getOwnerDocument().createElement("w");
+								newChild.setAttribute("id", "W_ID")
+								newChild.setTextContent(w);
+								dom.insertBefore(newChild, currentTextNode)
+							}
+							
+							currentNewText = new StringBuilder()
+							currentWords.clear();
+							curentTextIndex = 0;
+							nNode++;
+							if (nNode < textNodes.size()) {
+								currentTextNode = textNodes.get(nNode);
+							}
+						}
+					}
+				}
+				
+				
+			}
+			
+			public ArrayList<Node> getTextNodes(def element) {
+				def children = dom.getChildNodes()
+				ArrayList<Node> texts = new ArrayList<Node>()
+				for (int i = 0 ; i < children.getLength() ; i++) {
+					def node = children.item(i);
+					if (node.getNodeType() == Node.TEXT_NODE) {
+						texts.add(node)
+					} else if (node.getNodeType() == Node.ELEMENT_NODE) {
+						texts.addAll(getTextNodes(node));
+					}
+				}
+				return texts;
+			}
+		};
+	}
+	
+	/**
+	 * Replace the default SimpleStringTokenizer with another
+	 * @param stringTokenizer a StringTokenizer
+	 */
+	public void setStringTokenizer(StringTokenizer stringTokenizer) {
+		if (stringTokenizer == null) return;
+		this.stringTokenizer = stringTokenizer;
+	}
+	
+	/**
+	 * Fill infos.
+	 *
+	 * @param event the event
+	 * @return the java.lang. object
+	 */
+	public fillInfos(int event) {
+		if (event == XMLStreamConstants.START_ELEMENT || event == XMLStreamConstants.END_ELEMENT) {
+			localname = parser.getLocalName();
+			prefix = parser.getPrefix();
+		}
+	}
+	
+	/**
+	 * Donothing: just write what is read
+	 * 
+	 * special processing for words: prefix id attributes with "w_" and remove special characters like \n \t etc.
+	 *
+	 * @param event the event
+	 * @param wordid the wordid
+	 * @return the java.lang. object
+	 */
+	public donothing(int event, Integer wordid) {
+		if (event == XMLStreamConstants.START_ELEMENT ) {
+			
+			localname = parser.getLocalName();
+			if (wordid != null) {
+				localname = word_element_to_create;
+			}
+			
+			if (prefix != null && prefix.length() > 0) {
+				writer.writeStartElement(prefix+":"+localname);
+			} else {
+				//				if(namespace != null)
+				//					writer.writeStartElement(namespace, localname);
+				//				else
+				writer.writeStartElement(localname);
+			}
+			//			if(parser.getNamespaceCount() > 0)
+			//				writer.writeDefaultNamespace(parser.getNamespaceURI(0))
+			//			for(int i = 1 ; i < parser.getNamespaceCount() ; i++)
+			//				writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
+			
+			String namespace_prefix;
+			for (int i = 0 ; i<   parser.getNamespaceCount() ; i++) {
+				namespace_prefix = parser.getNamespacePrefix(i);
+				if ((namespace_prefix != null)&&   (namespace_prefix.length()>   0)) {
+					writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
+				} else {
+					writer.writeDefaultNamespace(parser.getNamespaceURI(i));
+				}
+			}
+			
+			String attrprefix, attname;
+			boolean hasId = false;
+			//boolean hasType = false
+			boolean hasN = false
+			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+				attname = parser.getAttributeLocalName(i);
+				attrprefix = parser.getAttributePrefix(i);
+				if ("id".equals(attname)) hasId = true;
+				//if ("type".equals(attname)) hasType = true;
+				if ("n".equals(attname)) hasN = true;
+				
+				if (attrprefix != null && attrprefix.length() > 0) {
+					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
+				} else {
+					writer.writeAttribute(attname, parser.getAttributeValue(i))
+				}
+			}
+			
+			if (wordid != null && !hasId && localname == word_element_to_create) {
+				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
+			}
+			if (!hasN && localname == word_element_to_create) {
+				writer.writeAttribute("n", ""+wordcount);
+			}
+			if (!reg_word_tags.matcher(localname).matches()) {
+				writer.writeCharacters("\n");
+			}
+		}
+		else if(event == XMLStreamConstants.END_ELEMENT)
+		{
+			writer.writeEndElement();
+			writer.writeCharacters("\n");
+		}
+		else if(event == XMLStreamConstants.CHARACTERS)
+		{
+			//println parser.getText();
+			//writer.writeCharacters("𦟛");
+			
+			
+			// checks if the token starts with an high surrogate
+			//			if(isHighSurrogate(parser.getText().charAt(0)))	{
+			//				println "warning: invalid UTF-8 XML range, token " + parser.getText() + " has been replaced.";
+			//				writer.writeCharacters("__invalidXMLChar__")
+			//				//writer.writeCharacters("𦟛");
+			//				println "high surrogate: " + Integer.toHexString((int)parser.getText().charAt(0));
+			//				println "low surrogate: " + Integer.toHexString((int)parser.getText().charAt(1));
+			//				int charSum = parser.getText().charAt(0) + parser.getText().charAt(1);
+			//				println "char sum: " + charSum;
+			//				println "test " + surrogatesPairToScalar(parser.getText().charAt(0), parser.getText().charAt(1));
+			//				int scalar = surrogatesPairToScalar(parser.getText().charAt(0), parser.getText().charAt(1));
+			//				//writer.writeCharacters(String.valueOf((char)112692));
+			//				writer.writeCharacters("&#" + scalar + ";");
+			//			}
+			//			else
+			
+			
+			if (insideword) { // ensure there is not \t or \n in the word form value
+				writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
+			} else {
+				writer.writeCharacters(parser.getText());
+			}
+		}
+	}
+	
+	
+	/**
+	 * Converts the specified surrogates pair to scalar.
+	 * @param highSurrogate
+	 * @param lowSurrogate
+	 * @return
+	 */
+	public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)	{
+		return ((highSurrogate - 0xD800) * 0x400) + (lowSurrogate - 0xDC00) + 0x10000;
+	}
+	
+	/**
+	 * Checks if the specified character is an high/leading surrogate.
+	 * @param character
+	 * @return
+	 */
+	public boolean isHighSurrogate(char character)	{
+		return (character >= 0xD800 && character <= 0xDBFF);
+	}
+	
+	
+	
+	/** The wordcount. */
+	int wordcount = 0;
+	
+	/** The ignorecontent. */
+	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
+	boolean insideword = false;
+	/**
+	 * Process.
+	 *
+	 * @return true, if successful
+	 */
+	public boolean processDOMElement() {
+		if (!infile.exists()) {
+			println "$infile does not exists"
+			return false;
+		}
+		XMLOutputFactory factory = XMLOutputFactory.newInstance();
+		output = new BufferedOutputStream(new FileOutputStream(outfile))
+		writer = factory.createXMLStreamWriter(output, "UTF-8")
+		writer.setNamespaceContext(new PersonalNamespaceContext());
+		
+		def inputData = infile.toURI().toURL().openStream();
+		def inputfactory = XMLInputFactory.newInstance();
+		//inputfactory.setProperty("http://apache.org/xml/properties/input-buffer-size", new Integer(2048));
+		//inputfactory.setExpandEntityReferences(false);
+		XMLInputFactory.newInstance();
+		parser = inputfactory.createXMLStreamReader(inputData);
+		//println "PARSER: "+parser.getClass()
+		writer.writeStartDocument("UTF-8","1.0");
+		writer.writeCharacters("\n");
+		
+		int previousEvent = 0;
+		boolean startProcess = false;
+		if (startTag == null) // if no startTag specified we process from the start
+			startProcess = true;
+		ignorecontent = !startProcess;
+		
+		buffer = new StringBuffer();
+		//println "process - start start tag: "+startTag+" startProcess: $startProcess"
+		//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
+		//		println "reg_outside_text_tags=$reg_outside_text_tags"
+		//		println "reg_note_content=$reg_note_content"
+		try {
+			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
+				if (!startProcess) {
+					if (event == XMLStreamConstants.START_ELEMENT) {
+						if (reg_startTag.matcher(parser.getLocalName()).matches()) {
+							startProcess = true
+							ignorecontent = false;
+						}
+					}
+					if (!startProcess) {
+						donothing(event, null);
+						continue;
+					}
+				}
+				
+				if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
+					processWord(); // tokenize now!
+					buffer.setLength(0);
+				}
+				fillInfos(event);//get localname and prefix
+				if (event == XMLStreamConstants.START_ELEMENT) {
+					//println "Open: "+localname;
+					localname = parser.getLocalName()
+					if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
+						//println "Found pretagged word";
+						
+						if (retokenize) {
+							retokenizedWordProperties.clear()
+							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
+								retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
+							}
+						} else {
+							wordcount++;
+							donothing(event, wordcount);
+							//ignorecontent = true;
+							insideword = true;
+						}
+					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
+						// ignore the tag only
+						donothing(event, null); // write the tag
+						//println "IGNORING NOTE CONTENT OF "+localname
+						ignorecontent = true;
+					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
+						// 	ignore the content only
+						donothing(event, null); // write the tag
+						//println "IGNORING CONTENT OF "+localname
+						ignorecontent = true;
+					} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) { // ignore the tag and its content of the tag
+						goToEndOfElement(localname); // parse until the end of the element is passed
+					} else {
+						donothing(event, null);
+					}
+				} else if(event == XMLStreamConstants.END_ELEMENT) {
+					//println "Close: "+localname;
+					localname = parser.getLocalName()
+					if (reg_word_tags.matcher(localname).matches()) {
+						if (retokenize) {
+							retokenizedWordProperties.clear()
+						} else {
+							//ignorecontent = false;
+							insideword = false;
+							writer.writeEndElement();
+							writer.writeCharacters("\n");
+						}
+					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
+						ignorecontent = false;
+						donothing(event, null);
+					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
+						ignorecontent = false;
+						donothing(event, null);
+					} else {
+						donothing(event, null);
+					}
+				} else if (event == XMLStreamConstants.CHARACTERS) {
+					if (ignorecontent || insideword) {
+						//println " dont tokenize chars: "+parser.getText().trim();
+						donothing(event, null);
+					} else {
+						//println " process chars: "+parser.getText().trim();
+						buffer.append(parser.getText());
+						if (buffer.length() >= 12800 && buffer.charAt(buffer.length()-1) == " ") {
+							processWord();
+							buffer = new StringBuffer();
+						}
+					}
+				} else if (event == XMLStreamConstants.COMMENT) {
+					writer.writeComment(parser.getText())
+				} else if (event == XMLStreamConstants.DTD) {
+					//println "DTD!";
+				} else {
+					if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
+				}
+				previousEvent = event;
+			}
+			
+			parser.close()
+			writer.close();
+			output.close();
+			inputData.close();
+		} catch (Exception e) {
+			System.err.println("Error : "+infile);
+			e.printStackTrace();
+			if (writer != null) writer.close();
+			if (output != null) output.close();
+			if (parser != null) parser.close();
+			if (inputData != null) inputData.close();
+			return false;
+		}
+		return true;
+	}
+	
+	public void setRetokenize(boolean retokenize) {
+		this.retokenize = retokenize
+	}
+	
+	/**
+	 * Set the element and content to ignore
+	 * 
+	 * @param regexp
+	 */
+	public void setOutSideTextTags(String regexp) {
+		this.outside_text_tags = regexp;
+		this.reg_outside_text_tags = Pattern.compile(outside_text_tags);
+	}
+	
+	/**
+	 * Set element content to NOT tokenize
+	 *
+	 * @param regexp
+	 */
+	public void setNote(String regexp) {
+		this.note_content = regexp;
+		this.reg_note_content = Pattern.compile(note_content);
+	}
+	
+	/**
+	 * Set the element to ignore but not their content
+	 *
+	 * @param regexp
+	 */
+	public void setOutSideTextTagsAndKeepContent(String regexp) {
+		this.outside_text_tags_keep_content = regexp;
+		this.reg_outside_text_tags_keep_content = Pattern.compile(outside_text_tags_keep_content);
+	}
+	
+	protected void goToEndOfElement(String name) {
+		//println "START ignoring tag and content of $name"
+		def openedTags = []
+		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
+			if (event == XMLStreamConstants.START_ELEMENT) {
+				openedTags << parser.getLocalName()
+				//println "append "+openedTags
+			} else if (event == XMLStreamConstants.END_ELEMENT) {
+				if (openedTags.size() == 0 && name == parser.getLocalName()) {
+					//println "END ignoring tag and content of $name"
+					return;
+				}
+				openedTags.pop()
+				//println "pop $openedTags"
+			}
+		}
+	}
+	
+	public final static String WHITESPACE = " ";
+	public final static String EMPTY = "";
+	/**
+	 * Process word.
+	 */
+	protected void processWord() {
+		String text = buffer.toString();//parser.getText().trim().replace("\t", " ");
+		//if (DEBUG) println "-- chars: "+text+"--";
+		text = regLN.matcher(text).replaceAll(WHITESPACE);
+		text = regCTRL.matcher(text).replaceAll(EMPTY);						// remove ctrl characters
+		
+		def sentences = stringTokenizer.processText(text);
+		for (def words : sentences) {
+			for (def word : words) {
+				wordcount++;
+				writer.writeStartElement(word_element_to_create);
+				writeWordAttributes();// id
+				writer.writeCharacters(word);
+				writer.writeEndElement();
+				writer.writeCharacters("\n");
+			}
+			if (stringTokenizer.doSentences())  {
+				writer.writeProcessingInstruction("txm", "</s>")
+			}
+		}
+	}
+	
+	/**
+	 * Write word attributes.
+	 *
+	 * @return the java.lang. object
+	 */
+	protected writeWordAttributes() {
+		writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
+		writer.writeAttribute("n",""+wordcount);
+		for (String attr : retokenizedWordProperties.keySet()) {
+			if ("id" == attr) {
+				writer.writeAttribute("old-id", retokenizedWordProperties[attr]);
+			} else if ("n" == attr) {
+				writer.writeAttribute("old-n", retokenizedWordProperties[attr]);
+			} else {
+				writer.writeAttribute(attr, retokenizedWordProperties[attr]);
+			}
+		}
+	}
+	
+	public void setStartTag(String tag)
+	{
+		this.startTag = tag;
+		this.reg_startTag = Pattern.compile(startTag);
+	}
+	
+	/**
+	 * Tokenize.
+	 *
+	 * @param str the str
+	 * @return the list
+	 */
+	public List<String> tokenize(String str)
+	{
+		return str.tokenize()	// cut by whitespace
+	}
+	
+	/**
+	 * The main method.
+	 *
+	 * @param args the arguments
+	 */
+	public static void main(String[] args)
+	{
+		String lang = "fr"
+		File inFile = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1.xml")
+		File outFile = new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1-tmp.xml")
+		
+		println "processing "+inFile
+		
+		ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(inFile, lang)
+		tokenizer.setRetokenize(false)
+		tokenizer.setNote("note")
+		//tokenizer.setOutSideTextTags("outsideToEdit")
+		tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
+		//tokenizer.setDEBUG false
+		tokenizer.process(outFile);
+		
+		println "Done"
+		
+	}
+}
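
Note on the new ChunkTokenizerXml class above: the most involved part is processDom(), which walks the tokenizer's word stream over the DOM text content by scanning forward with indexOf and a moving offset, and falls back to emitting <w> elements when a word can no longer be located in the current text node. The sketch below strips that alignment idea down to plain strings; it is only an illustration, not the class's logic.

// Align tokenized words back onto the original character data by scanning
// forward with indexOf, the same idea processDom() applies to DOM text nodes.
String original = "Le  chat   dort."
List<String> words = ["Le", "chat", "dort", "."]

int cursor = 0
def spans = []
for (String word : words) {
	int idx = original.indexOf(word, cursor)
	if (idx < 0) {
		println "could not align '$word' after offset $cursor" // processDom() logs a warning here
		continue
	}
	spans << [word: word, start: idx, end: idx + word.length()]
	cursor = idx + word.length()
}
spans.each { println it }
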
