/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3268

     // Copyright © 2010-2013 ENS de Lyon.
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     //
     //
     // $LastChangedDate:$
     // $LastChangedRevision:$
     // $LastChangedBy:$
     //
     package org.txm.tokenizer;
     import static groovy.transform.TypeCheckingMode.SKIP
     import groovy.transform.CompileStatic
     import java.util.Map.Entry
     import java.util.regex.Matcher
     import java.util.regex.Pattern
     import javax.xml.stream.*
     import org.txm.importer.PersonalNamespaceContext
     import org.txm.tokenizer.StringTokenizer
     import org.txm.tokenizer.SimpleStringTokenizer
     import org.txm.tokenizer.TokenizerClasses
     import org.txm.xml.DOMIdentityHook
     import org.txm.xml.IdentityHook
     import org.txm.xml.XMLParser
     import org.txm.xml.XMLProcessor
     import org.txm.xml.XPathHookActivator
     import org.w3c.dom.Node
     @CompileStatic
     public class ChunkTokenizerXml extends XMLProcessor {
     	XPathHookActivator activator;
     	DOMIdentityHook hook;
     	StringTokenizer stringTokenizer;
     	boolean retokenize = false
     	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
     	/** The word_tags. */
     	String word_tags;
     	String word_element_to_create
     	Pattern reg_word_tags;
     	/** The intraword_tags. */
     	String intraword_tags
     	/** The word_chars. */
     	String word_chars
     	/** The outside_text_tags_ignore_content. */
     	String note_content = null;
     	String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
     	String outside_text_tags = null // tag and content removed
     	String startTag = null
     	Pattern reg_note_content;
     	Pattern reg_outside_text_tags_keep_content;
     	Pattern reg_outside_text_tags;
     	Pattern reg_startTag;
     	/** The DEBUG. */
     	public boolean DEBUG = false;
     	/** The outfile is the result file. */
     	File outfile;
     	/** The infile. */
     	File infile;
     	String lang;
     	/** The buffer. */
     	StringBuffer buffer;
     	/** The writer. */
     	XMLStreamWriter writer;
     	BufferedOutputStream output;
     	/** The parser. */
     	XMLStreamReader parser
     	/** The localname. */
     	String localname;
     	/** The prefix. */
     	String prefix;
     	String filename;
     	Pattern regLN;
     	Pattern regCTRL;
     	/**
     	 * Creates a tokenizer for the given file, using an empty language code.
     	 *
     	 * @param infile the XML file to tokenize
     	 */
     	public ChunkTokenizerXml(File infile) {
     		this(infile, "")
     	}
     	/**
     	 * Creates a tokenizer for the given file, with settings taken from a new
     	 * TokenizerClasses built for the given language.
     	 *
     	 * @param infile the XML file to tokenize
     	 * @param lang the language code handed to TokenizerClasses
     	 */
     	public ChunkTokenizerXml(File infile, String lang) {
     		this(infile, new TokenizerClasses(lang))
     	}
     	/**
     	 * Instantiates a new chunk tokenizer.
     	 *
     	 * Copies the tokenizer settings from tc, derives the base file name (extension stripped)
     	 * that is used to build word ids, compiles the text-cleaning patterns and creates the DOM
     	 * hook whose processDom() works on the text found under the activated elements.
     	 *
     	 * The outfile field is NOT set by this constructor.
     	 *
     	 * @param infile the XML file to tokenize
     	 * @param tc the tokenizer settings (language, word tag pattern, name of the word element to create, ...)
     	 */
     	public ChunkTokenizerXml(File infile, TokenizerClasses tc) {
     		super(infile)
     		this.lang = tc.lang;
     		this.stringTokenizer = new SimpleStringTokenizer(lang);
     		word_tags = tc.word_tags;
     		word_element_to_create = tc.word_element_to_create;
     		reg_word_tags = Pattern.compile(word_tags);
     		intraword_tags = tc.intraword_tags;
     		word_chars = tc.word_chars;
     		this.infile = infile;
     		this.filename = infile.getName();
     		int index = filename.lastIndexOf(".");
     		if (index > 0) filename = filename.substring(0, index);
     		// The former patterns were written "/\n/" and "/\\p{C}/": the slashes were part of the
     		// regex, so they only matched text literally wrapped in '/' and the cleanup never ran.
     		// Tabs and line breaks are turned into spaces first so that removing the remaining
     		// control characters (\p{C} also covers \t, \n and \r) cannot glue two words together.
     		regLN = Pattern.compile("[\\t\\n\\r]");
     		regCTRL = Pattern.compile("\\p{C}");
     		// NOTE(review): 'hook' is still null when it is handed to the activator here, while the
     		// hook created below needs the activator; confirm how XPathHookActivator gets its hook.
     		// NOTE(review): in "//div|p|ab" only 'div' is searched at any depth, 'p' and 'ab' are
     		// relative paths; confirm whether "//div|//p|//ab" was meant.
     		activator = new XPathHookActivator<>(hook, "//div|p|ab");
     		hook = new DOMIdentityHook("in_text_hook", activator, this) {
     			String id;
     			boolean inAna = false;
     			boolean inForm = false;
     			boolean inW = false;
     			ArrayList<String[]> anaValues = new ArrayList<>();
     			ArrayList<String[]> formValues = new ArrayList<>();
     			StringBuilder value = new StringBuilder();
     			String resp = "";
     			String type = "";
     			/**
     			 * Processes the DOM before it is written: tokenizes the concatenated text of the
     			 * collected nodes and, each time a word cannot be found in the current node,
     			 * replaces that node with one "w" element per word collected so far.
     			 *
     			 * NOTE(review): the words collected for the node being scanned when the loops end
     			 * are never written; confirm whether a final flush is missing.
     			 */
     			public void processDom() {
     				ArrayList<Node> textNodes = getTextNodes(dom);
     				if (textNodes.size() == 0) return; // nothing to tokenize
     				StringBuilder joined = new StringBuilder(); // the string to tokenize, one leading space per node
     				for (Node textNode : textNodes) {
     					joined.append(" "+textNode.getTextContent());
     				}
     				int nNode = 0;
     				Node currentTextNode = textNodes.get(0);
     				String currentText = currentTextNode.getTextContent();
     				int curentTextIndex = 0;
     				ArrayList<String> currentWords = new ArrayList<String>()
     				List<List<String>> sentences = stringTokenizer.processText(joined.toString());
     				println "sentences=$sentences"
     				for (List<String> sent : sentences) {
     					if (nNode >= textNodes.size()) { // all nodes are updated
     						break;
     					}
     					for (String word : sent) {
     						if (nNode >= textNodes.size()) { // all nodes are updated
     							break;
     						}
     						int idx = currentText.indexOf(word, curentTextIndex);
     						if (idx >= 0) {
     							// the word belongs to the current node: move past it
     							curentTextIndex = idx + word.length();
     						} else {
     							// the word is not in the current node: write the collected words
     							// as "w" elements in place of the node, then move to the next node
     							println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords"
     							currentTextNode.setTextContent("");
     							for (String w : currentWords) {
     								Node newChild = dom.getOwnerDocument().createElementNS(null, "w");
     								newChild.setAttribute("id", "W_ID")
     								newChild.setTextContent(w);
     								currentTextNode.getParentNode().insertBefore(newChild, currentTextNode)
     							}
     							currentTextNode.getParentNode().removeChild(currentTextNode)
     							currentWords.clear();
     							curentTextIndex = 0;
     							nNode++;
     							if (nNode < textNodes.size()) {
     								currentTextNode = textNodes.get(nNode);
     								currentText = currentTextNode.getTextContent();
     							}
     						}
     						currentWords.add(word)
     					}
     				}
     			}
     			/**
     			 * Collects, in document order, the text nodes found under the element. A "w"
     			 * element is added as a whole and not descended into; any other element is
     			 * searched recursively.
     			 *
     			 * @param element the node whose descendants are searched
     			 * @return the text nodes and "w" elements found
     			 */
     			public ArrayList<Node> getTextNodes(Node element) {
     				def children = element.getChildNodes()
     				ArrayList<Node> texts = new ArrayList<Node>()
     				for (int i = 0 ; i < children.getLength() ; i++) {
     					def node = children.item(i);
     					if (node.getNodeType() == Node.TEXT_NODE) {
     						texts.add(node)
     					} else if (node.getNodeType() == Node.ELEMENT_NODE) {
     						// getLocalName() may return null (nodes created without namespace support),
     						// so the literal goes first to avoid a NullPointerException
     						if ("w".equals(node.getLocalName())) {
     							texts.add(node)
     						} else {
     							texts.addAll(getTextNodes(node));
     						}
     					}
     				}
     				return texts;
     			}
     		};
     	}
     	/**
     	 * Replaces the string tokenizer used to split text into sentences and words.
     	 * A null argument is ignored and the current tokenizer is kept.
     	 *
     	 * @param stringTokenizer the StringTokenizer to use from now on
     	 */
     	public void setStringTokenizer(StringTokenizer stringTokenizer) {
     		if (stringTokenizer != null) {
     			this.stringTokenizer = stringTokenizer
     		}
     	}
     	/**
     	 * Stores the local name and the prefix of the element the parser is positioned on.
     	 * Nothing is read for events other than START_ELEMENT and END_ELEMENT.
     	 *
     	 * @param event the current XML stream event
     	 * @return the prefix that was stored, or null when the event is not an element boundary
     	 */
     	public fillInfos(int event) {
     		if (event != XMLStreamConstants.START_ELEMENT && event != XMLStreamConstants.END_ELEMENT) {
     			return null
     		}
     		localname = parser.getLocalName()
     		prefix = parser.getPrefix()
     	}
     	/**
     	 * Copies the current parser event to the writer.
     	 *
     	 * START_ELEMENT: writes the element (renamed to word_element_to_create when wordid is not
     	 * null), its namespace declarations and its attributes. An element named
     	 * word_element_to_create gets an "n" attribute when it has none, and also an "id"
     	 * attribute when wordid is not null and it has none. A line break follows every start
     	 * tag whose name does not match reg_word_tags.
     	 * END_ELEMENT: writes the end tag followed by a line break.
     	 * CHARACTERS: writes the text; inside a word it is trimmed and its \n and \t become spaces.
     	 * Other events are ignored.
     	 *
     	 * @param event the current XML stream event
     	 * @param wordid not null when the element must be written as a word element
     	 */
     	public donothing(int event, Integer wordid) {
     		if (event == XMLStreamConstants.START_ELEMENT) {
     			localname = parser.getLocalName();
     			if (wordid != null) {
     				localname = word_element_to_create;
     			}
     			// Ask the parser for the prefix of THIS element: the 'prefix' field is only
     			// refreshed by fillInfos(), which is not called for the events copied before
     			// the start tag has been reached.
     			String elementPrefix = parser.getPrefix();
     			if (elementPrefix != null && elementPrefix.length() > 0) {
     				writer.writeStartElement(elementPrefix+":"+localname);
     			} else {
     				writer.writeStartElement(localname);
     			}
     			// namespace declarations: an empty or null prefix is the default namespace
     			for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
     				String namespace_prefix = parser.getNamespacePrefix(i);
     				if ((namespace_prefix != null) && (namespace_prefix.length() > 0)) {
     					writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
     				} else {
     					writer.writeDefaultNamespace(parser.getNamespaceURI(i));
     				}
     			}
     			// attributes are copied as is; remember whether "id" and "n" were present
     			boolean hasId = false;
     			boolean hasN = false
     			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
     				String attname = parser.getAttributeLocalName(i);
     				String attrprefix = parser.getAttributePrefix(i);
     				if ("id".equals(attname)) hasId = true;
     				if ("n".equals(attname)) hasN = true;
     				if (attrprefix != null && attrprefix.length() > 0) {
     					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
     				} else {
     					writer.writeAttribute(attname, parser.getAttributeValue(i))
     				}
     			}
     			if (wordid != null && !hasId && localname == word_element_to_create) {
     				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
     			}
     			if (!hasN && localname == word_element_to_create) {
     				writer.writeAttribute("n", ""+wordcount);
     			}
     			if (!reg_word_tags.matcher(localname).matches()) {
     				writer.writeCharacters("\n");
     			}
     		} else if (event == XMLStreamConstants.END_ELEMENT) {
     			writer.writeEndElement();
     			writer.writeCharacters("\n");
     		} else if (event == XMLStreamConstants.CHARACTERS) {
     			if (insideword) { // ensure there is not \t or \n in the word form value
     				writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
     			} else {
     				writer.writeCharacters(parser.getText());
     			}
     		}
     	}
     	/**
     	 * Converts a UTF-16 surrogate pair to the code point it encodes.
     	 *
     	 * @param highSurrogate the first (leading) unit of the pair
     	 * @param lowSurrogate the second (trailing) unit of the pair
     	 * @return the code point value; the arguments are not validated
     	 */
     	public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)	{
     		int codePoint = Character.toCodePoint(highSurrogate, lowSurrogate);
     		return codePoint;
     	}
     	/**
     	 * Tells whether the character is a high (leading) surrogate, i.e. lies in U+D800..U+DBFF.
     	 *
     	 * @param character the UTF-16 unit to test
     	 * @return true when the character is a high surrogate
     	 */
     	public boolean isHighSurrogate(char character)	{
     		boolean leading = Character.isHighSurrogate(character);
     		return leading;
     	}
     	/** The wordcount. */
     	int wordcount = 0;
     	/** The ignorecontent. */
     	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
     	boolean insideword = false;
     	/**
     	 * Reads infile with a StAX parser and writes the tokenized document to outfile.
     	 *
     	 * Events met before an element matching reg_startTag are copied as is (everything is
     	 * processed when startTag is null). After that, character data is accumulated in
     	 * 'buffer' and handed to processWord(), except inside pre-tagged words and inside
     	 * elements matching reg_note_content or reg_outside_text_tags_keep_content, whose text is
     	 * copied untouched. Elements matching reg_outside_text_tags are skipped with their content.
     	 *
     	 * Any exception, including a failure to open one of the files, is reported on
     	 * System.err and makes the method return false.
     	 *
     	 * @return true, if successful
     	 */
     	public boolean processDOMElement() {
     		if (!infile.exists()) {
     			println "$infile does not exists"
     			return false;
     		}
     		InputStream inputData = null
     		try {
     			// the resources are opened inside the try so that a failure is reported like any
     			// other error and what was already opened gets closed
     			XMLOutputFactory factory = XMLOutputFactory.newInstance();
     			output = new BufferedOutputStream(new FileOutputStream(outfile))
     			writer = factory.createXMLStreamWriter(output, "UTF-8")
     			writer.setNamespaceContext(new PersonalNamespaceContext());
     			inputData = infile.toURI().toURL().openStream();
     			// NOTE(review): the input factory keeps its default DTD/external entity settings;
     			// confirm that only trusted files are processed.
     			XMLInputFactory inputfactory = XMLInputFactory.newInstance();
     			parser = inputfactory.createXMLStreamReader(inputData);
     			writer.writeStartDocument("UTF-8","1.0");
     			writer.writeCharacters("\n");
     			int previousEvent = 0;
     			// if no startTag is specified we process from the start
     			boolean startProcess = (startTag == null);
     			ignorecontent = !startProcess;
     			buffer = new StringBuffer();
     			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
     				if (!startProcess) {
     					if (event == XMLStreamConstants.START_ELEMENT) {
     						if (reg_startTag.matcher(parser.getLocalName()).matches()) {
     							startProcess = true
     							ignorecontent = false;
     						}
     					}
     					if (!startProcess) {
     						// still before the start tag: plain copy
     						donothing(event, null);
     						continue;
     					}
     				}
     				if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
     					processWord(); // tokenize now!
     					buffer.setLength(0);
     				}
     				fillInfos(event);//get localname and prefix
     				if (event == XMLStreamConstants.START_ELEMENT) {
     					localname = parser.getLocalName()
     					if (reg_word_tags.matcher(localname).matches()) { // pre-tagged word
     						if (retokenize) {
     							// drop the word tag but remember its attributes for the new words
     							retokenizedWordProperties.clear()
     							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
     								retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
     							}
     						} else {
     							// keep the word as is but keep counting
     							wordcount++;
     							donothing(event, wordcount);
     							insideword = true;
     						}
     					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
     						donothing(event, null); // write the tag
     						ignorecontent = true; // its text is copied, not tokenized
     					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
     						donothing(event, null); // write the tag
     						ignorecontent = true; // its text is copied, not tokenized
     					} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) {
     						goToEndOfElement(localname); // drop the tag and its content
     					} else {
     						donothing(event, null);
     					}
     				} else if(event == XMLStreamConstants.END_ELEMENT) {
     					localname = parser.getLocalName()
     					if (reg_word_tags.matcher(localname).matches()) {
     						if (retokenize) {
     							retokenizedWordProperties.clear()
     						} else {
     							insideword = false;
     							writer.writeEndElement();
     							writer.writeCharacters("\n");
     						}
     					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) {
     						ignorecontent = false;
     						donothing(event, null);
     					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) {
     						ignorecontent = false;
     						donothing(event, null);
     					} else {
     						donothing(event, null);
     					}
     				} else if (event == XMLStreamConstants.CHARACTERS) {
     					if (ignorecontent || insideword) {
     						donothing(event, null);
     					} else {
     						buffer.append(parser.getText());
     						// flush big buffers, but only when they end with a space
     						if (buffer.length() >= 12800 && buffer.charAt(buffer.length()-1) == " ") {
     							processWord();
     							buffer = new StringBuffer();
     						}
     					}
     				} else if (event == XMLStreamConstants.COMMENT) {
     					writer.writeComment(parser.getText())
     				} else if (event == XMLStreamConstants.DTD) {
     					// the DTD declaration is not copied
     				} else {
     					if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
     				}
     				previousEvent = event;
     			}
     			parser.close()
     			writer.close();
     			output.close();
     			inputData.close();
     		} catch (Exception e) {
     			System.err.println("Error : "+infile);
     			e.printStackTrace();
     			// Best-effort cleanup: every resource is closed on its own so that one failing
     			// close() neither hides the error reported above nor leaves the others open.
     			try {
     				if (writer != null) writer.close();
     			} catch (Exception ignored) {
     				// nothing more can be done for this resource
     			}
     			try {
     				if (output != null) output.close();
     			} catch (Exception ignored) {
     				// nothing more can be done for this resource
     			}
     			try {
     				if (parser != null) parser.close();
     			} catch (Exception ignored) {
     				// nothing more can be done for this resource
     			}
     			try {
     				if (inputData != null) inputData.close();
     			} catch (Exception ignored) {
     				// nothing more can be done for this resource
     			}
     			return false;
     		}
     		return true;
     	}
     	/**
     	 * Sets the retokenize flag.
     	 *
     	 * @param retokenize the new value of the flag
     	 */
     	public void setRetokenize(boolean retokenize) {
     		this.retokenize = retokenize;
     	}
     	/**
     	 * Sets the regular expression of the element names stored in outside_text_tags
     	 * (tag and content removed).
     	 *
     	 * @param regexp the regular expression, or null to disable it (the compiled pattern is
     	 *        then reset to null instead of failing in Pattern.compile)
     	 */
     	public void setOutSideTextTags(String regexp) {
     		this.outside_text_tags = regexp;
     		this.reg_outside_text_tags = (regexp == null) ? null : Pattern.compile(regexp);
     	}
     	/**
     	 * Sets the regular expression of the element names whose content must NOT be tokenized
     	 * (stored in note_content).
     	 *
     	 * @param regexp the regular expression, or null to disable it (the compiled pattern is
     	 *        then reset to null instead of failing in Pattern.compile)
     	 */
     	public void setNote(String regexp) {
     		this.note_content = regexp;
     		this.reg_note_content = (regexp == null) ? null : Pattern.compile(regexp);
     	}
     	/**
     	 * Sets the regular expression of the element names stored in
     	 * outside_text_tags_keep_content (tag and content NOT removed but not tokenized).
     	 *
     	 * @param regexp the regular expression, or null to disable it (the compiled pattern is
     	 *        then reset to null instead of failing in Pattern.compile)
     	 */
     	public void setOutSideTextTagsAndKeepContent(String regexp) {
     		this.outside_text_tags_keep_content = regexp;
     		this.reg_outside_text_tags_keep_content = (regexp == null) ? null : Pattern.compile(regexp);
     	}
     	/**
     	 * Moves the parser to the end tag of the element it has just entered, dropping the
     	 * element and everything it contains.
     	 *
     	 * Only the nesting depth matters here, so a counter replaces the former list of opened
     	 * tag names; this also removes the pop() that was performed on a possibly empty list.
     	 *
     	 * @param name the local name of the element to skip
     	 */
     	protected void goToEndOfElement(String name) {
     		int depth = 0 // elements opened inside the skipped element and not closed yet
     		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
     			if (event == XMLStreamConstants.START_ELEMENT) {
     				depth++
     			} else if (event == XMLStreamConstants.END_ELEMENT) {
     				if (depth == 0 && name == parser.getLocalName()) {
     					return; // this is the end tag of the skipped element
     				}
     				depth--
     			}
     		}
     	}
     	public final static String WHITESPACE = " ";
     	public final static String EMPTY = "";
     	/**
     	 * Tokenizes the content of 'buffer' and writes one word element per token.
     	 *
     	 * The text is first cleaned (regLN matches become WHITESPACE, regCTRL matches are
     	 * removed), then split by the string tokenizer. Each word increments wordcount and is
     	 * written followed by a line break; after each sentence a "txm" processing instruction
     	 * is written when the tokenizer reports that it builds sentences.
     	 */
     	protected void processWord() {
     		String cleaned = regLN.matcher(buffer.toString()).replaceAll(WHITESPACE)
     		cleaned = regCTRL.matcher(cleaned).replaceAll(EMPTY) // remove ctrl characters
     		def sentences = stringTokenizer.processText(cleaned)
     		for (def sentence : sentences) {
     			for (def token : sentence) {
     				wordcount++
     				writer.writeStartElement(word_element_to_create)
     				writeWordAttributes() // id, n and the retokenized properties
     				writer.writeCharacters(token)
     				writer.writeEndElement()
     				writer.writeCharacters("\n")
     			}
     			if (stringTokenizer.doSentences()) {
     				writer.writeProcessingInstruction("txm", "</s>")
     			}
     		}
     	}
     	/**
     	 * Writes the attributes of the word element being created: a generated "id"
     	 * ("w_" + filename + "_" + wordcount), "n" (the word count), then every entry of
     	 * retokenizedWordProperties. Entries named "id" and "n" are written as "old-id" and
     	 * "old-n" so that they do not clash with the generated ones.
     	 *
     	 * @return null
     	 */
     	protected writeWordAttributes() {
     		writer.writeAttribute("id", "w_"+filename+"_"+wordcount)
     		writer.writeAttribute("n", ""+wordcount)
     		for (Map.Entry<String, String> property : retokenizedWordProperties.entrySet()) {
     			String outputName = property.getKey()
     			switch (outputName) {
     				case "id":
     					outputName = "old-id"
     					break
     				case "n":
     					outputName = "old-n"
     					break
     			}
     			writer.writeAttribute(outputName, property.getValue())
     		}
     	}
     	/**
     	 * Sets the regular expression of the element name from which tokenization starts.
     	 *
     	 * @param tag the regular expression, or null to process the document from its start
     	 *        (the compiled pattern is then reset to null instead of failing in Pattern.compile)
     	 */
     	public void setStartTag(String tag) {
     		this.startTag = tag;
     		this.reg_startTag = (tag == null) ? null : Pattern.compile(tag);
     	}
     	/**
     	 * Splits a string on whitespace.
     	 *
     	 * @param str the string to split
     	 * @return the list of whitespace-separated tokens
     	 */
     	public List<String> tokenize(String str) {
     		List<String> tokens = str.tokenize() // cut by whitespace
     		return tokens
     	}
     	/**
     	 * Command-line entry point: tokenizes one XML file.
     	 *
     	 * Without arguments the built-in test files and the "fr" language are used, as before.
     	 * Optional arguments override them: args[0] is the input file, args[1] the output
     	 * file and args[2] the language.
     	 *
     	 * @param args the arguments
     	 */
     	public static void main(String[] args) {
     		int argCount = (args == null) ? 0 : args.length
     		String lang = (argCount > 2) ? args[2] : "fr"
     		File inFile = (argCount > 0) ? new File(args[0]) : new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1.xml")
     		File outFile = (argCount > 1) ? new File(args[1]) : new File(System.getProperty("user.home"), "SVN/TXMSVN/trunk/corpora/tokenizer/test1-tmp.xml")
     		println "processing "+inFile
     		ChunkTokenizerXml tokenizer = new ChunkTokenizerXml(inFile, lang)
     		tokenizer.setRetokenize(false)
     		tokenizer.setNote("note")
     		tokenizer.setOutSideTextTagsAndKeepContent("outsideToEdit")
     		tokenizer.process(outFile);
     		println "Done"
     	}
+    }

     // Copyright © 2010-2013 ENS de Lyon.
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     //
     //
     // $LastChangedDate:$
     // $LastChangedRevision:$
     // $LastChangedBy:$
     //
     package org.txm.groovy.core;
     import static groovy.transform.TypeCheckingMode.SKIP
     import groovy.transform.CompileStatic
     import java.util.Map.Entry
     import java.util.regex.Matcher
     import java.util.regex.Pattern
     import javax.xml.stream.*
     import org.txm.importer.PersonalNamespaceContext
     import org.txm.tokenizer.StringTokenizer
     import org.txm.tokenizer.SimpleStringTokenizer
     import org.txm.tokenizer.TokenizerClasses
     import org.txm.xml.DOMIdentityHook
     import org.txm.xml.IdentityHook
     import org.txm.xml.XMLParser
     import org.txm.xml.XMLProcessor
     import org.txm.xml.XPathHookActivator
     import org.w3c.dom.Node
     @CompileStatic
     public class ChunkTokenizerXml extends XMLProcessor {
     	XPathHookActivator activator;
     	DOMIdentityHook hook;
     	StringTokenizer stringTokenizer;
     	boolean retokenize = false
     	LinkedHashMap<String, String>retokenizedWordProperties = new LinkedHashMap()
     	/** The word_tags. */
     	String word_tags;
     	String word_element_to_create
     	Pattern reg_word_tags;
     	/** The intraword_tags. */
     	String intraword_tags
     	/** The word_chars. */
     	String word_chars
     	/** The outside_text_tags_ignore_content. */
     	String note_content = null;
     	String outside_text_tags_keep_content = null // tag and content NOT removed but not tokenized
     	String outside_text_tags = null // tag and content removed
     	String startTag = null
     	Pattern reg_note_content;
     	Pattern reg_outside_text_tags_keep_content;
     	Pattern reg_outside_text_tags;
     	Pattern reg_startTag;
     	/** The DEBUG. */
     	public boolean DEBUG = false;
     	/** The outfile is the result file. */
     	File outfile;
     	/** The infile. */
     	File infile;
     	String lang;
     	/** The buffer. */
     	StringBuffer buffer;
     	/** The writer. */
     	XMLStreamWriter writer;
     	BufferedOutputStream output;
     	/** The parser. */
     	XMLStreamReader parser
     	/** The localname. */
     	String localname;
     	/** The prefix. */
     	String prefix;
     	String filename;
     	Pattern regLN;
     	Pattern regCTRL;
     	/**
     	 * Creates a tokenizer for the given file, using an empty language code.
     	 *
     	 * @param infile the XML file to tokenize
     	 */
     	public ChunkTokenizerXml(File infile) {
     		this(infile, "")
     	}
     	/**
     	 * Creates a tokenizer for the given file, with settings taken from a new
     	 * TokenizerClasses built for the given language.
     	 *
     	 * @param infile the XML file to tokenize
     	 * @param lang the language code handed to TokenizerClasses
     	 */
     	public ChunkTokenizerXml(File infile, String lang) {
     		this(infile, new TokenizerClasses(lang))
     	}
     	/**
     	 * Instantiates a new simple tokenizer xml.
+    	 *
     	 * @param infile the infile
     	 * @param outfile the outfile
     	 */
     	public ChunkTokenizerXml(File infile, TokenizerClasses tc) {
     		this.lang = tc.lang;
     		this.stringTokenizer = new SimpleStringTokenizer(lang);
     		word_tags = tc.word_tags;
     		word_element_to_create = tc.word_element_to_create;
     		reg_word_tags = Pattern.compile(word_tags);
     		intraword_tags = tc.intraword_tags;
     		word_chars = tc.word_chars;
     		this.outfile = outfile;
     		this.infile = infile;
     		this.filename = infile.getName();
     		int index = filename.lastIndexOf(".");
     		if (index > 0) filename = filename.substring(0, index);
     		regLN = Pattern.compile("/\n/");
     		regCTRL = Pattern.compile("/\\p{C}/");
     		activator = new XPathHookActivator<>(hook, "//w");
     		hook = new DOMIdentityHook("in_text_hook", activator, this) {
     			String id;
     			boolean inAna = false;
     			boolean inForm = false;
     			boolean inW = false;
     			ArrayList<String[]> anaValues = new ArrayList<>();
     			ArrayList<String[]> formValues = new ArrayList<>();
     			StringBuilder value = new StringBuilder();
     			String resp = "";
     			String type = "";
     			@Override
     			public boolean deactivate() {
     				return true;
+    			}
     			@Override
     			public boolean _activate() {
     				return true;
+    			}
     			/**
     			 * Processes the DOM before it is written.
     			 *
     			 * The text of every node returned by getTextNodes(dom) is concatenated and
     			 * handed to stringTokenizer.processText(). The resulting words are then looked
     			 * up, one after the other, in the text of the first text node. When a word
     			 * cannot be found at or after the current offset, the current text node is
     			 * emptied, one "w" element is inserted before it for each entry of
     			 * currentWords, and the cursor moves to the next text node.
     			 *
     			 * NOTE(review): nothing is ever added to currentWords (it is only cleared and
     			 * iterated), currentText is never refreshed after moving to another node, and
     			 * the word that triggered the switch is not looked up again in the new node.
     			 * This looks like unfinished code - confirm the intended algorithm.
     			 */
     			public void processDom() {
     				println "tokenizing: "+dom
     				ArrayList<Node> textNodes = getTextNodes(dom);
     				if (textNodes.size() == 0) return; // easy
     				StringBuilder buffer = new StringBuilder(); // build a string to tokenize
     				for (Node textNode : textNodes) {
     					buffer.append(textNode.getTextContent());
+    				}
     				// cursor over the text nodes: index of the node, the node itself, its text and the offset reached in that text
     				int nNode = 0;
     				Node currentTextNode = textNodes.get(0);
     				String currentText = currentTextNode.getTextContent();
     				int curentTextIndex = 0;
     				// NOTE(review): currentNewText is re-created below but never read
     				StringBuilder currentNewText = new StringBuilder()
     				ArrayList<String> currentWords = new ArrayList<String>()
     				List<List<String>> sentences = stringTokenizer.processText(buffer.toString());
     				for (List<String> sent : sentences) {
     					if (nNode >= textNodes.size()) { // all nodes are updated
     						break;
+    					}
     					for (String word : sent) {
     						if (nNode >= textNodes.size()) { // all nodes are updated
     							break;
+    						}
     						// search the word in the current text, starting after the previously matched word
     						int idx = currentText.indexOf(word, curentTextIndex);
     						if (idx >= 0) {
     							curentTextIndex = idx + word.length();
     						} else {
     							// word not found in the current text: replace the node content with "w" elements and go to the next text node
     							println "HOUSTON: word=$word nNode=$nNode currentText=$currentText index=$curentTextIndex words=$currentWords"
     							currentTextNode.setTextContent("");
     							for (String w : currentWords) {
     								Node newChild = dom.getOwnerDocument().createElement("w");
     								// every created element receives the same literal id value "W_ID"
     								newChild.setAttribute("id", "W_ID")
     								newChild.setTextContent(w);
     								// NOTE(review): insertBefore is called on dom; presumably currentTextNode must be a direct child of dom - confirm
     								dom.insertBefore(newChild, currentTextNode)
+    							}
     							currentNewText = new StringBuilder()
     							currentWords.clear();
     							curentTextIndex = 0;
     							nNode++;
     							if (nNode < textNodes.size()) {
     								currentTextNode = textNodes.get(nNode);
+    							}
+    						}
+    					}
+    				}
+    			}
     			/**
     			 * Collects, in document order, the text nodes found below the given node.
     			 *
     			 * Direct text children are added as they are met; element children are
     			 * explored recursively. Other node types are skipped.
     			 *
     			 * @param element the DOM node whose descendants are scanned; null yields an empty list
     			 * @return the text nodes found below element, never null
     			 */
     			public ArrayList<Node> getTextNodes(def element) {
     				ArrayList<Node> texts = new ArrayList<Node>()
     				if (element == null) {
     					return texts;
     				}
     				// Read the children of the node received as argument: reading the children
     				// of the enclosing dom here would make the recursive call below scan the
     				// same children again and never terminate.
     				def children = element.getChildNodes()
     				for (int i = 0 ; i < children.getLength() ; i++) {
     					def node = children.item(i);
     					if (node.getNodeType() == Node.TEXT_NODE) {
     						texts.add(node)
     					} else if (node.getNodeType() == Node.ELEMENT_NODE) {
     						texts.addAll(getTextNodes(node));
     					}
     				}
     				return texts;
     			}
     		};
+    	}
     	/**
     	 * Installs the tokenizer used to split text into sentences and words.
     	 * A null argument is ignored and the tokenizer already in place is kept.
     	 *
     	 * @param stringTokenizer the replacement tokenizer
     	 */
     	public void setStringTokenizer(StringTokenizer stringTokenizer) {
     		if (stringTokenizer != null) {
     			this.stringTokenizer = stringTokenizer;
     		}
     	}
     	/**
     	 * Refreshes the localname and prefix fields from the parser.
     	 * Only start and end element events are considered; any other event
     	 * leaves both fields untouched.
     	 *
     	 * @param event the current parser event
     	 * @return no meaningful value
     	 */
     	public fillInfos(int event) {
     		boolean elementBoundary = (event == XMLStreamConstants.START_ELEMENT) || (event == XMLStreamConstants.END_ELEMENT)
     		if (!elementBoundary) {
     			return
     		}
     		localname = parser.getLocalName();
     		prefix = parser.getPrefix();
     	}
     	/**
     	 * Copies the current parser event to the writer without tokenizing it.
     	 *
     	 * Start element: the element is written with its prefix, namespace
     	 * declarations and attributes. When wordid is not null the element is
     	 * written under the name word_element_to_create. Elements named
     	 * word_element_to_create receive an "n" attribute when they have none, and
     	 * an "id" attribute ("w_" + filename + "_" + wordcount) when wordid is not
     	 * null and they have none. A line break follows every start tag that is
     	 * not matched by reg_word_tags.
     	 *
     	 * End element: the end tag is written, followed by a line break.
     	 *
     	 * Characters: the text is written as is, except inside a word
     	 * (insideword) where it is trimmed and its line breaks and tabulations are
     	 * replaced by spaces.
     	 *
     	 * Any other event is ignored.
     	 *
     	 * @param event the current parser event
     	 * @param wordid the number of the word being written, or null when the element is not a word
     	 * @return no meaningful value
     	 */
     	public donothing(int event, Integer wordid) {
     		if (event == XMLStreamConstants.START_ELEMENT) {
     			localname = parser.getLocalName();
     			// Read the prefix from the parser instead of trusting the field: this
     			// method is also called before fillInfos() has run for the current
     			// event (everything located before the start tag), in which case the
     			// field still holds the prefix of an older element, or nothing.
     			prefix = parser.getPrefix();
     			if (wordid != null) {
     				localname = word_element_to_create;
     			}

     			if (prefix != null && prefix.length() > 0) {
     				writer.writeStartElement(prefix+":"+localname);
     			} else {
     				writer.writeStartElement(localname);
     			}

     			// namespace declarations carried by the element
     			for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
     				String namespace_prefix = parser.getNamespacePrefix(i);
     				if (namespace_prefix != null && namespace_prefix.length() > 0) {
     					writer.writeNamespace(namespace_prefix, parser.getNamespaceURI(i));
     				} else {
     					writer.writeDefaultNamespace(parser.getNamespaceURI(i));
     				}
     			}

     			// attributes, remembering if "id" and "n" are already present
     			boolean hasId = false;
     			boolean hasN = false;
     			for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
     				String attname = parser.getAttributeLocalName(i);
     				String attrprefix = parser.getAttributePrefix(i);
     				if ("id".equals(attname)) {
     					hasId = true;
     				}
     				if ("n".equals(attname)) {
     					hasN = true;
     				}
     				if (attrprefix != null && attrprefix.length() > 0) {
     					writer.writeAttribute(attrprefix+":"+attname, parser.getAttributeValue(i))
     				} else {
     					writer.writeAttribute(attname, parser.getAttributeValue(i))
     				}
     			}

     			// complete the word elements with the missing "id" and "n" attributes
     			if (wordid != null && !hasId && localname == word_element_to_create) {
     				writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
     			}
     			if (!hasN && localname == word_element_to_create) {
     				writer.writeAttribute("n", ""+wordcount);
     			}

     			// no line break after the start tag of a word: it would end up in the word form
     			if (!reg_word_tags.matcher(localname).matches()) {
     				writer.writeCharacters("\n");
     			}
     		} else if (event == XMLStreamConstants.END_ELEMENT) {
     			writer.writeEndElement();
     			writer.writeCharacters("\n");
     		} else if (event == XMLStreamConstants.CHARACTERS) {
     			if (insideword) { // ensure there is not \t or \n in the word form value
     				writer.writeCharacters(parser.getText().trim().replace("\n", " ").replace("\t", " "));
     			} else {
     				writer.writeCharacters(parser.getText());
     			}
     		}
     	}
     	/**
     	 * Combines the two halves of a UTF-16 surrogate pair into the code point they encode.
     	 *
     	 * @param highSurrogate the leading code unit of the pair
     	 * @param lowSurrogate the trailing code unit of the pair
     	 * @return the resulting code point
     	 */
     	public int surrogatesPairToScalar(char highSurrogate, char lowSurrogate)	{
     		int codePoint = Character.toCodePoint(highSurrogate, lowSurrogate);
     		return codePoint;
     	}
     	/**
     	 * Tells if a UTF-16 code unit is a high (leading) surrogate, that is a
     	 * value between 0xD800 and 0xDBFF inclusive.
     	 *
     	 * @param character the code unit to test
     	 * @return true if the code unit is a high surrogate
     	 */
     	public boolean isHighSurrogate(char character)	{
     		boolean leading = Character.isHighSurrogate(character);
     		return leading;
     	}
     	/** Number of words met so far; used to build the "id" and "n" attributes of the written words. */
     	int wordcount = 0;
     	/** When true, character data is copied to the output instead of being tokenized. */
     	boolean ignorecontent = true; // tokenization starts with element name matching $reg_startTag
     	/** True while the parser is inside a word element that is copied as is (not retokenized). */
     	boolean insideword = false;
     	/**
     	 * Reads infile with a StAX parser and writes its tokenized version to outfile (UTF-8).
     	 *
     	 * Events located before the first element matched by reg_startTag are copied
     	 * with donothing(); when startTag is null the processing starts at once.
     	 * After that point, for each start element:
     	 * - word elements (reg_word_tags) are either copied and numbered, or, when
     	 *   retokenize is set, dropped while their attributes are stored in
     	 *   retokenizedWordProperties;
     	 * - elements matched by reg_note_content or reg_outside_text_tags_keep_content
     	 *   are copied and their text is not tokenized (ignorecontent);
     	 * - elements matched by reg_outside_text_tags are skipped with their content;
     	 * - other elements are copied.
     	 * Text that must be tokenized is accumulated in buffer and given to
     	 * processWord() as soon as an event which is not character data follows.
     	 *
     	 * NOTE(review): the writer, output stream and parser are created before the
     	 * try block, so a failure during this setup is not caught here and the
     	 * resources already opened stay open.
     	 *
     	 * @return false if infile does not exist or if an exception is raised while the events are processed, true otherwise
     	 */
     	public boolean processDOMElement() {
     		if (!infile.exists()) {
     			println "$infile does not exists"
     			return false;
+    		}
     		XMLOutputFactory factory = XMLOutputFactory.newInstance();
     		output = new BufferedOutputStream(new FileOutputStream(outfile))
     		writer = factory.createXMLStreamWriter(output, "UTF-8")
     		writer.setNamespaceContext(new PersonalNamespaceContext());
     		def inputData = infile.toURI().toURL().openStream();
     		def inputfactory = XMLInputFactory.newInstance();
     		//inputfactory.setProperty("http://apache.org/xml/properties/input-buffer-size", new Integer(2048));
     		//inputfactory.setExpandEntityReferences(false);
     		// NOTE(review): the result of this call is discarded; the parser below is created from inputfactory
     		XMLInputFactory.newInstance();
     		parser = inputfactory.createXMLStreamReader(inputData);
     		//println "PARSER: "+parser.getClass()
     		writer.writeStartDocument("UTF-8","1.0");
     		writer.writeCharacters("\n");
     		int previousEvent = 0;
     		boolean startProcess = false;
     		if (startTag == null) // if no startTag specified we process from the start
     			startProcess = true;
     		ignorecontent = !startProcess;
     		buffer = new StringBuffer();
     		//println "process - start start tag: "+startTag+" startProcess: $startProcess"
     		//		println "reg_outside_text_tags_keep_content=$reg_outside_text_tags_keep_content"
     		//		println "reg_outside_text_tags=$reg_outside_text_tags"
     		//		println "reg_note_content=$reg_note_content"
     		try {
     			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
     				// until the start tag is met, events are copied without any processing
     				if (!startProcess) {
     					if (event == XMLStreamConstants.START_ELEMENT) {
     						if (reg_startTag.matcher(parser.getLocalName()).matches()) {
     							startProcess = true
     							ignorecontent = false;
+    						}
+    					}
     					if (!startProcess) {
     						donothing(event, null);
     						continue;
+    					}
+    				}
     				// a run of character events has just ended: tokenize the accumulated text before handling the new event
     				if (previousEvent == XMLStreamConstants.CHARACTERS && previousEvent != event) {
     					processWord(); // tokenize now!
     					buffer.setLength(0);
+    				}
     				fillInfos(event);//get localname and prefix
     				if (event == XMLStreamConstants.START_ELEMENT) {
     					//println "Open: "+localname;
     					localname = parser.getLocalName()
     					if (reg_word_tags.matcher(localname).matches()) { // ignore the content of the word but keep counting
     						//println "Found pretagged word";
     						if (retokenize) {
     							// the word tag is not written; its attributes are kept for the words created from its content
     							retokenizedWordProperties.clear()
     							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
     								retokenizedWordProperties[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
+    							}
     						} else {
     							wordcount++;
     							donothing(event, wordcount);
     							//ignorecontent = true;
     							insideword = true;
+    						}
     					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
     						// ignore the tag only
     						donothing(event, null); // write the tag
     						//println "IGNORING NOTE CONTENT OF "+localname
     						ignorecontent = true;
     					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
     						// 	ignore the content only
     						donothing(event, null); // write the tag
     						//println "IGNORING CONTENT OF "+localname
     						ignorecontent = true;
     					} else if (reg_outside_text_tags != null && reg_outside_text_tags.matcher(localname).matches()) { // ignore the tag and its content of the tag
     						goToEndOfElement(localname); // parse until the end of the element is passed
     					} else {
     						donothing(event, null);
+    					}
     				} else if(event == XMLStreamConstants.END_ELEMENT) {
     					//println "Close: "+localname;
     					localname = parser.getLocalName()
     					if (reg_word_tags.matcher(localname).matches()) {
     						if (retokenize) {
     							// the start tag was not written, so no end tag is written either
     							retokenizedWordProperties.clear()
     						} else {
     							//ignorecontent = false;
     							insideword = false;
     							writer.writeEndElement();
     							writer.writeCharacters("\n");
+    						}
     					} else if (reg_outside_text_tags_keep_content != null && reg_outside_text_tags_keep_content.matcher(localname).matches()) { // ignore the content of the tag
     						ignorecontent = false;
     						donothing(event, null);
     					} else if (reg_note_content != null && reg_note_content.matcher(localname).matches()) { // ignore the content of the tag ONLY
     						ignorecontent = false;
     						donothing(event, null);
     					} else {
     						donothing(event, null);
+    					}
     				} else if (event == XMLStreamConstants.CHARACTERS) {
     					if (ignorecontent || insideword) {
     						//println " dont tokenize chars: "+parser.getText().trim();
     						donothing(event, null);
     					} else {
     						//println " process chars: "+parser.getText().trim();
     						buffer.append(parser.getText());
     						// early flush of a large buffer, only when it ends with a space
     						if (buffer.length() >= 12800 && buffer.charAt(buffer.length()-1) == " ") {
     							processWord();
     							buffer = new StringBuffer();
+    						}
+    					}
     				} else if (event == XMLStreamConstants.COMMENT) {
     					writer.writeComment(parser.getText())
     				} else if (event == XMLStreamConstants.DTD) {
     					//println "DTD!";
     				} else {
     					if (DEBUG) println "Warning in $infile: ignore XML event at location "+parser.getLocation()
+    				}
     				previousEvent = event;
+    			}
     			parser.close()
     			writer.close();
     			output.close();
     			inputData.close();
     		} catch (Exception e) {
     			// report the error, release the resources and signal the failure to the caller
     			System.err.println("Error : "+infile);
     			e.printStackTrace();
     			if (writer != null) writer.close();
     			if (output != null) output.close();
     			if (parser != null) parser.close();
     			if (inputData != null) inputData.close();
     			return false;
+    		}
     		return true;
+    	}
     	/**
     	 * Sets the retokenize flag.
     	 *
     	 * @param retokenize the new value of the flag
     	 */
     	public void setRetokenize(boolean retokenize) {
     		this.retokenize = retokenize;
     	}
     	/**
     	 * Sets the regular expression matching the names of the elements to
     	 * ignore together with their content.
     	 *
     	 * The expression is compiled before any field is modified, so an invalid
     	 * expression leaves the current configuration untouched. A null
     	 * expression clears the configuration.
     	 *
     	 * @param regexp the regular expression, or null to disable this filter
     	 * @throws java.util.regex.PatternSyntaxException if regexp is not a valid regular expression
     	 */
     	public void setOutSideTextTags(String regexp) {
     		Pattern compiled = (regexp == null) ? null : Pattern.compile(regexp);
     		this.outside_text_tags = regexp;
     		this.reg_outside_text_tags = compiled;
     	}
     	/**
     	 * Sets the regular expression matching the names of the elements whose
     	 * content must NOT be tokenized.
     	 *
     	 * The expression is compiled before any field is modified, so an invalid
     	 * expression leaves the current configuration untouched. A null
     	 * expression clears the configuration.
     	 *
     	 * @param regexp the regular expression, or null to disable this filter
     	 * @throws java.util.regex.PatternSyntaxException if regexp is not a valid regular expression
     	 */
     	public void setNote(String regexp) {
     		Pattern compiled = (regexp == null) ? null : Pattern.compile(regexp);
     		this.note_content = regexp;
     		this.reg_note_content = compiled;
     	}
     	/**
     	 * Sets the regular expression matching the names of the elements that are
     	 * outside the text but whose content must be kept.
     	 *
     	 * The expression is compiled before any field is modified, so an invalid
     	 * expression leaves the current configuration untouched. A null
     	 * expression clears the configuration.
     	 *
     	 * @param regexp the regular expression, or null to disable this filter
     	 * @throws java.util.regex.PatternSyntaxException if regexp is not a valid regular expression
     	 */
     	public void setOutSideTextTagsAndKeepContent(String regexp) {
     		Pattern compiled = (regexp == null) ? null : Pattern.compile(regexp);
     		this.outside_text_tags_keep_content = regexp;
     		this.reg_outside_text_tags_keep_content = compiled;
     	}
     	/**
     	 * Moves the parser forward until the end tag of the element being skipped
     	 * has been read. Nothing is written while the events are consumed.
     	 *
     	 * The names of the nested elements opened on the way are stacked so that
     	 * an inner element bearing the same name does not stop the skip too early.
     	 *
     	 * @param name the local name of the element to skip
     	 */
     	protected void goToEndOfElement(String name) {
     		def nested = []
     		int event = parser.next()
     		while (event != XMLStreamConstants.END_DOCUMENT) {
     			switch (event) {
     				case XMLStreamConstants.START_ELEMENT:
     					nested << parser.getLocalName()
     					break
     				case XMLStreamConstants.END_ELEMENT:
     					// no nested element left open: this end tag closes the skipped element
     					if (nested.size() == 0 && name == parser.getLocalName()) {
     						return;
     					}
     					nested.pop()
     					break
     			}
     			event = parser.next()
     		}
     	}
     	/** A single space; replacement used for the text matched by regLN in processWord(). */
     	public final static String WHITESPACE = " ";
     	/** The empty string; replacement used for the text matched by regCTRL in processWord(). */
     	public final static String EMPTY = "";
     	/**
     	 * Tokenizes the text accumulated in buffer and writes the result.
     	 *
     	 * The text matched by regLN is replaced by a space and the text matched
     	 * by regCTRL is removed, then the text is given to the string tokenizer.
     	 * Each word is written in its own word_element_to_create element followed
     	 * by a line break. When the tokenizer produces sentences, a "txm"
     	 * processing instruction is written after the words of each sentence.
     	 */
     	protected void processWord() {
     		String normalized = regLN.matcher(buffer.toString()).replaceAll(WHITESPACE);
     		normalized = regCTRL.matcher(normalized).replaceAll(EMPTY);	// remove ctrl characters

     		for (def sentence : stringTokenizer.processText(normalized)) {
     			for (def token : sentence) {
     				wordcount++;
     				writer.writeStartElement(word_element_to_create);
     				writeWordAttributes();	// id, n and the properties of the retokenized word
     				writer.writeCharacters(token);
     				writer.writeEndElement();
     				writer.writeCharacters("\n");
     			}
     			if (stringTokenizer.doSentences()) {
     				writer.writeProcessingInstruction("txm", "</s>")
     			}
     		}
     	}
     	/**
     	 * Writes the attributes of the word element being created.
     	 *
     	 * The "id" and "n" attributes are built from filename and wordcount. The
     	 * properties stored in retokenizedWordProperties are then written too;
     	 * the "id" and "n" ones are renamed "old-id" and "old-n".
     	 *
     	 * @return no meaningful value
     	 */
     	protected writeWordAttributes() {
     		writer.writeAttribute("id", "w_"+filename+"_"+wordcount);
     		writer.writeAttribute("n",""+wordcount);
     		for (String property : retokenizedWordProperties.keySet()) {
     			String attributeName;
     			switch (property) {
     				case "id":
     					attributeName = "old-id";
     					break;
     				case "n":
     					attributeName = "old-n";
     					break;
     				default:
     					attributeName = property;
     			}
     			writer.writeAttribute(attributeName, retokenizedWordProperties[property]);
     		}
     	}
     	/**
     	 * Sets the regular expression matching the name of the element from which
     	 * the tokenization starts.
     	 *
     	 * The expression is compiled before any field is modified, so an invalid
     	 * expression leaves the current configuration untouched. A null tag clears
     	 * the configuration: the processing then starts at the beginning of the
     	 * document.
     	 *
     	 * @param tag the regular expression, or null to process the document from its start
     	 * @throws java.util.regex.PatternSyntaxException if tag is not a valid regular expression
     	 */
     	public void setStartTag(String tag)
     	{
     		Pattern compiled = (tag == null) ? null : Pattern.compile(tag);
     		this.startTag = tag;
     		this.reg_startTag = compiled;
     	}
     	/**
     	 * Splits a string into the pieces separated by whitespace.
     	 *
     	 * @param str the string to split
     	 * @return the pieces of str, in their order of appearance
     	 */
     	public List<String> tokenize(String str)
     	{
     		List<String> pieces = str.tokenize()	// cut by whitespace
     		return pieces
     	}
     	/**
     	 * Test entry point: tokenizes a sample file located below the home
     	 * directory of the user and writes the result next to it.
     	 *
     	 * @param args the command line arguments (not used)
     	 */
     	public static void main(String[] args)
     	{
     		final String language = "fr"
     		final String home = System.getProperty("user.home")
     		File source = new File(home, "SVN/TXMSVN/trunk/corpora/tokenizer/test1.xml")
     		File target = new File(home, "SVN/TXMSVN/trunk/corpora/tokenizer/test1-tmp.xml")

     		println "processing "+source

     		ChunkTokenizerXml chunker = new ChunkTokenizerXml(source, language)
     		chunker.setRetokenize(false)
     		chunker.setNote("note")
     		chunker.setOutSideTextTagsAndKeepContent("outsideToEdit")
     		chunker.process(target);

     		println "Done"
     	}
+    }

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3268