/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 887

     eclipse.preferences.version=1
     org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
     org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
     org.eclipse.jdt.core.compiler.compliance=1.7
     org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
     org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
     org.eclipse.jdt.core.compiler.source=1.7

tmp/org.txm.tokenizer.core/.settings/org.eclipse.jdt.groovy.core.prefs (revision 887)
	1	eclipse.preferences.version=1
	2	groovy.compiler.level=-1
	3	groovy.script.filters=*/.dsld,y,*/.gradle,n
0	4

     <?xml version="1.0" encoding="UTF-8"?>
     <classpath>
     	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
     	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
     	<classpathentry kind="src" path="src"/>
     	<classpathentry exported="true" kind="con" path="GROOVY_SUPPORT"/>
     	<classpathentry exported="true" kind="con" path="GROOVY_DSL_SUPPORT"/>
     	<classpathentry kind="output" path="bin"/>
     </classpath>

     Manifest-Version: 1.0
     Bundle-ManifestVersion: 2
     Bundle-Name: org.txm.tokenizer.core
     Bundle-SymbolicName: org.txm.tokenizer.core
     Bundle-Version: 1.0.0.qualifier
     Automatic-Module-Name: org.txm.tokenizer.core
     Bundle-RequiredExecutionEnvironment: JavaSE-1.7
     Export-Package: org.txm.tokenizer
     Require-Bundle: org.txm.libs.groovy-all;bundle-version="2.3.7",
      org.txm.utils;bundle-version="1.0.0"

     <?xml version="1.0" encoding="UTF-8"?>
     <projectDescription>
     	<name>org.txm.tokenizer.core</name>
     	<comment></comment>
     	<projects>
     	</projects>
     	<buildSpec>
     		<buildCommand>
     			<name>org.eclipse.jdt.core.javabuilder</name>
     			<arguments>
     			</arguments>
     		</buildCommand>
     		<buildCommand>
     			<name>org.eclipse.pde.ManifestBuilder</name>
     			<arguments>
     			</arguments>
     		</buildCommand>
     		<buildCommand>
     			<name>org.eclipse.pde.SchemaBuilder</name>
     			<arguments>
     			</arguments>
     		</buildCommand>
     	</buildSpec>
     	<natures>
     		<nature>org.eclipse.jdt.groovy.core.groovyNature</nature>
     		<nature>org.eclipse.pde.PluginNature</nature>
     		<nature>org.eclipse.jdt.core.javanature</nature>
     	</natures>
     </projectDescription>

     package org.txm.tokenizer
     /**
      * One tokenizer rule: a regular expression, the type label given to what
      * it matches, and three integers named before, hit and after.
      * NOTE(review): before/hit/after look like capture-group indexes into the
      * regex — confirm against the tokenizer that consumes these rules.
      */
     public class TTest {
     	/** The rule's regular expression (untyped; callers pass plain or slashy strings). */
     	def regex
     	/** Type label; stays "w" unless a constructor receives another one. */
     	def type = "w"
     	/** Defaults to 1 when the constructor does not receive it. */
     	def before = 1
     	/** Defaults to 2 when the constructor does not receive it. */
     	def hit = 2
     	/** Defaults to 3 when the constructor does not receive it. */
     	def after = 3

     	/** Fully specified rule. */
     	public TTest(def regex, String type, int before, int hit, int after) {
     		this(regex, before, hit, after)
     		this.type = type
     	}

     	/** Rule with explicit before/hit/after values and the default type. */
     	public TTest(def regex, int before, int hit, int after) {
     		this(regex)
     		this.before = before
     		this.hit = hit
     		this.after = after
     	}

     	/** Rule with an explicit type and the default before/hit/after values. */
     	public TTest(def regex, String type) {
     		this(regex)
     		this.type = type
     	}

     	/** Rule using every default except the regular expression. */
     	public TTest(def regex) {
     		this.regex = regex
     	}

     	/**
     	 * @return the five fields rendered as
     	 *         "[regex=..., type=..., before=..., hit=..., after=...]"
     	 */
     	@Override
     	public String toString() {
     		return "[regex=" + regex +
     				", type=" + type +
     				", before=" + before +
     				", hit=" + hit +
     				", after=" + after + "]"
     	}
     }

     // Copyright © 2010-2013 ENS de Lyon.
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     //
     //
     // $LastChangedDate: 2017-04-06 09:11:32 +0200 (jeu. 06 avril 2017) $
     // $LastChangedRevision: 3425 $
     // $LastChangedBy: mdecorde $
     //
     package org.txm.tokenizer
     import java.io.FileWriter;
     import java.util.regex.Pattern;
     import org.txm.utils.xml.DomUtils;
     import org.w3c.dom.Document;
     import org.w3c.dom.Element
     import org.w3c.dom.Node;
     import org.w3c.dom.NodeList;
     import javax.xml.stream.*;
     import java.net.URL;
     // TODO: Auto-generated Javadoc
     /**
      * The Class TokenizerClasses.
      */
     class TokenizerClasses {
     	//// ROOT PATTERNS ////
     	// Elementary regex fragments; the combined patterns further down are built from them.
     	/** When true, loadFromNode prints every parameter it reads. */
     	public static boolean debug = false;
     	/** Regex for a tag-like token: '<', an ASCII letter, at least one character other than '>', then '>'. */
     	public static String tag_all = "<[A-Za-z][^>]+>";
     	/** Alternation of enclitic forms (Weblex list; "FRO" presumably means Old French — confirm). */
     	public static String enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
     	/** Alternation of enclitic forms ("FR" presumably means modern French — confirm). */
     	public static String encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
     	/**
     	 * Clitic patterns ("TT" list; the name suffix looks like a language code — confirm).
     	 * The PClitic_* patterns match forms ending with an apostrophe; the FClitic_*
     	 * patterns match forms starting with a hyphen (an apostrophe for FClitic_en).
     	 */
     	public static String FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
     	public static String PClitic_fr = '[dcjlmnstDCJLNMST][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]';
     	public static String FClitic_fr = '-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là';
     	public static String PClitic_it = '[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]';
     	public static String FClitic_gl = '-la|-las|-lo|-los|-nos';
     	/** Alternation of element names; first part of extraword_tags. */
     	public static String div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
     	/** Alternation of element names; second part of extraword_tags. */
     	public static String q_tags = "q|quote|said|item|stage|cit|label|heraldry";
     	/** Alternation of element names; third part of extraword_tags. */
     	public static String extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
     	/** Alternation of element names; joined with seg_tags to form corr_tags. */
     	public static String corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus";
     	/** Element name(s) of word elements (presumably already-tokenized words — confirm). */
     	public static String word_tags = "w";
     	/** Alternation of element names (presumably elements allowed inside a word — confirm). */
     	public static String intraword_tags = "c|ex|caesura";
     	/** Straight and curly apostrophe characters; excluded from word_chars. */
     	public static String punct_quotes = "'‘’’"
     	/** Characters placed inside a character class by punct_strong; excluded from word_chars. */
     	public static String punct_strong1 = ".!?";
     	/** Alternation matching "..", "...", the ellipsis character, or a literal '|'. */
     	public static String punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
     	/** Opening guillemet; also excluded from word_chars. */
     	public static String punct_paren_open1 = "«";
     	/** Alternation matching two backticks, '(', '[' or '{'. */
     	public static String punct_paren_open2 = "``|\\(|\\[|\\{";
     	/** Closing guillemet; also excluded from word_chars. */
     	public static String punct_paren_close1 = "»";
     	/** Alternation matching doubled straight or curly quotes, ')', ']' or '}'. */
     	public static String punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}";
     	/** Characters placed inside a character class by punct_all; excluded from word_chars. */
     	public static String punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
     	/** Regex for an entity-like token: '&', at least one character other than ';', then ';'. */
     	public static String entity = "&[^;]+;";
     	/** Element name(s) joined with corr_tags_no_seg to form corr_tags. */
     	public static String seg_tags = "seg"
     	//// COMBINED PATTERNS ////
     	// Built from the root patterns above; recombine() rebuilds them after a root pattern changes.
     	/** corr_tags_no_seg or seg_tags. */
     	public static String corr_tags = "$corr_tags_no_seg|$seg_tags";
     	/** div_tags, q_tags or extraword1_tags. */
     	public static String extraword_tags = "$div_tags|$q_tags|$extraword1_tags";
     	/**
     	 * One or more punct_strong1 characters, or punct_strong2.
     	 * NOTE(review): recombine() rebuilds this without the '+' quantifier — confirm which form is intended.
     	 */
     	public static String punct_strong = "[$punct_strong1]+|$punct_strong2";
     	/** Either opening pattern. */
     	public static String punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
     	/** Either closing pattern. */
     	public static String punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
     	/** Any opening or closing pattern. */
     	public static String punct_paren = "$punct_paren_open|$punct_paren_close";
     	/** Strong punctuation, opening/closing punctuation, or one punct_weak character. */
     	public static String punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
     	/** One character that is neither a space nor any of the listed punctuation characters, or an entity. */
     	public static String word_chars = "[^ $punct_quotes$punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|$entity";
     	/**
     	 * Ordered list of TTest rules. loadFromNode inserts the rules it reads at the
     	 * front of this list (after emptying it when onlyThoseTests="true"), and
     	 * isValid compiles the regex of every rule.
     	 */
     	public static def tests = [
     		// (text ending with a hyphen)(enclitic form)(rest); default type and groups
     		new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
     		// a trailing hyphen, typed "pon"
     		new TTest(/\A(.*)(-)()\Z/, "pon"),
     		//new TTest(/\A(.*)(Bande dessinée|eau de vie|machine à écrire|Moyen Âge|petit pois|poule d'eau|Avoir l'air|se rendre compte|faire semblant|prendre froid|s'en aller|Comme il faut|bon marché|bon enfant|en retard|en colère|à la mode|de bonne humeur|hors de propos|de travers|en vacances|de standing|à l'abandon|sans défense|pieds nus|Tout à fait|d'ores et déjà|de temps en temps|tout de suite|de gré ou de force|ne pas|ne jamais|ne plus|sans doute|on ne peut plus|Quelque chose|ce dernier|n'importe quoi|N'importe quel|beaucoup de|plus d'un|peu de|un tas de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
     		// leading "aujourd'hui" / "Aujourd'hui", then the rest; before/hit/after = 0/1/2
     		new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),
     		// BFM ONLY
     //		new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"),
     		// TXM REFMAN ONLY
     //		new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
     //		new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
     //		new TTest(/\A(.*)([0-9][0-9]:[0-9][0-9]:[0-9][0-9])(.*)\Z/, "time", 1,2,3),
     //		new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+[a-z]+))(.*)\Z/, "version", 1,2,4),
     //		new TTest(/\A([^0-9]*)([0-9]+(\.[0-9]+)(\.[0-9]+)+)(\.[0-9]+[a-z]+)?(.*)\Z/, "version", 1,2,6),
     //		new TTest(/\A()([a-z]++:\/\/)(.*)\Z/, "protocol", 1,2,3),
     //		new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
     //		new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),
     		// optional leading punctuation/symbols, then letters followed by an apostrophe, then the rest; typed "w"
     		new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
     		// three consecutive dots, typed "pon"
     		new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
     		// any single punctuation or symbol character of the listed Unicode categories, typed "pon"
     		new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
     	];
     	//tag_all = "<[^>]+>";
     	//enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
     	//encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
     	//div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl";
     	//q_tags = "q|quote|item|stage";
     	//extraword_tags = "$div_tags|$q_tags|expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
     	//corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno";
     	//corr_tags = "$corr_tags_no_seg|seg";
     	//word_tags = "w|abbr|num";
     	//intraword_tags = "c|ex";
     	/** One or more characters from the Unicode separator (Z) or "other" (C) categories. */
     	public static String whitespaces = "[\\p{Z}\\p{C}]+";
     	/** A single straight or curly apostrophe; marked obsolete by its author. */
     	public static String regElision = "['‘’]" // obsolete
     	/** A single character from the Unicode categories Ps, Pe, Pi, Pf, Po or S. */
     	public static String regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"
     	//punct_strong1 = ".!?";
     	//punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
     	//punct_strong = "[$punct_strong1]|$punct_strong2";
     	//punct_paren_open1 = "«";
     	//punct_paren_open2 = "<<|``|\\(|\\[|\\{";
     	//punct_paren_open = "$punct_paren_open1|$punct_paren_open2";
     	//punct_paren_close1 = "»";
     	//punct_paren_close2 = ">>|''|‘‘|’’|\\)|\\]|\\}";
     	//punct_paren_close = "$punct_paren_close1|$punct_paren_close2";
     	//punct_paren = "$punct_paren_open|$punct_paren_close";
     	//punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
     	//punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
     	//word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
     	/**
     	 * Overwrites every root pattern with a fixed value, then rebuilds the
     	 * combined patterns through recombine(). The tests list is not touched.
     	 * NOTE(review): tag_all, div_tags, q_tags and punct_weak receive values that
     	 * differ from their field initializers — confirm which set is the intended default.
     	 *
     	 * @return whatever recombine() returns
     	 */
     	public static reset() {
     		// Element-name alternations.
     		word_tags = "w"
     		seg_tags = "seg"
     		intraword_tags = "c|ex|caesura"
     		q_tags = "q|quote|item|stage|cit"
     		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl"
     		extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail"
     		corr_tags_no_seg = "expan|unclear|choice|corr|sic|reg|orig|foreign|hi|title|name|supplied|subst|add|del|damage|date|idno|surplus"

     		// Tag, entity and enclitic patterns.
     		tag_all = "<[^>]+>"
     		entity = "&[^;]+;"
     		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?" // FRO
     		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les" // FR

     		// Punctuation fragments.
     		punct_quotes = "'‘’’"
     		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\-≤≥<>\\—ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈"
     		punct_strong1 = ".!?"
     		punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|"
     		punct_paren_open1 = "«"
     		punct_paren_close1 = "»"
     		punct_paren_open2 = "``|\\(|\\[|\\{"
     		punct_paren_close2 = "''|‘‘|’’|\\)|\\]|\\}"

     		// Stand-alone patterns.
     		regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]"
     		regElision = "['‘’]"
     		whitespaces = "[\\p{Z}\\p{C}]+"

     		// Must stay last: the combined patterns depend on the values above.
     		recombine()
     	}
     	/**
     	 * Rebuilds every combined pattern from the current root patterns.
     	 * NOTE(review): punct_strong is rebuilt here without the '+' quantifier that
     	 * its field initializer carries — confirm which form is intended.
     	 *
     	 * @return the value of the last assignment (word_chars)
     	 */
     	public static recombine() {
     		// Element-name groups.
     		extraword_tags = "${div_tags}|${q_tags}|${extraword1_tags}"
     		corr_tags = "${corr_tags_no_seg}|${seg_tags}"

     		// Opening and closing punctuation first, then their union.
     		punct_paren_close = "${punct_paren_close1}|${punct_paren_close2}"
     		punct_paren_open = "${punct_paren_open1}|${punct_paren_open2}"
     		punct_paren = "${punct_paren_open}|${punct_paren_close}"

     		// punct_all needs punct_strong and punct_paren to be up to date.
     		punct_strong = "[${punct_strong1}]|${punct_strong2}"
     		punct_all = "${punct_strong}|${punct_paren}|[${punct_weak}]"

     		// Kept as the final statement so the implicit return value is unchanged.
     		word_chars = "[^ ${punct_quotes}${punct_strong1}${punct_paren_open1}${punct_paren_close1}${punct_weak}]|${entity}"
     	}
     	/**
     	 * Serializes the current tokenizer parameters to DOM.
     	 *
     	 * Creates a {@code tokenizer} element with {@code onlyThoseTests="false"},
     	 * appends it to {@code parent}, and adds one {@code param} child per
     	 * parameter: the parameter name goes in the {@code key} attribute and its
     	 * current value in the text content.
     	 *
     	 * The entry for corr_tags_no_seg used to write "enclitics" a second time,
     	 * so corr_tags_no_seg was never serialized; it is now written under its
     	 * own key.
     	 *
     	 * NOTE(review): loadFromNode looks up the parameter name in a "name"
     	 * attribute whereas this method writes "key", and extraword1_tags,
     	 * punct_quotes, entity and seg_tags are not written at all — confirm
     	 * the format expected by consumers.
     	 *
     	 * @param doc the document used to create the elements
     	 * @param parent the node that receives the tokenizer element
     	 * @return the last param node appended (same value the method has always returned)
     	 */
     	public static toDom(Document doc, Node parent) {
     		Element tokenizer = doc.createElement("tokenizer")
     		tokenizer.setAttribute("onlyThoseTests", "false")
     		parent.appendChild(tokenizer)

     		// [key, current value], in the order they are written.
     		def params = [
     			["tag_all", tag_all],
     			["enclitics", enclitics],
     			["encliticsFR", encliticsFR],
     			["div_tags", div_tags],
     			["q_tags", q_tags],
     			["extraword_tags", extraword_tags],
     			["corr_tags_no_seg", corr_tags_no_seg],
     			["corr_tags", corr_tags],
     			["word_tags", word_tags],
     			["intraword_tags", intraword_tags],
     			["punct_strong1", punct_strong1],
     			["punct_strong2", punct_strong2],
     			["punct_strong", punct_strong],
     			["punct_paren_open1", punct_paren_open1],
     			["punct_paren_open2", punct_paren_open2],
     			["punct_paren_open", punct_paren_open],
     			["punct_paren_close1", punct_paren_close1],
     			["punct_paren_close2", punct_paren_close2],
     			["punct_paren_close", punct_paren_close],
     			["punct_paren", punct_paren],
     			["punct_weak", punct_weak],
     			["punct_all", punct_all],
     			["word_chars", word_chars],
     			["whitespaces", whitespaces],
     			["regElision", regElision],
     			["regPunct", regPunct]
     		]

     		def lastAppended = null
     		for (def entry : params) {
     			Element p = doc.createElement("param")
     			p.setAttribute("key", (String) entry[0])
     			p.setTextContent((String) entry[1])
     			lastAppended = tokenizer.appendChild(p)
     		}
     		return lastAppended
     	}
     	/**
     	 * Prints every pattern and every test rule to standard output, in three
     	 * sections headed "BRUT" (root patterns), "COMBINED" and "TESTS".
     	 */
     	public static void dump() {
     		// Map literals keep insertion order, so the lines come out in this order.
     		def rootPatterns = [
     			"tag_all": tag_all,
     			"enclitics": enclitics,
     			"encliticsFR": encliticsFR,
     			"div_tags": div_tags,
     			"q_tags": q_tags,
     			"extraword1_tags": extraword1_tags,
     			"corr_tags_no_seg": corr_tags_no_seg,
     			"word_tags": word_tags,
     			"intraword_tags": intraword_tags,
     			"punct_quotes": punct_quotes,
     			"punct_strong1": punct_strong1,
     			"punct_strong2": punct_strong2,
     			"punct_paren_open1": punct_paren_open1,
     			"punct_paren_open2": punct_paren_open2,
     			"punct_paren_close1": punct_paren_close1,
     			"punct_paren_close2": punct_paren_close2,
     			"punct_weak": punct_weak,
     			"entity": entity,
     			"seg_tags": seg_tags
     		]
     		def combinedPatterns = [
     			"corr_tags": corr_tags,
     			"extraword_tags": extraword_tags,
     			"punct_strong": punct_strong,
     			"punct_paren_open": punct_paren_open,
     			"punct_paren_close": punct_paren_close,
     			"punct_paren": punct_paren,
     			"punct_all": punct_all,
     			"word_chars": word_chars,
     			"whitespaces": whitespaces,
     			"regElision": regElision,
     			"regPunct": regPunct
     		]

     		println "BRUT"
     		for (def line : rootPatterns.entrySet()) {
     			println "${line.key} = ${line.value}"
     		}
     		println "COMBINED"
     		for (def line : combinedPatterns.entrySet()) {
     			println "${line.key} = ${line.value}"
     		}
     		println "TESTS"
     		for (def rule : tests) {
     			println " ${rule}"
     		}
     	}
     	/**
     	 * Checks that every pattern and the regex of every test rule compile.
     	 * The first failure is printed to standard output and ends the check.
     	 *
     	 * @return true when everything compiles, false at the first failure
     	 */
     	public static boolean isValid() {
     		// Columns: label, pattern, print the pattern in the message?, skip when null?
     		def checks = [
     			["tag_all", tag_all, true, false],
     			["enclitics", enclitics, true, false],
     			["encliticsFR", encliticsFR, true, false],
     			["div_tags", div_tags, true, false],
     			["q_tags", q_tags, true, false],
     			["extraword1_tags", extraword1_tags, true, false],
     			["corr_tags_no_seg", corr_tags_no_seg, true, false],
     			["word_tags", word_tags, true, false],
     			["intraword_tags", intraword_tags, true, false],
     			["punct_quotes", punct_quotes, true, false],
     			["punct_strong1", punct_strong1, true, false],
     			["punct_strong2", punct_strong2, true, false],
     			["punct_paren_open1", punct_paren_open1, true, false],
     			["punct_paren_open2", punct_paren_open2, true, false],
     			["punct_paren_close1", punct_paren_close1, true, false],
     			["punct_paren_close2", punct_paren_close2, true, false],
     			["punct_weak", punct_weak, true, false],
     			["entity", entity, true, false],
     			["seg_tags", seg_tags, true, false],
     			["corr_tags", corr_tags, false, false],
     			["extraword_tags", extraword_tags, false, false],
     			["punct_strong", punct_strong, false, true],
     			["punct_paren_open", punct_paren_open, false, false],
     			["punct_paren_close", punct_paren_close, false, false],
     			["punct_paren", punct_paren, false, false],
     			["punct_all", punct_all, false, false],
     			["word_chars", word_chars, false, false],
     			["regPunct", regPunct, true, true],
     			["regElision", regElision, true, true],
     			["whitespaces", whitespaces, true, true]
     		]

     		for (def check : checks) {
     			String label = check[0]
     			String pattern = check[1]
     			boolean showPattern = check[2]
     			boolean skipWhenNull = check[3]

     			if (skipWhenNull && pattern == null) {
     				continue
     			}
     			try {
     				// A null pattern that is not skipped fails here and is reported like any other error.
     				Pattern.compile(pattern)
     			} catch (Exception e) {
     				String prefix = showPattern ? "${label}=${pattern}: " : "${label}: "
     				println prefix + e
     				return false
     			}
     		}

     		for (TTest test : tests) {
     			try {
     				Pattern.compile(test.regex)
     			} catch (Exception e) {
     				println test.regex+": "+e
     				return false
     			}
     		}
     		return true
     	}
     	/**
     	 * Loads tokenizer parameters and test rules from a DOM element.
     	 *
     	 * Every descendant {@code param} element sets one root pattern. Its name is
     	 * read from the {@code name} attribute, or from {@code key} (the attribute
     	 * toDom writes) when {@code name} is empty. Its value is read from the
     	 * {@code value} attribute, or from the text content when that is empty; an
     	 * empty value stores null. The combined patterns are then rebuilt.
     	 *
     	 * Every descendant {@code test} element becomes a TTest inserted at the
     	 * front of the tests list, in document order. When the element carries
     	 * {@code onlyThoseTests="true"} the list is emptied first. A missing
     	 * {@code before}, {@code hit} or {@code after} attribute now falls back to
     	 * 1, 2 or 3 instead of raising a NumberFormatException; a malformed number
     	 * still raises it.
     	 *
     	 * When the resulting configuration does not pass isValid(), the patterns
     	 * are reset and the tests list is restored to what it was before the call,
     	 * so a rejected rule cannot make later validations fail.
     	 *
     	 * @param tokenizerElement the element holding the param and test children
     	 * @return true when the loaded configuration is valid, false otherwise
     	 */
     	public static boolean loadFromNode(Element tokenizerElement) {
     		// Copy, not alias: the list is modified in place below.
     		def previousTests = (tests == null) ? null : new ArrayList(tests)

     		//load params
     		String tmp_strong_punct = null
     		NodeList params = tokenizerElement.getElementsByTagName("param")
     		for (int i = 0 ; i < params.getLength() ; i++) {
     			Element param = params.item(i)

     			String key = param.getAttribute("name")
     			if (key == null || key.length() == 0) key = param.getAttribute("key")

     			String value = param.getAttribute("value")
     			if (value == null || value.length() == 0) value = param.getTextContent()
     			if (value != null && value.length() == 0) value = null

     			if (debug) println (" Tokenizer parametrized with $key=$value.")

     			switch (key) {
     				case "tag_all": tag_all = value; break
     				case "enclitics": enclitics = value; break
     				case "encliticsFR": encliticsFR = value; break
     				case "div_tags": div_tags = value; break
     				case "q_tags": q_tags = value; break
     				case "extraword1_tags": extraword1_tags = value; break
     				case "corr_tags_no_seg": corr_tags_no_seg = value; break
     				case "word_tags": word_tags = value; break
     				case "intraword_tags": intraword_tags = value; break
     				case "punct_quotes": punct_quotes = value; break
     				case "punct_strong1": punct_strong1 = value; break
     				case "punct_strong2": punct_strong2 = value; break
     				case "punct_paren_open1": punct_paren_open1 = value; break
     				case "punct_paren_open2": punct_paren_open2 = value; break
     				case "punct_paren_close1": punct_paren_close1 = value; break
     				case "punct_paren_close2": punct_paren_close2 = value; break
     				case "punct_weak": punct_weak = value; break
     				case "entity": entity = value; break
     				case "seg_tags": seg_tags = value; break
     				case "regPunct": regPunct = value; break
     				case "regElision": regElision = value; break
     				case "whitespaces": whitespaces = value; break
     				case "punct_strong": // this is temporary
     					tmp_strong_punct = value
     					break
     				default:
     					println "MISSING TOKENIZER KEY: "+key
     			}
     		}

     		//recombine
     		recombine()
     		// An explicit punct_strong overrides the recombined one (marked temporary by the author).
     		if (tmp_strong_punct != null) punct_strong = tmp_strong_punct

     		String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests")
     		if ("true".equals(shouldResetTests)) {
     			println "Warning: tokenizer only using import parameters tests"
     			tests = []
     		}

     		// getAttribute returns "" for an absent attribute, so fall back instead of parsing it.
     		def intAttribute = { Element element, String attribute, int fallback ->
     			String raw = element.getAttribute(attribute)
     			if (raw == null || raw.trim().length() == 0) return fallback
     			return Integer.parseInt(raw.trim())
     		}

     		NodeList testsList = tokenizerElement.getElementsByTagName("test")
     		for (int i = 0 ; i < testsList.getLength() ; i++) {
     			Element test = testsList.item(i)
     			tests.add(i, new TTest(test.getTextContent(), test.getAttribute("type"),
     					intAttribute(test, "before", 1),
     					intAttribute(test, "hit", 2),
     					intAttribute(test, "after", 3)))
     		}

     		if (isValid()) {
     			return true
     		}

     		// reset() already rebuilds the combined patterns.
     		reset()
     		tests = previousTests
     		return false
     	}
     	/**
     	 * Manual check: loads an import.xml file, finds the first tokenizer
     	 * element of the first corpus of the first corpora element, feeds it to
     	 * loadFromNode and prints the result.
     	 *
     	 * @param args optional; args[0] is the path of the import.xml file to load.
     	 *             Without it the original hard-coded path is used.
     	 */
     	public static void main(String[] args) {
     		String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/discours/import.xml"
     		Document doc = DomUtils.load(new File(path))
     		Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0)
     		Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0)
     		Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0)
     		println "OK? "+ TokenizerClasses.loadFromNode(tokenizer)
     	}
+    }

tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/package.html (revision 887)
	1	<html>
	2	<body>
	3	<p>All necessary classes to tokenize XML-TEI</p>
	4	</body>
	5	</html>
0	6

tmp/org.txm.tokenizer.core/build.properties (revision 887)
	1	source.. = src/
	2	output.. = bin/
	3	bin.includes = META-INF/,\
	4	.
0	5

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 887