/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3796

     	 * @throws Exception
     	 */
     	public StringTokenizer getStringTokenizer(String lang) throws Exception {
     		// Builds a SimpleStringTokenizer from the given language code.
     		return new SimpleStringTokenizer(lang);
     		// NOTE(review): this second return directly follows the one above, so it can never
     		// execute as written. The two lines look like the before/after variants of a single
     		// diffed line - confirm which one is the current implementation.
     		return null;
+    	}
     	/**

     		ArrayList<String> sresult = new ArrayList<String>()
     		if (regSplitWhiteSpaces != null) {
     			for (String s : regSplitWhiteSpaces.split(text)) {		// separate with unicode white spaces
     				// if (DEBUG){println "process $s"}
     				if (DEBUG){println "process $s"}
     				sresult.addAll(iterate(s));
+    			}
+    		}
-...
     	protected ArrayList<String> iterate(String s) {
     		ArrayList<String> result = new ArrayList<String>();
     		while (s != null && s.length() > 0) {
     			//	if (DEBUG){println "  > $s"}
     			if (DEBUG){println "  > $s"}
     			s = standardChecks(result, s);
+    		}
     		return result;
-...
+    		}
     		if (fclitics != null && (m = s =~ regFClitics) ) {
     			//	if (DEBUG) println "CLITIC found: $s ->"+ m
     			if (DEBUG) println "CLITIC found: $s ->"+ m
     			result.addAll(iterate(m.group(1)))
     			result.add(m.group(2));
-...
     			return "";
     		} else if (regElision != null && (m = s =~ regElision) ) {
     			//	if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
     			if (DEBUG) println "Elision found: $s ->"+ m.group(1)+" + "+m.group(2)+" + "+m.group(3)
     			//iterate(m.group(1))
     			//			int sep = s.indexOf("'");
-...
     			return "";
     		} else if (reg3pts != null && (m = s =~ reg3pts) )	{
     			//	if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
     			if(DEBUG){println "REG '...' found: $s -> "+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
     			result.addAll(iterate(m.group(1)))
     			result.add("...");
     			return m.group(3);
     		} else if (regPunct != null && (m = s =~ regPunct) ) {
     			if(DEBUG){println "PUNCT found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
     			if(DEBUG){println "PUNCT '$regPunct' found: $s ->"+m.group(1)+" + "+m.group(2)+" + "+m.group(3)}
     			result.addAll(iterate(m.group(1)))
     			result.add(m.group(2));

     import org.w3c.dom.NodeList;
     import org.xml.sax.SAXException;
     import cern.colt.Arrays;
     /**
      * Holder for the regular expressions that define the tokenizer rules.
      */
     public class TokenizerClasses {
     	public String lang;
     	public TokenizerClasses(String lang) {
     		if (lang != null) {
     			this.lang = lang;
-...
+    		}
     		reset();
+    	}
     	/**
     	 * Default constructor: delegates to {@code TokenizerClasses(String)} using a value
     	 * taken from the JVM default locale.
     	 *
     	 * NOTE(review): the value passed as the language is {@code Locale.getCountry()}, which
     	 * is a country/region code (e.g. "FR", "US") rather than a language code (e.g. "fr",
     	 * "en"). {@code Locale.getLanguage()} may be what is intended - confirm against how
     	 * {@code lang} is consumed before changing it.
     	 */
     	public TokenizerClasses() {
     		this(Locale.getDefault().getCountry());
+    	}
     	/**
     	 * Helper
+    	 *
-...
     		tc.lang = lang;
     		return tc;
+    	}
     	//// ROOTS ? ////
     	// Base parameters. loadFromProject() discovers every public, non-static String field
     	// by reflection and uses the field name as the preference key, so renaming a field
     	// changes the key that is read. isValid() compiles these values with Pattern.compile(),
     	// so each one must be a valid regular expression (or regex fragment).
     	/** Debug switch: when true, methods of this class print diagnostic messages to standard output. */
     	public boolean debug = false;
     	/** Regex assigned by reset(): {@code <[A-Za-z][^>]+>}. */
     	public String tag_all = null;
     	/** The Weblex enclitics; the value assigned by reset() is annotated "FRO". */
     	public String enclitics = null;
     	/** Enclitics alternation; the value assigned by reset() is annotated "FR". */
     	public String encliticsFR = null;
     	/**
     	 * The TT enclitics: clitic patterns assigned by reset().
     	 * NOTE(review): the P/F prefix presumably distinguishes clitics at the start of a word
     	 * from clitics at the end, and the suffix looks like a language code (en, fr, it, gl) - confirm.
     	 */
     	public String FClitic_en = null;
     	public String PClitic_fr = null;
     	public String FClitic_fr = null;
     	public String PClitic_it = null;
     	public String FClitic_gl = null;
     	/** The div_tags: a '|'-separated alternation of element names (TEI, text, div, p, ...) assigned by reset(). */
     	public String div_tags = null;
     	/** The q_tags: a '|'-separated alternation of element names (q, quote, said, ...) assigned by reset(). */
     	public String q_tags = null;
     	/** The extraword1_tags: a '|'-separated alternation of element names (expan, pb, lb, ...) assigned by reset(). */
     	public String extraword1_tags = null;
     	/** The corr_tags_no_seg: joined with seg_tags by recombine() to build corr_tags. */
     	public String corr_tags_no_seg = null;
     	/** The word_tags. */
     	public String word_tags = null;
     	/** The element to create when a word is created. */
     	public String word_element_to_create = null;
     	/** The intraword_tags. */
     	public String intraword_tags = null;
     	/** The punct_quotes: part of the negated character class that recombine() builds for word_chars. */
     	public String punct_quotes = null;
     	/** The punct_strong1: part of the negated character class that recombine() builds for word_chars. */
     	public String punct_strong1 = null;
     	/** The punct_strong2. */
     	public String punct_strong2 = null;
     	/** The punct_paren_open1: part of the negated character class that recombine() builds for word_chars. */
     	public String punct_paren_open1 = null;
     	/** The punct_paren_open2. */
     	public String punct_paren_open2 = null;
     	/** The punct_paren_close1: part of the negated character class that recombine() builds for word_chars. */
     	public String punct_paren_close1 = null;
     	/** The punct_paren_close2. */
     	public String punct_paren_close2 = null;
     	/** The punct_weak: part of the negated character class that recombine() builds for word_chars. */
     	public String punct_weak = null;
     	/** Regex assigned by reset(): {@code &[^;]+;}. recombine() appends it as an alternative in word_chars. */
     	public String entity = null;
     	/** Assigned "seg" by reset(); joined with corr_tags_no_seg by recombine() to build corr_tags. */
     	public String seg_tags = null;
     	//// COMBINATIONS ////
     	/** The corr_tags: when still null, recombine() sets it to corr_tags_no_seg + "|" + seg_tags. */
     	public String corr_tags = null;
     	/** The extraword_tags. */
     	public String extraword_tags = null;
     	/** The punct_strong. */
     	public String punct_strong = null;
     	/** The punct_paren_open. */
     	public String punct_paren_open = null;
     	/** The punct_paren_close. */
     	public String punct_paren_close = null;
     	/** The punct_paren. */
     	public String punct_paren = null;
     	/** The punct_all. */
     	public String punct_all = null;
     	/**
     	 * The word_chars: built by recombine() as a negated character class over punct_quotes,
     	 * punct_strong1, punct_paren_open1, punct_paren_close1 and punct_weak, with entity as
     	 * an alternative.
     	 */
     	public String word_chars = null;
     	public TTest[] tests = {
     			// new TTest(/\A(.*-)(je|m[eo]i|tu|t[eo]i|lui|luy|ilz|ils?|no?u?s|vo?u?s|on|leurs|ce|ci|là|elles?)(.*)\Z/),
     			// new TTest(/\A(.*)(-)()\Z/, "pon"),
-...
     			// de|un groupe de|je ne sais quel|De sorte que|bien que|parce que|c'est pourquoi|de même que|depuis que|Quant à|au-dessus de|par rapport à|à côté de|grâce à|hors de|le long de|Par ma
     			// barbe !|Mon dieu !|Eh bien !|Bonne nuit !|Pas de panique !)(.*)\Z/),
     			// new TTest(/\A([Aa]ujourd'hui)(.*)/,0,1,2),
     			// BFM ONLY
     			// new TTest(/\A()([.·][^ .·]+[.·])(.*)\Z/, "num"),
     			// TXM REFMAN ONLY
     			// new TTest(/\A(.*)($tag_all)(.*)\Z/, "tag"),
     			// new TTest(/\A(.*)([0-9][0-9][\/][0-9][0-9][\/][0-9][0-9])(.*)\Z/, "date", 1,2,3),
-...
     			// new TTest("^(.*)(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)(.*)$", "url", 1, 2, 5),
     			// new TTest(/\A()([0-9]+([.,][0-9]+)?)()\Z/, "num", 1,2,4),
     			// new TTest(/\A()(([A-Z]:)?([\/\\][^\/\\]+)+)()\Z/, "path", 1,2,5),
     			// new TTest(/\A([\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S}]*)(\p{L}++['‘’])(.*)\Z/, "w", 1, 2, 3),
     			// new TTest(/\A(.*)(\.\.\.)(.*)\Z/, "pon"),
     			// new TTest(/\A(.*)(\p{Ps}|\p{Pe}|\p{Pi}|\p{Pf}|\p{Po}|\p{S})(.*)\Z/, "pon")
     	};
     	public String whitespaces = null;
     	public String regElision = null;
     	public String regPunct = null;
     	// punct_strong1 = ".!?";
     	// punct_strong2 = "\\.\\.|\\.\\.\\.|…|\\|";
     	// punct_strong = "[$punct_strong1]|$punct_strong2";
-...
     	// punct_weak = ",;∼ˆ·*:\"`'“”\\+±=/\\-≤≥<>\\—_ν√μ•@≈→αγ∞≡σ&¼½Θĩ†φθΓ§ẽə∈";
     	// punct_all = "$punct_strong|$punct_paren|[$punct_weak]";
     	// word_chars = "[^ $punct_strong1$punct_paren_open1$punct_paren_close1$punct_weak]|&[^;]+;";
     	public void reset() {
     		tag_all = "<[A-Za-z][^>]+>";
     		if (debug) System.out.println("Reset TC");
     		tag_all = "<[A-Za-z][^>]+>";
     		div_tags = "TEI|text|front|body|div|div1|div2|div3|div4|div5|div6|back|head|trailer|p|ab|sp|speaker|list|notice|bibl|opener|dateline";
     		q_tags = "q|quote|said|item|stage|cit|label|heraldry";
     		extraword1_tags = "expan|pb|lb|milestone|gap|note|s|locus|title|ref|hi|witDetail";
-...
     		punct_weak = "\\-,;∼ˆ·*:\"“”\\+±=/\\≤≥<>\\—ν√μ•@≈→αγ∞≡σ&%|#¼½Θĩ†φθΓ§ẽə∈";
     		entity = "&[^;]+;";
     		seg_tags = "seg";
     		enclitics = "je|m[eo]i|tu|t[eo]i|il|lui|luy|ilz|ils|no?u?s|vo?u?s|on|ce|ci|là|elles?"; // FRO
     		encliticsFR = "je|tu|il|elle|on|nous|vous|ils|elles|toi|moi|en|y|t|leur|lui|le|la|les"; // FR
     		/** The TT enclitics. */
     		FClitic_en = "'(s|re|ve|d|m|em|ll)|n['‘’]t";
     		PClitic_fr = "[dcjlmnstyDCJLNMSTY][\'‘’]|[Qq]u[\'‘’]|[Jj]usqu[\'‘’]|[Ll]orsqu[\'‘’]|[Pp]uisqu[\'‘’]|[Qq]uoiqu[\'‘’]";
     		FClitic_fr = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m[\'‘’]|-moi|-nous|-on|-toi|-tu|-t[\'‘’]|-vous|-en|-y|-ci|-là";
     		PClitic_it = "[dD][ae]ll[\'‘’]|[nN]ell[\'‘’]|[Aa]ll[\'‘’]|[lLDd][\'‘’]|[Ss]ull[\'‘’]|[Qq]uest[\'‘’]|[Uu]n[\'‘’]|[Ss]enz[\'‘’]|[Tt]utt[\'‘’]";
     		FClitic_gl = "-la|-las|-lo|-los|-nos";
     		whitespaces = "[\\p{Z}\\p{C}]+";
     		regElision = "['‘’]";
     		regPunct = "[\\p{Ps}\\p{Pe}\\p{Pi}\\p{Pf}\\p{Po}\\p{S}]";
     		recombine();
+    	}
     	public void recombine() {
     		if (corr_tags == null) {
     			corr_tags = "" + corr_tags_no_seg + "|" + seg_tags + "";
+    		}
-...
     			word_chars = "[^ " + punct_quotes + "" + punct_strong1 + "" + punct_paren_open1 + "" + punct_paren_close1 + "" + punct_weak + "]+|" + entity + "";
+    		}
+    	}
     	/**
     	 * To dom.
+    	 *
-...
     	 */
     	@Deprecated
     	public void toDom(Document doc, Node parent) {
     		Element tokenizer = doc.createElement("tokenizer");
     		tokenizer.setAttribute("onlyThoseTests", "false");
     		parent.appendChild(tokenizer);
     		// String tag_all = "<[^>]+>";
     		Element p = doc.createElement("param");
     		p.setAttribute("key", "tag_all");
-...
     		p.setAttribute("key", "intraword_tags");
     		p.setTextContent(intraword_tags);
     		tokenizer.appendChild(p);
     		// String punct_strong1 = ".!?";
     		p = doc.createElement("param");
     		p.setAttribute("key", "punct_strong1");
-...
     		p.setTextContent(regPunct);
     		tokenizer.appendChild(p);
+    	}
     	/**
     	 * Dump.
     	 */
-...
     		System.out.println("regElision = " + regElision + "");
     		System.out.println("regPunct = " + regPunct + "");
     		System.out.println("TESTS:");
     		for (TTest test : tests) {
     			System.out.println(" " + test + "");
+    		}
+    	}
     	public boolean isValid() {
     		try {
     			Pattern.compile(tag_all);
-...
     			System.out.println("tag_all=" + tag_all + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(enclitics);
+    		}
-...
     			System.out.println("enclitics=" + enclitics + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(encliticsFR);
+    		}
-...
     			System.out.println("encliticsFR=" + encliticsFR + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(div_tags);
+    		}
-...
     			System.out.println("div_tags=" + div_tags + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(q_tags);
+    		}
-...
     			System.out.println("q_tags=" + q_tags + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(extraword1_tags);
+    		}
-...
     			System.out.println("extraword1_tags=" + extraword1_tags + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(corr_tags_no_seg);
+    		}
-...
     			System.out.println("corr_tags_no_seg=" + corr_tags_no_seg + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(word_tags);
+    		}
-...
     			System.out.println("word_tags=" + word_tags + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(intraword_tags);
+    		}
-...
     			System.out.println("intraword_tags=" + intraword_tags + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_quotes);
+    		}
-...
     			System.out.println("punct_quotes=" + punct_quotes + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_strong1);
+    		}
-...
     			System.out.println("punct_strong1=" + punct_strong1 + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_strong2);
+    		}
-...
     			System.out.println("punct_strong2=" + punct_strong2 + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_paren_open1);
+    		}
-...
     			System.out.println("punct_paren_open1=" + punct_paren_open1 + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_paren_open2);
+    		}
-...
     			System.out.println("punct_paren_open2=" + punct_paren_open2 + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_paren_close1);
+    		}
-...
     			System.out.println("punct_paren_close1=" + punct_paren_close1 + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_paren_close2);
+    		}
-...
     			System.out.println("punct_paren_close2=" + punct_paren_close2 + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_weak);
+    		}
-...
     			System.out.println("punct_weak=" + punct_weak + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(entity);
+    		}
-...
     			System.out.println("entity=" + entity + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(seg_tags);
+    		}
-...
     			System.out.println("seg_tags=" + seg_tags + ": " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(corr_tags);
+    		}
-...
     			System.out.println("corr_tags: " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(extraword_tags);
+    		}
-...
     			System.out.println("extraword_tags: " + e);
     			return false;
+    		}
     		if (punct_strong != null)
     			try {
     			Pattern.compile(punct_strong);
     				Pattern.compile(punct_strong);
+    			}
     			catch (Exception e) {
     		catch (Exception e) {
     			System.out.println("punct_strong: " + e);
     			return false;
+    			}
+    		}
     		try {
     			Pattern.compile(punct_paren_open);
+    		}
-...
     			System.out.println("punct_paren_open: " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_paren_close);
+    		}
-...
     			System.out.println("punct_paren_close: " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_paren);
+    		}
-...
     			System.out.println("punct_paren: " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(punct_all);
+    		}
-...
     			System.out.println("punct_all: " + e);
     			return false;
+    		}
     		try {
     			Pattern.compile(word_chars);
+    		}
-...
     			System.out.println("word_chars: " + e);
     			return false;
+    		}
     		if (regPunct != null)
     			try {
     			Pattern.compile(regPunct);
     				Pattern.compile(regPunct);
+    			}
     			catch (Exception e) {
     		catch (Exception e) {
     			System.out.println("regPunct=" + regPunct + ": " + e);
     			return false;
+    			}
+    		}
     		if (regElision != null)
     			try {
     			Pattern.compile(regElision);
     				Pattern.compile(regElision);
+    			}
     			catch (Exception e) {
     		catch (Exception e) {
     			System.out.println("regElision=" + regElision + ": " + e);
     			return false;
+    			}
+    		}
     		if (whitespaces != null)
     			try {
     			Pattern.compile(whitespaces);
     				Pattern.compile(whitespaces);
+    			}
     			catch (Exception e) {
     		catch (Exception e) {
     			System.out.println("whitespaces=" + whitespaces + ": " + e);
     			return false;
+    			}
+    		}
     		for (TTest test : tests) {
     			try {
     				Pattern.compile(test.getRegex());
-...
+    		}
     		return true;
+    	}
     	/**
     	 * old way used in TXM 0.7.9 AND in corpus 0.7.9 -> 0.8.0 restoration
+    	 *
-...
     			String value = param.getAttribute("value");
     			if (value == null || value.length() == 0) value = param.getTextContent();
     			if (value.length() == 0) value = null;
     			if (debug) System.out.println(" Tokenizer parametrized with " + key + "=" + value + "");
     			if (key.equals("tag_all"))
     				tag_all = value;
     			else if (key.equals("enclitics"))
-...
     		// recombine
     		recombine();
     		if (tmp_strong_punct != null) punct_strong = tmp_strong_punct;  // this is temporary
     		String shouldResetTests = tokenizerElement.getAttribute("onlyThoseTests");
     		if ("true".equals(shouldResetTests)) {
     			System.out.println("Warning: tokenizer only using import parameters tests");
     			// tests = new ArrayList<>();
+    		}
     		NodeList testsList = tokenizerElement.getElementsByTagName("test");
     		if (testsList.getLength() > 0) {
     			// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
-...
+    			}
     			// System.out.println("Tests: "+tests);
+    		}
     		if (isValid())
     			return true;
     		else {
-...
     			return false;
+    		}
+    	}
     	/**
     	 * The main method.
+    	 *
-...
     	 * @throws ParserConfigurationException
     	 */
     	public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
     		// Every statement below is commented out, so this entry point currently does nothing.
     		// The disabled code loaded an import.xml document, navigated corpora > corpus > tokenizer,
     		// and printed the result of TokenizerClasses.loadFromNode(...) for that tokenizer element.
     		// Document doc = DomUtils.load(new File("/home/mdecorde/xml/discours/import.xml"));
     		// Node corpora = doc.getDocumentElement().getElementsByTagName("corpora").item(0);
     		// Node corpus = ((Element)corpora).getElementsByTagName("corpus").item(0);
     		// Node tokenizer = ((Element)corpus).getElementsByTagName("tokenizer").item(0);
     		// System.out.println("OK? "+ TokenizerClasses.loadFromNode((Element) tokenizer));
+    	}
     	public boolean loadFromProject(ProjectScope projectScope) {
     		// load params
     		IEclipsePreferences params = projectScope.getNode("Tokenizer");
     		try {
     			if (debug) System.out.println(Arrays.toString(params.keys()));
     			if (debug) System.out.println(Arrays.toString(params.childrenNames()));
     		} catch (BackingStoreException e1) {
     			// TODO Auto-generated catch block
     			e1.printStackTrace();
+    		}
     		try {
     			Field[] fields = TokenizerClasses.class.getFields();
     			for (Field field : fields) {
     				int m = field.getModifiers();
     				if (!Modifier.isStatic(m) && Modifier.isPublic(m) && field.getType().equals(String.class)) {
     					String name = field.getName();
     					String value = params.get(name, null);
     					String defaultValue = null;
     					try {
     						if (field.get(this) != null) {
     							defaultValue = field.get(this).toString();
+    						}
     					} catch (Exception e) {
     						e.printStackTrace();
+    					}
     					String value = params.get(name, defaultValue);
     					if (value != null) {
     						try {
     							if (debug) System.out.println(" Tokenizer parametrized with " + name + "=" + value);
-...
+    					}
+    				}
+    			}
     			// // old way, now using Reflection
     			// String[] keys = params.keys();
     			// for (String key : keys) {
     			// // Element param = (Element) params.item(i);
     			// // String key = param.getAttribute("name");
     			// String value = params.get(key, "");
     			// // if (value == null || value.length() == 0) value = param.getTextContent();
     			// if (value.length() == 0) value = null;
     			//
     			// if (debug) System.out.println(" Tokenizer parametrized with "+key+"="+value+"");
     			//
     			// if (key.equals("tag_all"))
     			// tag_all = value;
     			// else if (key.equals("enclitics"))
     			// enclitics = value;
     			// else if (key.equals("encliticsFR"))
     			// encliticsFR = value;
     			// else if (key.equals("div_tags"))
     			// div_tags = value;
     			// else if (key.equals("q_tags"))
     			// q_tags = value;
     			// else if (key.equals("extraword1_tags"))
     			// extraword1_tags = value;
     			// else if (key.equals("corr_tags_no_seg"))
     			// corr_tags_no_seg = value;
     			// else if (key.equals("word_tags")) {
     			// word_tags = value;
     			// word_element_to_create = value; // FIXME for now
     			// } else if (key.equals("intraword_tags"))
     			// intraword_tags = value;
     			// else if (key.equals("punct_quotes"))
     			// punct_quotes = value;
     			// else if (key.equals("punct_strong1"))
     			// punct_strong1 = value;
     			// else if (key.equals("punct_strong2"))
     			// punct_strong2 = value;
     			// else if (key.equals("punct_paren_open1"))
     			// punct_paren_open1 = value;
     			// else if (key.equals("punct_paren_open2"))
     			// punct_paren_open2 = value;
     			// else if (key.equals("punct_paren_close1"))
     			// punct_paren_close1 = value;
     			// else if (key.equals("punct_paren_close2"))
     			// punct_paren_close2 = value;
     			// else if (key.equals("punct_weak"))
     			// punct_weak = value;
     			// else if (key.equals("entity"))
     			// entity = value;
     			// else if (key.equals("seg_tags"))
     			// seg_tags = value;
     			// else if (key.equals("regPunct"))
     			// regPunct = value;
     			// else if (key.equals("regElision"))
     			// regElision = value;
     			// else if (key.equals("whitespaces"))
     			// whitespaces = value;
     			// else if (key.equals("punct_strong")) // this is temporary
     			// tmp_strong_punct = value; // this is temporary
     			//// else
     			//// System.out.println("MISSING TOKENIZER KEY: "+key);
     			// }
     			// recombine
     			recombine();
     			if (params.getBoolean("onlyThoseTests", false)) {
     				System.out.println("Warning: tokenizer only using import parameters tests");
     				// tests = new ArrayList<>();
+    			}
     			org.osgi.service.prefs.Preferences testsList = params.node("tests");
     			String[] tests;
     			tests = testsList.childrenNames();
     			// System.out.println("Add "+testsList.getLength()+" tests to the tokenizer"
     			for (String testname : tests) {
     				org.osgi.service.prefs.Preferences testdef = testsList.node(testname);
     				// Element test = (Element) testsList.item(i);
     				TTest t = new TTest(testdef.get("content", null), testdef.get("type", null),
     						testdef.getInt("before", 0),
-...
     						testdef.getInt("after", 0));
+    			}
     			// System.out.println("Tests: "+tests);
+    		}
     		catch (BackingStoreException e) {
     			// TODO Auto-generated catch block
     			e.printStackTrace();
+    		}
     		if (isValid()) {
     			return true;
+    		}

     				cpb.tick()
     				File infile = f;
     				File outfile = new File(module.getBinaryDirectory(), "tokenized/"+f.getName());
     				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang))
     				def tc = TokenizerClasses.newTokenizerClasses(project.getPreferencesScope(), lang)
     				SimpleTokenizerXml tokenizer = new SimpleTokenizerXml(infile, outfile, tc)
     				if (module.getProject().getAnnotate()) { // an annotation will be done, does the annotation engine needs another tokenizer ?
     					String engineName = module.getProject().getImportParameters().node("annotate").get("engine", "TreeTagger")
     					def engine = Toolbox.getEngineManager(EngineType.ANNOTATION).getEngine(engineName)

     	 * @param outfile the outfile
     	 */
     	public SimpleTokenizerXml(File infile, File outfile, TokenizerClasses tc) {
     		this.lang = tc.lang;
     		this.stringTokenizer = new SimpleStringTokenizer(lang);
     		this.stringTokenizer = new SimpleStringTokenizer(tc);
     		word_tags = tc.word_tags;
     		word_element_to_create = tc.word_element_to_create;

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3796